SVN: [6858] trunk/pywikipedia - Pywikipedia-svn

8 May 2009

Revision: 6858
Author:   nicdumz
Date:     2009-05-08 15:23:29 +0000 (Fri, 08 May 2009)
Log Message:
-----------
BeautifulSoup 3.1.0.1 + relevant wikipedia.py fix
Modified Paths:
--------------
    trunk/pywikipedia/BeautifulSoup.py
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================

--- trunk/pywikipedia/BeautifulSoup.py	2009-05-08 07:52:49 UTC (rev 6857)
+++ trunk/pywikipedia/BeautifulSoup.py	2009-05-08 15:23:29 UTC (rev 6858)
@@ -42,7 +42,7 @@
Here, have some legalese:
-Copyright (c) 2004-2007, Leonard Richardson
+Copyright (c) 2004-2009, Leonard Richardson
All rights reserved.
@@ -79,27 +79,38 @@
 from __future__ import generators
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "3.0.6"
-__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
+__version__ = "3.1.0.1"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
 __license__ = "New-style BSD"
-from sgmllib import SGMLParser, SGMLParseError
 import codecs
+import markupbase
 import types
 import re
-import sgmllib
+from HTMLParser import HTMLParser, HTMLParseError
 try:
-  from htmlentitydefs import name2codepoint
+    from htmlentitydefs import name2codepoint
 except ImportError:
-  name2codepoint = {}
+    name2codepoint = {}
+try:
+    set
+except NameError:
+    from sets import Set as set
-#This hack makes Beautiful Soup able to parse XML with namespaces
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+#These hacks make Beautiful Soup able to parse XML with namespaces
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
# First, the classes that represent markup elements.
+def sob(unicode, encoding):
+    """Returns either the given Unicode string or its encoding."""
+    if encoding is None:
+        return unicode
+    else:
+        return unicode.encode(encoding)
+
 class PageElement:
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -391,8 +402,20 @@
class NavigableString(unicode, PageElement):
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            return unicode.__new__(cls, value)
+        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
     def __getnewargs__(self):
-        return (NavigableString.__str__(self),)
+        return (unicode(self),)
def __getattr__(self, attr):
         """text.string gives you text. This is for backwards
@@ -403,34 +426,32 @@
         else:
             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
-    def __unicode__(self):
-        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return self.decode().encode(encoding)
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        if encoding:
-            return self.encode(encoding)
-        else:
-            return self
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return self
class CData(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<![CDATA[' + self + u']]>'
class ProcessingInstruction(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+
+    def decodeGivenEventualEncoding(self, eventualEncoding):
         output = self
-        if "%SOUP-ENCODING%" in output:
-            output = self.substituteEncoding(output, encoding)
-        return "<?%s?>" % self.toEncoding(output, encoding)
+        if u'%SOUP-ENCODING%' in output:
+            output = self.substituteEncoding(output, eventualEncoding)
+        return u'<?' + output + u'?>'
class Comment(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<!--%s-->" % NavigableString.__str__(self, encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!--' + self + u'-->'
class Declaration(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<!%s>" % NavigableString.__str__(self, encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!' + self + u'>'
class Tag(PageElement):
@@ -496,11 +517,13 @@
         self.convertXMLEntities = parser.convertXMLEntities
         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
-        # Convert any HTML, XML, or numeric entities in the attribute values.
-        convert = lambda(k, val): (k,
-                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
-                                          self._convertEntities,
-                                          val))
+        def convert(kval):
+            "Converts HTML, XML and numeric entities in the attribute value."
+            k, val = kval
+            if val is None:
+                return kval
+            return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                              self._convertEntities, val))
         self.attrs = map(convert, self.attrs)
def get(self, key, default=None):
@@ -591,11 +614,8 @@
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         """Renders this tag as a string."""
-        return self.__str__(encoding)
+        return self.decode(eventualEncoding=encoding)
-    def __unicode__(self):
-        return self.__str__(None)
-
     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                            + ")")
@@ -605,24 +625,30 @@
         appropriate XML entity for an XML special character."""
         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                prettyPrint=False, indentLevel=0):
-        """Returns a string or Unicode representation of this tag and
-        its contents. To get Unicode, pass None for encoding.
+    def __unicode__(self):
+        return self.decode()
-        NOTE: since Python's HTML parser consumes whitespace, this
-        method is not certain to reproduce the whitespace present in
-        the original string."""
+    def __str__(self):
+        return self.encode()
-        encodedName = self.toEncoding(self.name, encoding)
+    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
+               prettyPrint=False, indentLevel=0):
+        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+    def decode(self, prettyPrint=False, indentLevel=0,
+               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+        """Returns a string or Unicode representation of this tag and
+        its contents. To get Unicode, pass None for encoding."""
+
         attrs = []
         if self.attrs:
             for key, val in self.attrs:
                 fmt = '%s="%s"'
                 if isString(val):
-                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
-                        val = self.substituteEncoding(val, encoding)
+                    if (self.containsSubstitutions
+                        and eventualEncoding is not None
+                        and '%SOUP-ENCODING%' in val):
+                        val = self.substituteEncoding(val, eventualEncoding)
# The attribute value either:
                     #
@@ -651,22 +677,26 @@
                     # ampersands that aren't part of entities. We need
                     # to escape those to XML entities too.
                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
-
-                attrs.append(fmt % (self.toEncoding(key, encoding),
-                                    self.toEncoding(val, encoding)))
+                if val is None:
+                    # Handle boolean attributes.
+                    decoded = key
+                else:
+                    decoded = fmt % (key, val)
+                attrs.append(decoded)
         close = ''
         closeTag = ''
         if self.isSelfClosing:
             close = ' /'
         else:
-            closeTag = '</%s>' % encodedName
+            closeTag = '</%s>' % self.name
indentTag, indentContents = 0, 0
         if prettyPrint:
             indentTag = indentLevel
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
-        contents = self.renderContents(encoding, prettyPrint, indentContents)
+        contents = self.decodeContents(prettyPrint, indentContents,
+                                       eventualEncoding)
         if self.hidden:
             s = contents
         else:
@@ -676,7 +706,7 @@
                 attributeString = ' ' + ' '.join(attrs)
             if prettyPrint:
                 s.append(space)
-            s.append('<%s%s%s>' % (encodedName, attributeString, close))
+            s.append('<%s%s%s>' % (self.name, attributeString, close))
             if prettyPrint:
                 s.append("\n")
             s.append(contents)
@@ -701,19 +731,23 @@
         self.extract()
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return self.__str__(encoding, True)
+        return self.encode(encoding, True)
-    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+    def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                        prettyPrint=False, indentLevel=0):
+        return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
+
+    def decodeContents(self, prettyPrint=False, indentLevel=0,
+                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
         s=[]
         for c in self:
             text = None
             if isinstance(c, NavigableString):
-                text = c.__str__(encoding)
+                text = c.decodeGivenEventualEncoding(eventualEncoding)
             elif isinstance(c, Tag):
-                s.append(c.__str__(encoding, prettyPrint, indentLevel))
+                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
             if text and prettyPrint:
                 text = text.strip()
             if text:
@@ -754,7 +788,7 @@
         return self._findAll(name, attrs, text, limit, generator, **kwargs)
     findChildren = findAll
-    # Pre-3.x compatibility methods
+    # Pre-3.x compatibility methods. Will go away in 4.0.
     first = find
     fetch = findAll
@@ -764,6 +798,15 @@
     def firstText(self, text=None, recursive=True):
         return self.find(text=text, recursive=recursive)
+    # 3.x compatibility methods. Will go away in 4.0.
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                       prettyPrint=False, indentLevel=0):
+        if encoding is None:
+            return self.decodeContents(prettyPrint, indentLevel, encoding)
+        else:
+            return self.encodeContents(encoding, prettyPrint, indentLevel)
+
+
     #Private methods
def _getAttrMap(self):
@@ -776,26 +819,24 @@
         return self.attrMap
#Generator methods
+    def recursiveChildGenerator(self):
+        if not len(self.contents):
+            raise StopIteration
+        stopNode = self._lastRecursiveChild().next
+        current = self.contents[0]
+        while current is not stopNode:
+            yield current
+            current = current.next
+
     def childGenerator(self):
-        for i in range(0, len(self.contents)):
-            yield self.contents[i]
+        if not len(self.contents):
+            raise StopIteration
+        current = self.contents[0]
+        while current:
+            yield current
+            current = current.nextSibling
         raise StopIteration
-    def recursiveChildGenerator(self):
-        stack = [(self, 0)]
-        while stack:
-            tag, start = stack.pop()
-            if isinstance(tag, Tag):
-                for i in range(start, len(tag.contents)):
-                    a = tag.contents[i]
-                    yield a
-                    if isinstance(a, Tag) and tag.contents:
-                        if i < len(tag.contents) - 1:
-                            stack.append((tag, i+1))
-                        stack.append((a, 0))
-                        break
-        raise StopIteration
-
 # Next, a couple classes to represent queries and their results.
 class SoupStrainer:
     """Encapsulates a number of ways of matching a markup element (tag or
@@ -896,13 +937,14 @@
             #other ways of matching match the tag name as a string.
             if isinstance(markup, Tag):
                 markup = markup.name
-            if markup and not isString(markup):
+            if markup is not None and not isString(markup):
                 markup = unicode(markup)
             #Now we know that chunk is either a string, or None.
             if hasattr(matchAgainst, 'match'):
                 # It's a regexp object.
                 result = markup and matchAgainst.search(markup)
-            elif isList(matchAgainst):
+            elif (isList(matchAgainst)
+                  and (markup is not None or not isString(matchAgainst))):
                 result = markup in matchAgainst
             elif hasattr(matchAgainst, 'items'):
                 result = markup.has_key(matchAgainst)
@@ -928,8 +970,8 @@
 def isList(l):
     """Convenience method that works with all 2.x versions of Python
     to determine whether or not something is listlike."""
-    return hasattr(l, '__iter__') \
-           or (type(l) in (types.ListType, types.TupleType))
+    return ((hasattr(l, '__iter__') and not isString(l))
+            or (type(l) in (types.ListType, types.TupleType)))
def isString(s):
     """Convenience method that works with all 2.x versions of Python
@@ -949,7 +991,7 @@
             #It's a map. Merge it.
             for k,v in portion.items():
                 built[k] = v
-        elif isList(portion):
+        elif isList(portion) and not isString(portion):
             #It's a list. Map each item to the default.
             for k in portion:
                 built[k] = default
@@ -960,8 +1002,123 @@
# Now, the parser classes.
-class BeautifulStoneSoup(Tag, SGMLParser):
+class HTMLParserBuilder(HTMLParser):
+    def __init__(self, soup):
+        HTMLParser.__init__(self)
+        self.soup = soup
+
+    # We inherit feed() and reset().
+
+    def handle_starttag(self, name, attrs):
+        if name == 'meta':
+            self.soup.extractCharsetFromMeta(attrs)
+        else:
+            self.soup.unknown_starttag(name, attrs)
+
+    def handle_endtag(self, name):
+        self.soup.unknown_endtag(name)
+
+    def handle_data(self, content):
+        self.soup.handle_data(content)
+
+    def _toStringSubclass(self, text, subclass):
+        """Adds a certain piece of text to the tree as a NavigableString
+        subclass."""
+        self.soup.endData()
+        self.handle_data(text)
+        self.soup.endData(subclass)
+
+    def handle_pi(self, text):
+        """Handle a processing instruction as a ProcessingInstruction
+        object, possibly one with a %SOUP-ENCODING% slot into which an
+        encoding will be plugged later."""
+        if text[:3] == "xml":
+            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+        self._toStringSubclass(text, ProcessingInstruction)
+
+    def handle_comment(self, text):
+        "Handle comments as Comment objects."
+        self._toStringSubclass(text, Comment)
+
+    def handle_charref(self, ref):
+        "Handle character references as data."
+        if self.soup.convertEntities:
+            data = unichr(int(ref))
+        else:
+            data = '&#%s;' % ref
+        self.handle_data(data)
+
+    def handle_entityref(self, ref):
+        """Handle entity references as data, possibly converting known
+        HTML and/or XML entity references to the corresponding Unicode
+        characters."""
+        data = None
+        if self.soup.convertHTMLEntities:
+            try:
+                data = unichr(name2codepoint[ref])
+            except KeyError:
+                pass
+
+        if not data and self.soup.convertXMLEntities:
+                data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+        if not data and self.soup.convertHTMLEntities and \
+            not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+                # TODO: We've got a problem here. We're told this is
+                # an entity reference, but it's not an XML entity
+                # reference or an HTML entity reference. Nonetheless,
+                # the logical thing to do is to pass it through as an
+                # unrecognized entity reference.
+                #
+                # Except: when the input is "&carol;" this function
+                # will be called with input "carol". When the input is
+                # "AT&T", this function will be called with input
+                # "T". We have no way of knowing whether a semicolon
+                # was present originally, so we don't know whether
+                # this is an unknown entity or just a misplaced
+                # ampersand.
+                #
+                # The more common case is a misplaced ampersand, so I
+                # escape the ampersand and omit the trailing semicolon.
+                data = "&amp;%s" % ref
+        if not data:
+            # This case is different from the one above, because we
+            # haven't already gone through a supposedly comprehensive
+            # mapping of entities to Unicode characters. We might not
+            # have gone through any mapping at all. So the chances are
+            # very high that this is a real entity, and not a
+            # misplaced ampersand.
+            data = "&%s;" % ref
+        self.handle_data(data)
+
+    def handle_decl(self, data):
+        "Handle DOCTYPEs and the like as Declaration objects."
+        self._toStringSubclass(data, Declaration)
+
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as a CData object."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             data = self.rawdata[i+9:k]
+             j = k+3
+             self._toStringSubclass(data, CData)
+        else:
+            try:
+                j = HTMLParser.parse_declaration(self, i)
+            except HTMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
+
+
+class BeautifulStoneSoup(Tag):
+
     """This class contains the basic parser and search code. It defines
     a parser that knows nothing about tag behavior except for the
     following:
@@ -982,6 +1139,7 @@
     NESTABLE_TAGS = {}
     RESET_NESTING_TAGS = {}
     QUOTE_TAGS = {}
+    PRESERVE_WHITESPACE_TAGS = []
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                        lambda x: x.group(1) + ' />'),
@@ -1005,14 +1163,15 @@
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
-                 convertEntities=None, selfClosingTags=None):
+                 convertEntities=None, selfClosingTags=None, isHTML=False,
+                 builder=HTMLParserBuilder):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser.
-        sgmllib will process most bad HTML, and the BeautifulSoup
+        HTMLParser will process most bad HTML, and the BeautifulSoup
         class has some tricks for dealing with some HTML that kills
-        sgmllib, but Beautiful Soup can nonetheless choke or lose data
+        HTMLParser, but Beautiful Soup can nonetheless choke or lose data
         if your data uses self-closing tags or declarations
         incorrectly.
@@ -1022,7 +1181,7 @@
         you'll get better performance.
The default parser massage techniques fix the two most common
-        instances of invalid HTML that choke sgmllib:
+        instances of invalid HTML that choke HTMLParser:
<br/> (No space between name of closing tag and tag close)
          <! --Comment--> (Extraneous whitespace in declaration)
@@ -1060,29 +1219,21 @@
             self.escapeUnrecognizedEntities = False
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
-        SGMLParser.__init__(self)
+        self.builder = builder(self)
+        self.reset()
if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         self.markup = markup
         self.markupMassage = markupMassage
         try:
-            self._feed()
+            self._feed(isHTML=isHTML)
         except StopParsing:
             pass
-        self.markup = None                 # The markup can now be GCed
+        self.markup = None                 # The markup can now be GCed.
+        self.builder = None                # So can the builder.
-    def convert_charref(self, name):
-        """This method fixes a bug in Python's SGMLParser."""
-        try:
-            n = int(name)
-        except ValueError:
-            return
-        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
-            return
-        return self.convert_codepoint(n)
-
-    def _feed(self, inDocumentEncoding=None):
+    def _feed(self, inDocumentEncoding=None, isHTML=False):
         # Convert the document to Unicode.
         markup = self.markup
         if isinstance(markup, unicode):
@@ -1091,9 +1242,10 @@
         else:
             dammit = UnicodeDammit\
                      (markup, [self.fromEncoding, inDocumentEncoding],
-                      smartQuotesTo=self.smartQuotesTo)
+                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
             markup = dammit.unicode
             self.originalEncoding = dammit.originalEncoding
+            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
         if markup:
             if self.markupMassage:
                 if not isList(self.markupMassage):
@@ -1106,27 +1258,14 @@
                 # was relying on the existence of markupMassage, this
                 # might cause problems.
                 del(self.markupMassage)
-        self.reset()
+        self.builder.reset()
-        SGMLParser.feed(self, markup)
+        self.builder.feed(markup)
         # Close out any unfinished strings and close all the open tags.
         self.endData()
         while self.currentTag.name != self.ROOT_TAG_NAME:
             self.popTag()
-    def __getattr__(self, methodName):
-        """This method routes method call requests to either the SGMLParser
-        superclass or the Tag superclass, depending on the method name."""
-        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
-
-        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
-               or methodName.find('do_') == 0:
-            return SGMLParser.__getattr__(self, methodName)
-        elif methodName.find('__') != 0:
-            return Tag.__getattr__(self, methodName)
-        else:
-            raise AttributeError
-
     def isSelfClosingTag(self, name):
         """Returns true iff the given string is the name of a
         self-closing tag according to this parser."""
@@ -1136,7 +1275,7 @@
     def reset(self):
         Tag.__init__(self, self, self.ROOT_TAG_NAME)
         self.hidden = 1
-        SGMLParser.reset(self)
+        self.builder.reset()
         self.currentData = []
         self.currentTag = None
         self.tagStack = []
@@ -1166,8 +1305,10 @@
def endData(self, containerClass=NavigableString):
         if self.currentData:
-            currentData = ''.join(self.currentData)
-            if not currentData.translate(self.STRIP_ASCII_SPACES):
+            currentData = u''.join(self.currentData)
+            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+                not set([tag.name for tag in self.tagStack]).intersection(
+                    self.PRESERVE_WHITESPACE_TAGS)):
                 if '\n' in currentData:
                     currentData = '\n'
                 else:
@@ -1299,100 +1440,10 @@
     def handle_data(self, data):
         self.currentData.append(data)
-    def _toStringSubclass(self, text, subclass):
-        """Adds a certain piece of text to the tree as a NavigableString
-        subclass."""
-        self.endData()
-        self.handle_data(text)
-        self.endData(subclass)
+    def extractCharsetFromMeta(self, attrs):
+        self.unknown_starttag('meta', attrs)
-    def handle_pi(self, text):
-        """Handle a processing instruction as a ProcessingInstruction
-        object, possibly one with a %SOUP-ENCODING% slot into which an
-        encoding will be plugged later."""
-        if text[:3] == "xml":
-            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
-        self._toStringSubclass(text, ProcessingInstruction)
-    def handle_comment(self, text):
-        "Handle comments as Comment objects."
-        self._toStringSubclass(text, Comment)
-
-    def handle_charref(self, ref):
-        "Handle character references as data."
-        if self.convertEntities:
-            data = unichr(int(ref))
-        else:
-            data = '&#%s;' % ref
-        self.handle_data(data)
-
-    def handle_entityref(self, ref):
-        """Handle entity references as data, possibly converting known
-        HTML and/or XML entity references to the corresponding Unicode
-        characters."""
-        data = None
-        if self.convertHTMLEntities:
-            try:
-                data = unichr(name2codepoint[ref])
-            except KeyError:
-                pass
-
-        if not data and self.convertXMLEntities:
-                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
-
-        if not data and self.convertHTMLEntities and \
-            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
-                # TODO: We've got a problem here. We're told this is
-                # an entity reference, but it's not an XML entity
-                # reference or an HTML entity reference. Nonetheless,
-                # the logical thing to do is to pass it through as an
-                # unrecognized entity reference.
-                #
-                # Except: when the input is "&carol;" this function
-                # will be called with input "carol". When the input is
-                # "AT&T", this function will be called with input
-                # "T". We have no way of knowing whether a semicolon
-                # was present originally, so we don't know whether
-                # this is an unknown entity or just a misplaced
-                # ampersand.
-                #
-                # The more common case is a misplaced ampersand, so I
-                # escape the ampersand and omit the trailing semicolon.
-                data = "&amp;%s" % ref
-        if not data:
-            # This case is different from the one above, because we
-            # haven't already gone through a supposedly comprehensive
-            # mapping of entities to Unicode characters. We might not
-            # have gone through any mapping at all. So the chances are
-            # very high that this is a real entity, and not a
-            # misplaced ampersand.
-            data = "&%s;" % ref
-        self.handle_data(data)
-
-    def handle_decl(self, data):
-        "Handle DOCTYPEs and the like as Declaration objects."
-        self._toStringSubclass(data, Declaration)
-
-    def parse_declaration(self, i):
-        """Treat a bogus SGML declaration as raw data. Treat a CDATA
-        declaration as a CData object."""
-        j = None
-        if self.rawdata[i:i+9] == '<![CDATA[':
-             k = self.rawdata.find(']]>', i)
-             if k == -1:
-                 k = len(self.rawdata)
-             data = self.rawdata[i+9:k]
-             j = k+3
-             self._toStringSubclass(data, CData)
-        else:
-            try:
-                j = SGMLParser.parse_declaration(self, i)
-            except SGMLParseError:
-                toHandle = self.rawdata[i:]
-                self.handle_data(toHandle)
-                j = i + len(toHandle)
-        return j
-
 class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1444,12 +1495,15 @@
     def __init__(self, *args, **kwargs):
         if not kwargs.has_key('smartQuotesTo'):
             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+        kwargs['isHTML'] = True
         BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
                                     ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])
+    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
     QUOTE_TAGS = {'script' : None, 'textarea' : None}
#According to the HTML standard, each of these inline tags can
@@ -1494,9 +1548,9 @@
                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
# Used to detect the charset in a META tag; see start_meta
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-    def start_meta(self, attrs):
+    def extractCharsetFromMeta(self, attrs):
         """Beautiful Soup can detect a charset included in a META tag,
         try to convert the document to that charset, and re-parse the
         document from the beginning."""
@@ -1517,29 +1571,33 @@
         if httpEquiv and contentType: # It's an interesting meta tag.
             match = self.CHARSET_RE.search(contentType)
             if match:
-                if getattr(self, 'declaredHTMLEncoding') or \
-                       (self.originalEncoding == self.fromEncoding):
-                    # This is our second pass through the document, or
-                    # else an encoding was specified explicitly and it
-                    # worked. Rewrite the meta tag.
-                    newAttr = self.CHARSET_RE.sub\
-                              (lambda(match):match.group(1) +
-                               "%SOUP-ENCODING%", contentType)
+                if (self.declaredHTMLEncoding is not None or
+                    self.originalEncoding == self.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                                newAttr)
                     tagNeedsEncodingSubstitution = True
                 else:
                     # This is our first pass through the document.
-                    # Go through it again with the new information.
+                    # Go through it again with the encoding information.
                     newCharset = match.group(3)
                     if newCharset and newCharset != self.originalEncoding:
                         self.declaredHTMLEncoding = newCharset
                         self._feed(self.declaredHTMLEncoding)
                         raise StopParsing
+                    pass
         tag = self.unknown_starttag("meta", attrs)
         if tag and tagNeedsEncodingSubstitution:
             tag.containsSubstitutions = True
+
 class StopParsing(Exception):
     pass
@@ -1687,9 +1745,10 @@
                         "x-sjis" : "shift-jis" }
     def __init__(self, markup, overrideEncodings=[],
-                 smartQuotesTo='xml'):
+                 smartQuotesTo='xml', isHTML=False):
+        self.declaredHTMLEncoding = None
         self.markup, documentEncoding, sniffedEncoding = \
-                     self._detectEncoding(markup)
+                     self._detectEncoding(markup, isHTML)
         self.smartQuotesTo = smartQuotesTo
         self.triedEncodings = []
         if markup == '' or isinstance(markup, unicode):
@@ -1715,18 +1774,22 @@
             for proposed_encoding in ("utf-8", "windows-1252"):
                 u = self._convertFrom(proposed_encoding)
                 if u: break
+
         self.unicode = u
         if not u: self.originalEncoding = None
-    def _subMSChar(self, orig):
+    def _subMSChar(self, match):
         """Changes a MS smart quote character to an XML or HTML
         entity."""
+        orig = match.group(1)
         sub = self.MS_CHARS.get(orig)
         if type(sub) == types.TupleType:
             if self.smartQuotesTo == 'xml':
-                sub = '&#x%s;' % sub[1]
+                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
             else:
-                sub = '&%s;' % sub[0]
+                sub = '&'.encode() + sub[0].encode() + ';'.encode()
+        else:
+            sub = sub.encode()
         return sub
     def _convertFrom(self, proposed):
@@ -1741,9 +1804,9 @@
         if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
-            markup = re.compile("([\x80-\x9f])").sub \
-                     (lambda(x): self._subMSChar(x.group(1)),
-                      markup)
+            smart_quotes_re = "([\x80-\x9f])"
+            smart_quotes_compiled = re.compile(smart_quotes_re)
+            markup = smart_quotes_compiled.sub(self._subMSChar, markup)
         try:
             # print "Trying to convert document to %s" % proposed
@@ -1782,7 +1845,7 @@
         newdata = unicode(data, encoding)
         return newdata
-    def _detectEncoding(self, xml_data):
+    def _detectEncoding(self, xml_data, isHTML=False):
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
@@ -1830,13 +1893,19 @@
             else:
                 sniffed_xml_encoding = 'ascii'
                 pass
-            xml_encoding_match = re.compile \
-                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*?>')\
-                                 .match(xml_data)
         except:
             xml_encoding_match = None
-        if xml_encoding_match:
-            xml_encoding = xml_encoding_match.groups()[0].lower()
+        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*?>'.encode()
+        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+        if not xml_encoding_match and isHTML:
+            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+            regexp = re.compile(meta_re, re.I)
+            xml_encoding_match = regexp.search(xml_data)
+        if xml_encoding_match is not None:
+            xml_encoding = xml_encoding_match.groups()[0].decode(
+                'ascii').lower()
+            if isHTML:
+                self.declaredHTMLEncoding = xml_encoding
             if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
@@ -1927,5 +1996,5 @@
 #By default, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
-    soup = BeautifulSoup(sys.stdin.read())
+    soup = BeautifulSoup(sys.stdin)
     print soup.prettify()
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2009-05-08 07:52:49 UTC (rev 6857)
+++ trunk/pywikipedia/wikipedia.py	2009-05-08 15:23:29 UTC (rev 6858)
@@ -4995,7 +4995,7 @@
                     else:
                         tree = BeautifulStoneSoup(xml)
                         self._mediawiki_messages = _dict([(tag.get('name').lower(), html2unicode(tag.string))
-                                for tag in tree.findAll('message')])
+                                for tag in tree.findAll('message') if tag.string])
                 if not self._mediawiki_messages:
                     # No messages could be added.