[Pywikipedia-l] SVN: [4779] trunk/pywikipedia/BeautifulSoup.py

rotem at svn.wikimedia.org rotem at svn.wikimedia.org
Sat Dec 29 17:48:36 UTC 2007


Revision: 4779
Author:   rotem
Date:     2007-12-29 17:48:36 +0000 (Sat, 29 Dec 2007)

Log Message:
-----------
Update Beautiful Soup to version 3.0.5, per http://lists.wikimedia.org/pipermail/pywikipedia-l/2007-December/001627.html .

Modified Paths:
--------------
    trunk/pywikipedia/BeautifulSoup.py

Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py	2007-12-29 15:50:13 UTC (rev 4778)
+++ trunk/pywikipedia/BeautifulSoup.py	2007-12-29 17:48:36 UTC (rev 4779)
@@ -11,7 +11,7 @@
 structure. An ill-formed XML/HTML document yields a correspondingly
 ill-formed data structure. If your document is only locally
 well-formed, you can use this library to find and process the
-well-formed part of it. The BeautifulSoup class 
+well-formed part of it.
 
 Beautiful Soup works with Python 2.2 and up. It has no external
 dependencies, but you'll have more success at converting data to UTF-8
@@ -24,7 +24,7 @@
   http://cjkpython.i18n.org/
 
 Beautiful Soup defines classes for two main parsing strategies:
-    
+
  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
    language that kind of looks like XML.
 
@@ -40,13 +40,48 @@
 documentation:
 http://www.crummy.com/software/BeautifulSoup/documentation.html
 
+Here, have some legalese:
+
+Copyright (c) 2004-2007, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials provided
+    with the distribution.
+
+  * Neither the name of the Beautiful Soup Consortium and All
+    Night Kosher Bakery nor the names of its contributors may be
+    used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
 """
 from __future__ import generators
 
 __author__ = "Leonard Richardson (leonardr at segfault.org)"
-__version__ = "3.0.4"
+__version__ = "3.0.5"
 __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
-__license__ = "PSF"
+__license__ = "New-style BSD"
 
 from sgmllib import SGMLParser, SGMLParseError
 import codecs
@@ -71,7 +106,7 @@
 
     def setup(self, parent=None, previous=None):
         """Sets up the initial relations between this element and
-        other elements."""        
+        other elements."""
         self.parent = parent
         self.previous = previous
         self.next = None
@@ -81,7 +116,7 @@
             self.previousSibling = self.parent.contents[-1]
             self.previousSibling.nextSibling = self
 
-    def replaceWith(self, replaceWith):        
+    def replaceWith(self, replaceWith):
         oldParent = self.parent
         myIndex = self.parent.contents.index(self)
         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
@@ -92,11 +127,11 @@
                 # means that when we extract it, the index of this
                 # element will change.
                 myIndex = myIndex - 1
-        self.extract()        
+        self.extract()
         oldParent.insert(myIndex, replaceWith)
-        
+
     def extract(self):
-        """Destructively rips this element out of the tree."""        
+        """Destructively rips this element out of the tree."""
         if self.parent:
             try:
                 self.parent.contents.remove(self)
@@ -105,7 +140,7 @@
 
         #Find the two elements that would be next to each other if
         #this element (and any children) hadn't been parsed. Connect
-        #the two.        
+        #the two.
         lastChild = self._lastRecursiveChild()
         nextElement = lastChild.next
 
@@ -116,12 +151,12 @@
         self.previous = None
         lastChild.next = None
 
-        self.parent = None        
+        self.parent = None
         if self.previousSibling:
             self.previousSibling.nextSibling = self.nextSibling
         if self.nextSibling:
             self.nextSibling.previousSibling = self.previousSibling
-        self.previousSibling = self.nextSibling = None       
+        self.previousSibling = self.nextSibling = None
 
     def _lastRecursiveChild(self):
         "Finds the last element beneath this object to be parsed."
@@ -134,12 +169,12 @@
         if (isinstance(newChild, basestring)
             or isinstance(newChild, unicode)) \
             and not isinstance(newChild, NavigableString):
-            newChild = NavigableString(newChild)        
+            newChild = NavigableString(newChild)
 
         position =  min(position, len(self.contents))
         if hasattr(newChild, 'parent') and newChild.parent != None:
             # We're 'inserting' an element that's already one
-            # of this object's children. 
+            # of this object's children.
             if newChild.parent == self:
                 index = self.find(newChild)
                 if index and index < position:
@@ -149,7 +184,7 @@
                     # will jump down one.
                     position = position - 1
             newChild.extract()
-            
+
         newChild.parent = self
         previousChild = None
         if position == 0:
@@ -161,13 +196,13 @@
             newChild.previousSibling.nextSibling = newChild
             newChild.previous = previousChild._lastRecursiveChild()
         if newChild.previous:
-            newChild.previous.next = newChild        
+            newChild.previous.next = newChild
 
         newChildsLastElement = newChild._lastRecursiveChild()
 
         if position >= len(self.contents):
             newChild.nextSibling = None
-            
+
             parent = self
             parentsNextSibling = None
             while not parentsNextSibling:
@@ -180,8 +215,8 @@
             else:
                 newChildsLastElement.next = None
         else:
-            nextChild = self.contents[position]            
-            newChild.nextSibling = nextChild            
+            nextChild = self.contents[position]
+            newChild.nextSibling = nextChild
             if newChild.nextSibling:
                 newChild.nextSibling.previousSibling = newChild
             newChildsLastElement.next = nextChild
@@ -190,6 +225,10 @@
             newChildsLastElement.next.previous = newChildsLastElement
         self.contents.insert(position, newChild)
 
+    def append(self, tag):
+        """Appends the given tag to the contents of this tag."""
+        self.insert(len(self.contents), tag)
+
     def findNext(self, name=None, attrs={}, text=None, **kwargs):
         """Returns the first item that matches the given criteria and
         appears after this Tag in the document."""
@@ -269,7 +308,7 @@
         if l:
             r = l[0]
         return r
-    
+
     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
         "Iterates over a generator looking for things that match."
 
@@ -294,7 +333,7 @@
         return results
 
     #These Generators can be used to navigate starting from both
-    #NavigableStrings and Tags.                
+    #NavigableStrings and Tags.
     def nextGenerator(self):
         i = self
         while i:
@@ -328,7 +367,7 @@
     # Utility methods
     def substituteEncoding(self, str, encoding=None):
         encoding = encoding or "utf-8"
-        return str.replace("%SOUP-ENCODING%", encoding)    
+        return str.replace("%SOUP-ENCODING%", encoding)
 
     def toEncoding(self, s, encoding=None):
         """Encodes an object to a string in some encoding, or to Unicode.
@@ -350,6 +389,9 @@
 
 class NavigableString(unicode, PageElement):
 
+    def __getnewargs__(self):
+        return (NavigableString.__str__(self),)
+
     def __getattr__(self, attr):
         """text.string gives you text. This is for backwards
         compatibility for Navigable*String, but for CData* it lets you
@@ -360,14 +402,14 @@
             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 
     def __unicode__(self):
-        return self.__str__(None)
+        return unicode(str(self))
 
     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         if encoding:
             return self.encode(encoding)
         else:
             return self
-        
+
 class CData(NavigableString):
 
     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
@@ -382,22 +424,56 @@
 
 class Comment(NavigableString):
     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<!--%s-->" % NavigableString.__str__(self, encoding)    
+        return "<!--%s-->" % NavigableString.__str__(self, encoding)
 
 class Declaration(NavigableString):
     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<!%s>" % NavigableString.__str__(self, encoding)        
+        return "<!%s>" % NavigableString.__str__(self, encoding)
 
 class Tag(PageElement):
 
     """Represents a found HTML tag with its attributes and contents."""
 
-    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
-                                      '"' : "quote",
-                                      "&" : "amp",
-                                      "<" : "lt",
-                                      ">" : "gt" }
+    def _invert(h):
+        "Cheap function to invert a hash."
+        i = {}
+        for k,v in h.items():
+            i[v] = k
+        return i
 
+    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
+                                      "quot" : '"',
+                                      "amp" : "&",
+                                      "lt" : "<",
+                                      "gt" : ">" }
+
+    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+    def _convertEntities(self, match):
+        """Used in a call to re.sub to replace HTML, XML, and numeric
+        entities with the appropriate Unicode characters. If HTML
+        entities are being converted, any unrecognized entities are
+        escaped."""
+        x = match.group(1)
+        if self.convertHTMLEntities and x in name2codepoint:
+            return unichr(name2codepoint[x])
+        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+            if self.convertXMLEntities:
+                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+            else:
+                return u'&%s;' % x
+        elif len(x) > 0 and x[0] == '#':
+            # Handle numeric entities
+            if len(x) > 1 and x[1] == 'x':
+                return unichr(int(x[2:], 16))
+            else:
+                return unichr(int(x[1:]))
+
+        elif self.escapeUnrecognizedEntities:
+            return u'&amp;%s;' % x
+        else:
+            return u'&%s;' % x
+
     def __init__(self, parser, name, attrs=None, parent=None,
                  previous=None):
         "Basic constructor."
@@ -414,12 +490,22 @@
         self.setup(parent, previous)
         self.hidden = False
         self.containsSubstitutions = False
+        self.convertHTMLEntities = parser.convertHTMLEntities
+        self.convertXMLEntities = parser.convertXMLEntities
+        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
 
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        convert = lambda(k, val): (k,
+                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                                          self._convertEntities,
+                                          val))
+        self.attrs = map(convert, self.attrs)
+
     def get(self, key, default=None):
         """Returns the value of the 'key' attribute for the tag, or
         the value given for 'default' if it doesn't have that
         attribute."""
-        return self._getAttrMap().get(key, default)    
+        return self._getAttrMap().get(key, default)
 
     def has_key(self, key):
         return self._getAttrMap().has_key(key)
@@ -444,7 +530,7 @@
         "A tag is non-None even if it has no contents."
         return True
 
-    def __setitem__(self, key, value):        
+    def __setitem__(self, key, value):
         """Setting tag[key] sets the value of the 'key' attribute for the
         tag."""
         self._getAttrMap()
@@ -481,6 +567,7 @@
             return self.find(tag[:-3])
         elif tag.find('__') != 0:
             return self.find(tag)
+        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
 
     def __eq__(self, other):
         """Returns true iff this tag has the same name, the same attributes,
@@ -507,6 +594,15 @@
     def __unicode__(self):
         return self.__str__(None)
 
+    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+                                           + ")")
+
+    def _sub_entity(self, x):
+        """Used with a regular expression to substitute the
+        appropriate XML entity for an XML special character."""
+        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                 prettyPrint=False, indentLevel=0):
         """Returns a string or Unicode representation of this tag and
@@ -522,7 +618,7 @@
         if self.attrs:
             for key, val in self.attrs:
                 fmt = '%s="%s"'
-                if isString(val):                    
+                if isString(val):
                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                         val = self.substituteEncoding(val, encoding)
 
@@ -543,19 +639,17 @@
                     #   embedded single quotes to XML entities.
                     if '"' in val:
                         fmt = "%s='%s'"
-                        # This can't happen naturally, but it can happen
-                        # if you modify an attribute value after parsing.
                         if "'" in val:
+                            # TODO: replace with apos when
+                            # appropriate.
                             val = val.replace("'", "&squot;")
 
                     # Now we're okay w/r/t quotes. But the attribute
                     # value might also contain angle brackets, or
                     # ampersands that aren't part of entities. We need
                     # to escape those to XML entities too.
-                    val = re.sub("([<>]|&(?![^\s]+;))",
-                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
-                                 val)
-                                      
+                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+
                 attrs.append(fmt % (self.toEncoding(key, encoding),
                                     self.toEncoding(val, encoding)))
         close = ''
@@ -577,7 +671,7 @@
             s = []
             attributeString = ''
             if attrs:
-                attributeString = ' ' + ' '.join(attrs)            
+                attributeString = ' ' + ' '.join(attrs)
             if prettyPrint:
                 s.append(space)
             s.append('<%s%s%s>' % (encodedName, attributeString, close))
@@ -609,14 +703,14 @@
             elif isinstance(c, Tag):
                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
             if text and prettyPrint:
-                text = text.strip()              
+                text = text.strip()
             if text:
                 if prettyPrint:
                     s.append(" " * (indentLevel-1))
                 s.append(text)
                 if prettyPrint:
                     s.append("\n")
-        return ''.join(s)    
+        return ''.join(s)
 
     #Soup methods
 
@@ -651,19 +745,13 @@
     # Pre-3.x compatibility methods
     first = find
     fetch = findAll
-    
+
     def fetchText(self, text=None, recursive=True, limit=None):
         return self.findAll(text=text, recursive=recursive, limit=limit)
 
     def firstText(self, text=None, recursive=True):
         return self.find(text=text, recursive=recursive)
-    
-    #Utility methods
 
-    def append(self, tag):
-        """Appends the given tag to the contents of this tag."""
-        self.contents.append(tag)
-
     #Private methods
 
     def _getAttrMap(self):
@@ -672,7 +760,7 @@
         if not getattr(self, 'attrMap'):
             self.attrMap = {}
             for (key, value) in self.attrs:
-                self.attrMap[key] = value 
+                self.attrMap[key] = value
         return self.attrMap
 
     #Generator methods
@@ -680,12 +768,12 @@
         for i in range(0, len(self.contents)):
             yield self.contents[i]
         raise StopIteration
-    
+
     def recursiveChildGenerator(self):
         stack = [(self, 0)]
         while stack:
             tag, start = stack.pop()
-            if isinstance(tag, Tag):            
+            if isinstance(tag, Tag):
                 for i in range(start, len(tag.contents)):
                     a = tag.contents[i]
                     yield a
@@ -720,7 +808,7 @@
             return self.text
         else:
             return "%s|%s" % (self.name, self.attrs)
-    
+
     def searchTag(self, markupName=None, markupAttrs={}):
         found = None
         markup = None
@@ -737,7 +825,7 @@
             if callFunctionWithTagData:
                 match = self.name(markupName, markupAttrs)
             else:
-                match = True            
+                match = True
                 markupAttrMap = None
                 for attr, matchAgainst in self.attrs.items():
                     if not markupAttrMap:
@@ -762,7 +850,7 @@
         #print 'looking for %s in %s' % (self, markup)
         found = None
         # If given a list of items, scan it for a text element that
-        # matches.        
+        # matches.
         if isList(markup) and not isinstance(markup, Tag):
             for element in markup:
                 if isinstance(element, NavigableString) \
@@ -783,8 +871,8 @@
             raise Exception, "I don't know how to match against a %s" \
                   % markup.__class__
         return found
-        
-    def _matches(self, markup, matchAgainst):    
+
+    def _matches(self, markup, matchAgainst):
         #print "Matching %s against %s" % (markup, matchAgainst)
         result = False
         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
@@ -835,7 +923,7 @@
     """Convenience method that works with all 2.x versions of Python
     to determine whether or not something is stringlike."""
     try:
-        return isinstance(s, unicode) or isintance(s, basestring) 
+        return isinstance(s, unicode) or isinstance(s, basestring)
     except NameError:
         return isinstance(s, str)
 
@@ -865,7 +953,7 @@
     """This class contains the basic parser and search code. It defines
     a parser that knows nothing about tag behavior except for the
     following:
-   
+
       You can't close a tag without closing all the tags it encloses.
       That is, "<foo><bar></foo>" actually means
       "<foo><bar></bar></foo>".
@@ -878,10 +966,6 @@
     or when BeautifulSoup makes an assumption counter to what you were
     expecting."""
 
-    XML_ENTITY_LIST = {}
-    for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
-        XML_ENTITY_LIST[i] = True 
-
     SELF_CLOSING_TAGS = {}
     NESTABLE_TAGS = {}
     RESET_NESTING_TAGS = {}
@@ -897,13 +981,22 @@
 
     HTML_ENTITIES = "html"
     XML_ENTITIES = "xml"
+    XHTML_ENTITIES = "xhtml"
+    # TODO: This only exists for backwards-compatibility
+    ALL_ENTITIES = XHTML_ENTITIES
 
+    # Used when determining whether a text node is all whitespace and
+    # can be replaced with a single space. A text node that contains
+    # fancy Unicode spaces (usually non-breaking) should be left
+    # alone.
+    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+
     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
                  convertEntities=None, selfClosingTags=None):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser. 
+        is fed into the underlying parser.
 
         sgmllib will process most bad HTML, and the BeautifulSoup
         class has some tricks for dealing with some HTML that kills
@@ -930,14 +1023,33 @@
         self.fromEncoding = fromEncoding
         self.smartQuotesTo = smartQuotesTo
         self.convertEntities = convertEntities
+        # Set the rules for how we'll deal with the entities we
+        # encounter
         if self.convertEntities:
             # It doesn't make sense to convert encoded characters to
             # entities even while you're converting entities to Unicode.
             # Just convert it all to Unicode.
             self.smartQuotesTo = None
+            if convertEntities == self.HTML_ENTITIES:
+                self.convertXMLEntities = False
+                self.convertHTMLEntities = True
+                self.escapeUnrecognizedEntities = True
+            elif convertEntities == self.XHTML_ENTITIES:
+                self.convertXMLEntities = True
+                self.convertHTMLEntities = True
+                self.escapeUnrecognizedEntities = False
+            elif convertEntities == self.XML_ENTITIES:
+                self.convertXMLEntities = True
+                self.convertHTMLEntities = False
+                self.escapeUnrecognizedEntities = False
+        else:
+            self.convertXMLEntities = False
+            self.convertHTMLEntities = False
+            self.escapeUnrecognizedEntities = False
+
         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
         SGMLParser.__init__(self)
-            
+
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         self.markup = markup
@@ -947,7 +1059,17 @@
         except StopParsing:
             pass
         self.markup = None                 # The markup can now be GCed
-        
+
+    def convert_charref(self, name):
+        """This method fixes a bug in Python's SGMLParser."""
+        try:
+            n = int(name)
+        except ValueError:
+            return
+        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+            return
+        return self.convert_codepoint(n)
+
     def _feed(self, inDocumentEncoding=None):
         # Convert the document to Unicode.
         markup = self.markup
@@ -963,9 +1085,15 @@
         if markup:
             if self.markupMassage:
                 if not isList(self.markupMassage):
-                    self.markupMassage = self.MARKUP_MASSAGE            
+                    self.markupMassage = self.MARKUP_MASSAGE
                 for fix, m in self.markupMassage:
                     markup = fix.sub(m, markup)
+                # TODO: We get rid of markupMassage so that the
+                # soup object can be deepcopied later on. Some
+                # Python installations can't copy regexes. If anyone
+                # was relying on the existence of markupMassage, this
+                # might cause problems.
+                del(self.markupMassage)
         self.reset()
 
         SGMLParser.feed(self, markup)
@@ -992,7 +1120,7 @@
         self-closing tag according to this parser."""
         return self.SELF_CLOSING_TAGS.has_key(name) \
                or self.instanceSelfClosingTags.has_key(name)
-            
+
     def reset(self):
         Tag.__init__(self, self, self.ROOT_TAG_NAME)
         self.hidden = 1
@@ -1002,7 +1130,7 @@
         self.tagStack = []
         self.quoteStack = []
         self.pushTag(self)
-    
+
     def popTag(self):
         tag = self.tagStack.pop()
         # Tags with just one string-owning child get the child as a
@@ -1020,14 +1148,14 @@
     def pushTag(self, tag):
         #print "Push", tag.name
         if self.currentTag:
-            self.currentTag.append(tag)
+            self.currentTag.contents.append(tag)
         self.tagStack.append(tag)
         self.currentTag = self.tagStack[-1]
 
     def endData(self, containerClass=NavigableString):
         if self.currentData:
             currentData = ''.join(self.currentData)
-            if not currentData.strip():
+            if not currentData.translate(self.STRIP_ASCII_SPACES):
                 if '\n' in currentData:
                     currentData = '\n'
                 else:
@@ -1052,7 +1180,7 @@
         the given tag."""
         #print "Popping to %s" % name
         if name == self.ROOT_TAG_NAME:
-            return            
+            return
 
         numPops = 0
         mostRecentTag = None
@@ -1065,7 +1193,7 @@
 
         for i in range(0, numPops):
             mostRecentTag = self.popTag()
-        return mostRecentTag    
+        return mostRecentTag
 
     def _smartPop(self, name):
 
@@ -1076,10 +1204,9 @@
         comes between this tag and the previous tag of this type.
 
         Examples:
-         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
-         <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
-         <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
-         <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
+         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
+         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
 
          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
@@ -1102,7 +1229,7 @@
                 and p.name in nestingResetTriggers) \
                 or (nestingResetTriggers == None and isResetNesting
                     and self.RESET_NESTING_TAGS.has_key(p.name)):
-                
+
                 #If we encounter one of the nesting reset triggers
                 #peculiar to this tag, or we encounter another tag
                 #that causes nesting to reset, pop up to but not
@@ -1121,7 +1248,7 @@
             #print "<%s> is not real!" % name
             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
             self.handle_data('<%s%s>' % (name, attrs))
-            return        
+            return
         self.endData()
 
         if not self.isSelfClosingTag(name) and not selfClosing:
@@ -1137,7 +1264,7 @@
         self.previous = tag
         self.pushTag(tag)
         if selfClosing or self.isSelfClosingTag(name):
-            self.popTag()                
+            self.popTag()
         if name in self.QUOTE_TAGS:
             #print "Beginning quote (%s)" % name
             self.quoteStack.append(name)
@@ -1172,7 +1299,7 @@
         object, possibly one with a %SOUP-ENCODING% slot into which an
         encoding will be plugged later."""
         if text[:3] == "xml":
-            text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
+            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
         self._toStringSubclass(text, ProcessingInstruction)
 
     def handle_comment(self, text):
@@ -1181,8 +1308,7 @@
 
     def handle_charref(self, ref):
         "Handle character references as data."
-        if self.convertEntities in [self.HTML_ENTITIES,
-                                    self.XML_ENTITIES]:
+        if self.convertEntities:
             data = unichr(int(ref))
         else:
             data = '&#%s;' % ref
@@ -1190,20 +1316,47 @@
 
     def handle_entityref(self, ref):
         """Handle entity references as data, possibly converting known
-        HTML entity references to the corresponding Unicode
+        HTML and/or XML entity references to the corresponding Unicode
         characters."""
         data = None
-        if self.convertEntities == self.HTML_ENTITIES or \
-               (self.convertEntities == self.XML_ENTITIES and \
-                self.XML_ENTITY_LIST.get(ref)):
+        if self.convertHTMLEntities:
             try:
                 data = unichr(name2codepoint[ref])
             except KeyError:
                 pass
+
+        if not data and self.convertXMLEntities:
+                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+        if not data and self.convertHTMLEntities and \
+            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+                # TODO: We've got a problem here. We're told this is
+                # an entity reference, but it's not an XML entity
+                # reference or an HTML entity reference. Nonetheless,
+                # the logical thing to do is to pass it through as an
+                # unrecognized entity reference.
+                #
+                # Except: when the input is "&carol;" this function
+                # will be called with input "carol". When the input is
+                # "AT&T", this function will be called with input
+                # "T". We have no way of knowing whether a semicolon
+                # was present originally, so we don't know whether
+                # this is an unknown entity or just a misplaced
+                # ampersand.
+                #
+                # The more common case is a misplaced ampersand, so I
+                # escape the ampersand and omit the trailing semicolon.
+                data = "&amp;%s" % ref
         if not data:
-            data = '&%s;' % ref
+            # This case is different from the one above, because we
+            # haven't already gone through a supposedly comprehensive
+            # mapping of entities to Unicode characters. We might not
+            # have gone through any mapping at all. So the chances are
+            # very high that this is a real entity, and not a
+            # misplaced ampersand.
+            data = "&%s;" % ref
         self.handle_data(data)
-        
+
     def handle_decl(self, data):
         "Handle DOCTYPEs and the like as Declaration objects."
         self._toStringSubclass(data, Declaration)
@@ -1285,8 +1438,8 @@
                                     ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])
 
-    QUOTE_TAGS = {'script': None}
-    
+    QUOTE_TAGS = {'script' : None, 'textarea' : None}
+
     #According to the HTML standard, each of these inline tags can
     #contain another tag of the same type. Furthermore, it's common
     #to actually use these tags this way.
@@ -1298,7 +1451,7 @@
     #to actually use these tags this way.
     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
 
-    #Lists can contain other lists, but there are restrictions.    
+    #Lists can contain other lists, but there are restrictions.
     NESTABLE_LIST_TAGS = { 'ol' : [],
                            'ul' : [],
                            'li' : ['ul', 'ol'],
@@ -1306,8 +1459,8 @@
                            'dd' : ['dl'],
                            'dt' : ['dl'] }
 
-    #Tables can contain other tables, but there are restrictions.    
-    NESTABLE_TABLE_TAGS = {'table' : [], 
+    #Tables can contain other tables, but there are restrictions.
+    NESTABLE_TABLE_TAGS = {'table' : [],
                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                            'td' : ['tr'],
                            'th' : ['tr'],
@@ -1377,7 +1530,7 @@
 
 class StopParsing(Exception):
     pass
-   
+
 class ICantBelieveItsBeautifulSoup(BeautifulSoup):
 
     """The BeautifulSoup class is oriented towards skipping over
@@ -1423,7 +1576,7 @@
 
     This also makes it better for subclassing than BeautifulStoneSoup
     or BeautifulSoup."""
-    
+
     RESET_NESTING_TAGS = buildTagMap('noscript')
     NESTABLE_TAGS = {}
 
@@ -1453,7 +1606,7 @@
             parent = self.tagStack[-2]
             parent._getAttrMap()
             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
-                isinstance(tag.contents[0], NavigableString) and 
+                isinstance(tag.contents[0], NavigableString) and
                 not parent.attrMap.has_key(tag.name)):
                 parent[tag.name] = tag.contents[0]
         BeautifulStoneSoup.popTag(self)
@@ -1463,7 +1616,7 @@
 #and "unprofessional" for use in enterprise screen-scraping. We feel
 #your pain! For such-minded folk, the Beautiful Soup Consortium And
 #All-Night Kosher Bakery recommends renaming this file to
-#"RobustParser.py" (or, in cases of extreme enterprisness,
+#"RobustParser.py" (or, in cases of extreme enterprisiness,
 #"RobustParserBeanInterface.class") and using the following
 #enterprise-friendly class aliases:
 class RobustXMLParser(BeautifulStoneSoup):
@@ -1530,9 +1683,9 @@
         self.triedEncodings = []
         if markup == '' or isinstance(markup, unicode):
             self.originalEncoding = None
-            self.unicode = unicode(markup)            
+            self.unicode = unicode(markup)
             return
-        
+
         u = None
         for proposedEncoding in overrideEncodings:
             u = self._convertFrom(proposedEncoding)
@@ -1541,7 +1694,7 @@
             for proposedEncoding in (documentEncoding, sniffedEncoding):
                 u = self._convertFrom(proposedEncoding)
                 if u: break
-                
+
         # If no luck and we have auto-detection library, try that:
         if not u and chardet and not isinstance(self.markup, unicode):
             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
@@ -1563,9 +1716,9 @@
                 sub = '&#x%s;' % sub[1]
             else:
                 sub = '&%s;' % sub[0]
-        return sub            
+        return sub
 
-    def _convertFrom(self, proposed):        
+    def _convertFrom(self, proposed):
         proposed = self.find_codec(proposed)
         if not proposed or proposed in self.triedEncodings:
             return None
@@ -1584,12 +1737,12 @@
         try:
             # print "Trying to convert document to %s" % proposed
             u = self._toUnicode(markup, proposed)
-            self.markup = u       
+            self.markup = u
             self.originalEncoding = proposed
         except Exception, e:
             # print "That didn't work!"
             # print e
-            return None        
+            return None
         #print "Correct encoding: %s" % proposed
         return self.markup
 
@@ -1617,7 +1770,7 @@
             data = data[4:]
         newdata = unicode(data, encoding)
         return newdata
-    
+
     def _detectEncoding(self, xml_data):
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
@@ -1689,12 +1842,12 @@
                or charset
 
     def _codec(self, charset):
-        if not charset: return charset 
+        if not charset: return charset
         codec = None
         try:
             codecs.lookup(charset)
             codec = charset
-        except LookupError:
+        except (LookupError, ValueError):
             pass
         return codec
 





More information about the Pywikipedia-l mailing list