Revision: 4779 Author: rotem Date: 2007-12-29 17:48:36 +0000 (Sat, 29 Dec 2007)
Log Message: ----------- Update Beautiful Soup to version 3.0.5, per <http://lists.wikimedia.org/pipermail/pywikipedia-l/2007-December/001627.html>.
Modified Paths: -------------- trunk/pywikipedia/BeautifulSoup.py
Modified: trunk/pywikipedia/BeautifulSoup.py =================================================================== --- trunk/pywikipedia/BeautifulSoup.py 2007-12-29 15:50:13 UTC (rev 4778) +++ trunk/pywikipedia/BeautifulSoup.py 2007-12-29 17:48:36 UTC (rev 4779) @@ -11,7 +11,7 @@ structure. An ill-formed XML/HTML document yields a correspondingly ill-formed data structure. If your document is only locally well-formed, you can use this library to find and process the -well-formed part of it. The BeautifulSoup class +well-formed part of it.
Beautiful Soup works with Python 2.2 and up. It has no external dependencies, but you'll have more success at converting data to UTF-8 @@ -24,7 +24,7 @@ http://cjkpython.i18n.org/
Beautiful Soup defines classes for two main parsing strategies: - + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific language that kind of looks like XML.
@@ -40,13 +40,48 @@ documentation: http://www.crummy.com/software/BeautifulSoup/documentation.html
+Here, have some legalese: + +Copyright (c) 2004-2007, Leonard Richardson + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the the Beautiful Soup Consortium and All + Night Kosher Bakery nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. + """ from __future__ import generators
__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.0.4" +__version__ = "3.0.5" __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" -__license__ = "PSF" +__license__ = "New-style BSD"
from sgmllib import SGMLParser, SGMLParseError import codecs @@ -71,7 +106,7 @@
def setup(self, parent=None, previous=None): """Sets up the initial relations between this element and - other elements.""" + other elements.""" self.parent = parent self.previous = previous self.next = None @@ -81,7 +116,7 @@ self.previousSibling = self.parent.contents[-1] self.previousSibling.nextSibling = self
- def replaceWith(self, replaceWith): + def replaceWith(self, replaceWith): oldParent = self.parent myIndex = self.parent.contents.index(self) if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: @@ -92,11 +127,11 @@ # means that when we extract it, the index of this # element will change. myIndex = myIndex - 1 - self.extract() + self.extract() oldParent.insert(myIndex, replaceWith) - + def extract(self): - """Destructively rips this element out of the tree.""" + """Destructively rips this element out of the tree.""" if self.parent: try: self.parent.contents.remove(self) @@ -105,7 +140,7 @@
#Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. Connect - #the two. + #the two. lastChild = self._lastRecursiveChild() nextElement = lastChild.next
@@ -116,12 +151,12 @@ self.previous = None lastChild.next = None
- self.parent = None + self.parent = None if self.previousSibling: self.previousSibling.nextSibling = self.nextSibling if self.nextSibling: self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None + self.previousSibling = self.nextSibling = None
def _lastRecursiveChild(self): "Finds the last element beneath this object to be parsed." @@ -134,12 +169,12 @@ if (isinstance(newChild, basestring) or isinstance(newChild, unicode)) \ and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) + newChild = NavigableString(newChild)
position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent != None: # We're 'inserting' an element that's already one - # of this object's children. + # of this object's children. if newChild.parent == self: index = self.find(newChild) if index and index < position: @@ -149,7 +184,7 @@ # will jump down one. position = position - 1 newChild.extract() - + newChild.parent = self previousChild = None if position == 0: @@ -161,13 +196,13 @@ newChild.previousSibling.nextSibling = newChild newChild.previous = previousChild._lastRecursiveChild() if newChild.previous: - newChild.previous.next = newChild + newChild.previous.next = newChild
newChildsLastElement = newChild._lastRecursiveChild()
if position >= len(self.contents): newChild.nextSibling = None - + parent = self parentsNextSibling = None while not parentsNextSibling: @@ -180,8 +215,8 @@ else: newChildsLastElement.next = None else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild + nextChild = self.contents[position] + newChild.nextSibling = nextChild if newChild.nextSibling: newChild.nextSibling.previousSibling = newChild newChildsLastElement.next = nextChild @@ -190,6 +225,10 @@ newChildsLastElement.next.previous = newChildsLastElement self.contents.insert(position, newChild)
+ def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + def findNext(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears after this Tag in the document.""" @@ -269,7 +308,7 @@ if l: r = l[0] return r - + def _findAll(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match."
@@ -294,7 +333,7 @@ return results
#These Generators can be used to navigate starting from both - #NavigableStrings and Tags. + #NavigableStrings and Tags. def nextGenerator(self): i = self while i: @@ -328,7 +367,7 @@ # Utility methods def substituteEncoding(self, str, encoding=None): encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) + return str.replace("%SOUP-ENCODING%", encoding)
def toEncoding(self, s, encoding=None): """Encodes an object to a string in some encoding, or to Unicode. @@ -350,6 +389,9 @@
class NavigableString(unicode, PageElement):
+ def __getnewargs__(self): + return (NavigableString.__str__(self),) + def __getattr__(self, attr): """text.string gives you text. This is for backwards compatibility for Navigable*String, but for CData* it lets you @@ -360,14 +402,14 @@ raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
def __unicode__(self): - return self.__str__(None) + return unicode(str(self))
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): if encoding: return self.encode(encoding) else: return self - + class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): @@ -382,22 +424,56 @@
class Comment(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "<!--%s-->" % NavigableString.__str__(self, encoding) + return "<!--%s-->" % NavigableString.__str__(self, encoding)
class Declaration(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): - return "<!%s>" % NavigableString.__str__(self, encoding) + return "<!%s>" % NavigableString.__str__(self, encoding)
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
- XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot", - '"' : "quote", - "&" : "amp", - "<" : "lt", - ">" : "gt" } + def _invert(h): + "Cheap function to invert a hash." + i = {} + for k,v in h.items(): + i[v] = k + return i
+ XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", + "quot" : '"', + "amp" : "&", + "lt" : "<", + "gt" : ">" } + + XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) + + def _convertEntities(self, match): + """Used in a call to re.sub to replace HTML, XML, and numeric + entities with the appropriate Unicode characters. If HTML + entities are being converted, any unrecognized entities are + escaped.""" + x = match.group(1) + if self.convertHTMLEntities and x in name2codepoint: + return unichr(name2codepoint[x]) + elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: + if self.convertXMLEntities: + return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] + else: + return u'&%s;' % x + elif len(x) > 0 and x[0] == '#': + # Handle numeric entities + if len(x) > 1 and x[1] == 'x': + return unichr(int(x[2:], 16)) + else: + return unichr(int(x[1:])) + + elif self.escapeUnrecognizedEntities: + return u'&%s;' % x + else: + return u'&%s;' % x + def __init__(self, parser, name, attrs=None, parent=None, previous=None): "Basic constructor." @@ -414,12 +490,22 @@ self.setup(parent, previous) self.hidden = False self.containsSubstitutions = False + self.convertHTMLEntities = parser.convertHTMLEntities + self.convertXMLEntities = parser.convertXMLEntities + self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+ # Convert any HTML, XML, or numeric entities in the attribute values. + convert = lambda(k, val): (k, + re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", + self._convertEntities, + val)) + self.attrs = map(convert, self.attrs) + def get(self, key, default=None): """Returns the value of the 'key' attribute for the tag, or the value given for 'default' if it doesn't have that attribute.""" - return self._getAttrMap().get(key, default) + return self._getAttrMap().get(key, default)
def has_key(self, key): return self._getAttrMap().has_key(key) @@ -444,7 +530,7 @@ "A tag is non-None even if it has no contents." return True
- def __setitem__(self, key, value): + def __setitem__(self, key, value): """Setting tag[key] sets the value of the 'key' attribute for the tag.""" self._getAttrMap() @@ -481,6 +567,7 @@ return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) + raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, @@ -507,6 +594,15 @@ def __unicode__(self): return self.__str__(None)
+ BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + + ")") + + def _sub_entity(self, x): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" + def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): """Returns a string or Unicode representation of this tag and @@ -522,7 +618,7 @@ if self.attrs: for key, val in self.attrs: fmt = '%s="%s"' - if isString(val): + if isString(val): if self.containsSubstitutions and '%SOUP-ENCODING%' in val: val = self.substituteEncoding(val, encoding)
@@ -543,19 +639,17 @@ # embedded single quotes to XML entities. if '"' in val: fmt = "%s='%s'" - # This can't happen naturally, but it can happen - # if you modify an attribute value after parsing. if "'" in val: + # TODO: replace with apos when + # appropriate. val = val.replace("'", "&squot;")
# Now we're okay w/r/t quotes. But the attribute # value might also contain angle brackets, or # ampersands that aren't part of entities. We need # to escape those to XML entities too. - val = re.sub("([<>]|&(?![^\s]+;))", - lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";", - val) - + val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) + attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) close = '' @@ -577,7 +671,7 @@ s = [] attributeString = '' if attrs: - attributeString = ' ' + ' '.join(attrs) + attributeString = ' ' + ' '.join(attrs) if prettyPrint: s.append(space) s.append('<%s%s%s>' % (encodedName, attributeString, close)) @@ -609,14 +703,14 @@ elif isinstance(c, Tag): s.append(c.__str__(encoding, prettyPrint, indentLevel)) if text and prettyPrint: - text = text.strip() + text = text.strip() if text: if prettyPrint: s.append(" " * (indentLevel-1)) s.append(text) if prettyPrint: s.append("\n") - return ''.join(s) + return ''.join(s)
#Soup methods
@@ -651,19 +745,13 @@ # Pre-3.x compatibility methods first = find fetch = findAll - + def fetchText(self, text=None, recursive=True, limit=None): return self.findAll(text=text, recursive=recursive, limit=limit)
def firstText(self, text=None, recursive=True): return self.find(text=text, recursive=recursive) - - #Utility methods
- def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.contents.append(tag) - #Private methods
def _getAttrMap(self): @@ -672,7 +760,7 @@ if not getattr(self, 'attrMap'): self.attrMap = {} for (key, value) in self.attrs: - self.attrMap[key] = value + self.attrMap[key] = value return self.attrMap
#Generator methods @@ -680,12 +768,12 @@ for i in range(0, len(self.contents)): yield self.contents[i] raise StopIteration - + def recursiveChildGenerator(self): stack = [(self, 0)] while stack: tag, start = stack.pop() - if isinstance(tag, Tag): + if isinstance(tag, Tag): for i in range(start, len(tag.contents)): a = tag.contents[i] yield a @@ -720,7 +808,7 @@ return self.text else: return "%s|%s" % (self.name, self.attrs) - + def searchTag(self, markupName=None, markupAttrs={}): found = None markup = None @@ -737,7 +825,7 @@ if callFunctionWithTagData: match = self.name(markupName, markupAttrs) else: - match = True + match = True markupAttrMap = None for attr, matchAgainst in self.attrs.items(): if not markupAttrMap: @@ -762,7 +850,7 @@ #print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that - # matches. + # matches. if isList(markup) and not isinstance(markup, Tag): for element in markup: if isinstance(element, NavigableString) \ @@ -783,8 +871,8 @@ raise Exception, "I don't know how to match against a %s" \ % markup.__class__ return found - - def _matches(self, markup, matchAgainst): + + def _matches(self, markup, matchAgainst): #print "Matching %s against %s" % (markup, matchAgainst) result = False if matchAgainst == True and type(matchAgainst) == types.BooleanType: @@ -835,7 +923,7 @@ """Convenience method that works with all 2.x versions of Python to determine whether or not something is stringlike.""" try: - return isinstance(s, unicode) or isintance(s, basestring) + return isinstance(s, unicode) or isinstance(s, basestring) except NameError: return isinstance(s, str)
@@ -865,7 +953,7 @@ """This class contains the basic parser and search code. It defines a parser that knows nothing about tag behavior except for the following: - + You can't close a tag without closing all the tags it encloses. That is, "<foo><bar></foo>" actually means "<foo><bar></bar></foo>". @@ -878,10 +966,6 @@ or when BeautifulSoup makes an assumption counter to what you were expecting."""
- XML_ENTITY_LIST = {} - for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values(): - XML_ENTITY_LIST[i] = True - SELF_CLOSING_TAGS = {} NESTABLE_TAGS = {} RESET_NESTING_TAGS = {} @@ -897,13 +981,22 @@
HTML_ENTITIES = "html" XML_ENTITIES = "xml" + XHTML_ENTITIES = "xhtml" + # TODO: This only exists for backwards-compatibility + ALL_ENTITIES = XHTML_ENTITIES
+ # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, convertEntities=None, selfClosingTags=None): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) - is fed into the underlying parser. + is fed into the underlying parser.
sgmllib will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills @@ -930,14 +1023,33 @@ self.fromEncoding = fromEncoding self.smartQuotesTo = smartQuotesTo self.convertEntities = convertEntities + # Set the rules for how we'll deal with the entities we + # encounter if self.convertEntities: # It doesn't make sense to convert encoded characters to # entities even while you're converting entities to Unicode. # Just convert it all to Unicode. self.smartQuotesTo = None + if convertEntities == self.HTML_ENTITIES: + self.convertXMLEntities = False + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = True + elif convertEntities == self.XHTML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = False + elif convertEntities == self.XML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + else: + self.convertXMLEntities = False + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) SGMLParser.__init__(self) - + if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() self.markup = markup @@ -947,7 +1059,17 @@ except StopParsing: pass self.markup = None # The markup can now be GCed - + + def convert_charref(self, name): + """This method fixes a bug in Python's SGMLParser.""" + try: + n = int(name) + except ValueError: + return + if not 0 <= n <= 127 : # ASCII ends at 127, not 255 + return + return self.convert_codepoint(n) + def _feed(self, inDocumentEncoding=None): # Convert the document to Unicode. 
markup = self.markup @@ -963,9 +1085,15 @@ if markup: if self.markupMassage: if not isList(self.markupMassage): - self.markupMassage = self.MARKUP_MASSAGE + self.markupMassage = self.MARKUP_MASSAGE for fix, m in self.markupMassage: markup = fix.sub(m, markup) + # TODO: We get rid of markupMassage so that the + # soup object can be deepcopied later on. Some + # Python installations can't copy regexes. If anyone + # was relying on the existence of markupMassage, this + # might cause problems. + del(self.markupMassage) self.reset()
SGMLParser.feed(self, markup) @@ -992,7 +1120,7 @@ self-closing tag according to this parser.""" return self.SELF_CLOSING_TAGS.has_key(name) \ or self.instanceSelfClosingTags.has_key(name) - + def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 @@ -1002,7 +1130,7 @@ self.tagStack = [] self.quoteStack = [] self.pushTag(self) - + def popTag(self): tag = self.tagStack.pop() # Tags with just one string-owning child get the child as a @@ -1020,14 +1148,14 @@ def pushTag(self, tag): #print "Push", tag.name if self.currentTag: - self.currentTag.append(tag) + self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString): if self.currentData: currentData = ''.join(self.currentData) - if not currentData.strip(): + if not currentData.translate(self.STRIP_ASCII_SPACES): if '\n' in currentData: currentData = '\n' else: @@ -1052,7 +1180,7 @@ the given tag.""" #print "Popping to %s" % name if name == self.ROOT_TAG_NAME: - return + return
numPops = 0 mostRecentTag = None @@ -1065,7 +1193,7 @@
for i in range(0, numPops): mostRecentTag = self.popTag() - return mostRecentTag + return mostRecentTag
def _smartPop(self, name):
@@ -1076,10 +1204,9 @@ comes between this tag and the previous tag of this type.
Examples: - <p>Foo<b>Bar<p> should pop to 'p', not 'b'. - <p>Foo<table>Bar<p> should pop to 'table', not 'p'. - <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'. - <p>Foo<b>Bar<p> should pop to 'p', not 'b'. + <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. + <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. + <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'. <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' @@ -1102,7 +1229,7 @@ and p.name in nestingResetTriggers) \ or (nestingResetTriggers == None and isResetNesting and self.RESET_NESTING_TAGS.has_key(p.name)): - + #If we encounter one of the nesting reset triggers #peculiar to this tag, or we encounter another tag #that causes nesting to reset, pop up to but not @@ -1121,7 +1248,7 @@ #print "<%s> is not real!" % name attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) self.handle_data('<%s%s>' % (name, attrs)) - return + return self.endData()
if not self.isSelfClosingTag(name) and not selfClosing: @@ -1137,7 +1264,7 @@ self.previous = tag self.pushTag(tag) if selfClosing or self.isSelfClosingTag(name): - self.popTag() + self.popTag() if name in self.QUOTE_TAGS: #print "Beginning quote (%s)" % name self.quoteStack.append(name) @@ -1172,7 +1299,7 @@ object, possibly one with a %SOUP-ENCODING% slot into which an encoding will be plugged later.""" if text[:3] == "xml": - text = "xml version='1.0' encoding='%SOUP-ENCODING%'" + text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text): @@ -1181,8 +1308,7 @@
def handle_charref(self, ref): "Handle character references as data." - if self.convertEntities in [self.HTML_ENTITIES, - self.XML_ENTITIES]: + if self.convertEntities: data = unichr(int(ref)) else: data = '&#%s;' % ref @@ -1190,20 +1316,47 @@
def handle_entityref(self, ref): """Handle entity references as data, possibly converting known - HTML entity references to the corresponding Unicode + HTML and/or XML entity references to the corresponding Unicode characters.""" data = None - if self.convertEntities == self.HTML_ENTITIES or \ - (self.convertEntities == self.XML_ENTITIES and \ - self.XML_ENTITY_LIST.get(ref)): + if self.convertHTMLEntities: try: data = unichr(name2codepoint[ref]) except KeyError: pass + + if not data and self.convertXMLEntities: + data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + + if not data and self.convertHTMLEntities and \ + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. + # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref if not data: - data = '&%s;' % ref + # This case is different from the one above, because we + # haven't already gone through a supposedly comprehensive + # mapping of entities to Unicode characters. We might not + # have gone through any mapping at all. So the chances are + # very high that this is a real entity, and not a + # misplaced ampersand. + data = "&%s;" % ref self.handle_data(data) - + def handle_decl(self, data): "Handle DOCTYPEs and the like as Declaration objects." 
self._toStringSubclass(data, Declaration) @@ -1285,8 +1438,8 @@ ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'])
- QUOTE_TAGS = {'script': None} - + QUOTE_TAGS = {'script' : None, 'textarea' : None} + #According to the HTML standard, each of these inline tags can #contain another tag of the same type. Furthermore, it's common #to actually use these tags this way. @@ -1298,7 +1451,7 @@ #to actually use these tags this way. NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
- #Lists can contain other lists, but there are restrictions. + #Lists can contain other lists, but there are restrictions. NESTABLE_LIST_TAGS = { 'ol' : [], 'ul' : [], 'li' : ['ul', 'ol'], @@ -1306,8 +1459,8 @@ 'dd' : ['dl'], 'dt' : ['dl'] }
- #Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], + #Tables can contain other tables, but there are restrictions. + NESTABLE_TABLE_TAGS = {'table' : [], 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 'td' : ['tr'], 'th' : ['tr'], @@ -1377,7 +1530,7 @@
class StopParsing(Exception): pass - + class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over @@ -1423,7 +1576,7 @@
This also makes it better for subclassing than BeautifulStoneSoup or BeautifulSoup.""" - + RESET_NESTING_TAGS = buildTagMap('noscript') NESTABLE_TAGS = {}
@@ -1453,7 +1606,7 @@ parent = self.tagStack[-2] parent._getAttrMap() if (isinstance(tag, Tag) and len(tag.contents) == 1 and - isinstance(tag.contents[0], NavigableString) and + isinstance(tag.contents[0], NavigableString) and not parent.attrMap.has_key(tag.name)): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self) @@ -1463,7 +1616,7 @@ #and "unprofessional" for use in enterprise screen-scraping. We feel #your pain! For such-minded folk, the Beautiful Soup Consortium And #All-Night Kosher Bakery recommends renaming this file to -#"RobustParser.py" (or, in cases of extreme enterprisness, +#"RobustParser.py" (or, in cases of extreme enterprisiness, #"RobustParserBeanInterface.class") and using the following #enterprise-friendly class aliases: class RobustXMLParser(BeautifulStoneSoup): @@ -1530,9 +1683,9 @@ self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): self.originalEncoding = None - self.unicode = unicode(markup) + self.unicode = unicode(markup) return - + u = None for proposedEncoding in overrideEncodings: u = self._convertFrom(proposedEncoding) @@ -1541,7 +1694,7 @@ for proposedEncoding in (documentEncoding, sniffedEncoding): u = self._convertFrom(proposedEncoding) if u: break - + # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): u = self._convertFrom(chardet.detect(self.markup)['encoding']) @@ -1563,9 +1716,9 @@ sub = '&#x%s;' % sub[1] else: sub = '&%s;' % sub[0] - return sub + return sub
- def _convertFrom(self, proposed): + def _convertFrom(self, proposed): proposed = self.find_codec(proposed) if not proposed or proposed in self.triedEncodings: return None @@ -1584,12 +1737,12 @@ try: # print "Trying to convert document to %s" % proposed u = self._toUnicode(markup, proposed) - self.markup = u + self.markup = u self.originalEncoding = proposed except Exception, e: # print "That didn't work!" # print e - return None + return None #print "Correct encoding: %s" % proposed return self.markup
@@ -1617,7 +1770,7 @@ data = data[4:] newdata = unicode(data, encoding) return newdata - + def _detectEncoding(self, xml_data): """Given a document, tries to detect its XML encoding.""" xml_encoding = sniffed_xml_encoding = None @@ -1689,12 +1842,12 @@ or charset
def _codec(self, charset): - if not charset: return charset + if not charset: return charset codec = None try: codecs.lookup(charset) codec = charset - except LookupError: + except (LookupError, ValueError): pass return codec
pywikipedia-l@lists.wikimedia.org