Revision: 6842
Author:   alexsh
Date:     2009-05-06 19:12:46 +0000 (Wed, 06 May 2009)

Log Message:
-----------
update BeautifulSoup to 3.1.0.1

Modified Paths:
--------------
    trunk/pywikipedia/BeautifulSoup.py

Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py	2009-05-06 17:47:16 UTC (rev 6841)
+++ trunk/pywikipedia/BeautifulSoup.py	2009-05-06 19:12:46 UTC (rev 6842)
@@ -42,7 +42,7 @@
Here, have some legalese:
-Copyright (c) 2004-2007, Leonard Richardson
+Copyright (c) 2004-2009, Leonard Richardson
All rights reserved.
@@ -79,27 +79,38 @@
 from __future__ import generators
__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "3.0.6" -__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson" +__version__ = "3.1.0.1" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" __license__ = "New-style BSD"
-from sgmllib import SGMLParser, SGMLParseError import codecs +import markupbase import types import re -import sgmllib +from HTMLParser import HTMLParser, HTMLParseError try: - from htmlentitydefs import name2codepoint + from htmlentitydefs import name2codepoint except ImportError: - name2codepoint = {} + name2codepoint = {} +try: + set +except NameError: + from sets import Set as set
-#This hack makes Beautiful Soup able to parse XML with namespaces
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+#These hacks make Beautiful Soup able to parse XML with namespaces
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
# First, the classes that represent markup elements.
+def sob(unicode, encoding):
+    """Returns either the given Unicode string or its encoding."""
+    if encoding is None:
+        return unicode
+    else:
+        return unicode.encode(encoding)
+
 class PageElement:
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -391,8 +402,20 @@
class NavigableString(unicode, PageElement):
+    def __new__(cls, value):
+        """Create a new NavigableString.
+
+        When unpickling a NavigableString, this method is called with
+        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+        passed in to the superclass's __new__ or the superclass won't know
+        how to handle non-ASCII characters.
+        """
+        if isinstance(value, unicode):
+            return unicode.__new__(cls, value)
+        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
     def __getnewargs__(self):
-        return (NavigableString.__str__(self),)
+        return (unicode(self),)
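(Not part of the committed diff — a minimal usage sketch.) The new __new__/__getnewargs__ pair is there so NavigableString instances survive pickling even when they carry non-ASCII text; assuming the module is importable as BeautifulSoup:

    import pickle
    from BeautifulSoup import NavigableString

    text = NavigableString(u'caf\u00e9')
    # __getnewargs__ hands the plain Unicode value back to __new__ on load.
    copy = pickle.loads(pickle.dumps(text))
    assert copy == text and isinstance(copy, NavigableString)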
def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -403,34 +426,32 @@ else: raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
-    def __unicode__(self):
-        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return self.decode().encode(encoding)
 
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        if encoding:
-            return self.encode(encoding)
-        else:
-            return self
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return self
class CData(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<![CDATA[' + self + u']]>'
 
 class ProcessingInstruction(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+
+    def decodeGivenEventualEncoding(self, eventualEncoding):
         output = self
-        if "%SOUP-ENCODING%" in output:
-            output = self.substituteEncoding(output, encoding)
-        return "<?%s?>" % self.toEncoding(output, encoding)
+        if u'%SOUP-ENCODING%' in output:
+            output = self.substituteEncoding(output, eventualEncoding)
+        return u'<?' + output + u'?>'
 
 class Comment(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<!--%s-->" % NavigableString.__str__(self, encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!--' + self + u'-->'
 
 class Declaration(NavigableString):
-    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return "<!%s>" % NavigableString.__str__(self, encoding)
+    def decodeGivenEventualEncoding(self, eventualEncoding):
+        return u'<!' + self + u'>'
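(Illustrative sketch, not in the commit.) The decodeGivenEventualEncoding() hooks replace the old per-class __str__ overrides: each NavigableString subclass now contributes a Unicode fragment complete with its wrapper, and encoding happens once at the top. Expected behaviour:

    from BeautifulSoup import BeautifulStoneSoup

    soup = BeautifulStoneSoup("<a><!--note--><![CDATA[data]]></a>")
    # Comments and CDATA sections keep their wrappers in the Unicode output.
    print unicode(soup)   # expected: <a><!--note--><![CDATA[data]]></a>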
class Tag(PageElement):
@@ -496,11 +517,13 @@ self.convertXMLEntities = parser.convertXMLEntities self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
- # Convert any HTML, XML, or numeric entities in the attribute values. - convert = lambda(k, val): (k, - re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", - self._convertEntities, - val)) + def convert(kval): + "Converts HTML, XML and numeric entities in the attribute value." + k, val = kval + if val is None: + return kval + return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", + self._convertEntities, val)) self.attrs = map(convert, self.attrs)
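(Illustrative sketch, not in the commit.) The None check in convert() lets valueless (boolean) attributes pass through entity conversion instead of failing, and the decode() change further down renders them as bare attribute names:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<option selected>one</option>')
    print soup.option   # expected: <option selected>one</option>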
def get(self, key, default=None): @@ -591,11 +614,8 @@
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): """Renders this tag as a string.""" - return self.__str__(encoding) + return self.decode(eventualEncoding=encoding)
- def __unicode__(self): - return self.__str__(None) - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") @@ -605,24 +625,30 @@ appropriate XML entity for an XML special character.""" return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, - prettyPrint=False, indentLevel=0): - """Returns a string or Unicode representation of this tag and - its contents. To get Unicode, pass None for encoding. + def __unicode__(self): + return self.decode()
- NOTE: since Python's HTML parser consumes whitespace, this - method is not certain to reproduce the whitespace present in - the original string.""" + def __str__(self): + return self.encode()
- encodedName = self.toEncoding(self.name, encoding) + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+ def decode(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): + """Returns a string or Unicode representation of this tag and + its contents. To get Unicode, pass None for encoding.""" + attrs = [] if self.attrs: for key, val in self.attrs: fmt = '%s="%s"' if isString(val): - if self.containsSubstitutions and '%SOUP-ENCODING%' in val: - val = self.substituteEncoding(val, encoding) + if (self.containsSubstitutions + and eventualEncoding is not None + and '%SOUP-ENCODING%' in val): + val = self.substituteEncoding(val, eventualEncoding)
# The attribute value either: # @@ -651,22 +677,26 @@ # ampersands that aren't part of entities. We need # to escape those to XML entities too. val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) - - attrs.append(fmt % (self.toEncoding(key, encoding), - self.toEncoding(val, encoding))) + if val is None: + # Handle boolean attributes. + decoded = key + else: + decoded = fmt % (key, val) + attrs.append(decoded) close = '' closeTag = '' if self.isSelfClosing: close = ' /' else: - closeTag = '</%s>' % encodedName + closeTag = '</%s>' % self.name
indentTag, indentContents = 0, 0 if prettyPrint: indentTag = indentLevel space = (' ' * (indentTag-1)) indentContents = indentTag + 1 - contents = self.renderContents(encoding, prettyPrint, indentContents) + contents = self.decodeContents(prettyPrint, indentContents, + eventualEncoding) if self.hidden: s = contents else: @@ -676,7 +706,7 @@ attributeString = ' ' + ' '.join(attrs) if prettyPrint: s.append(space) - s.append('<%s%s%s>' % (encodedName, attributeString, close)) + s.append('<%s%s%s>' % (self.name, attributeString, close)) if prettyPrint: s.append("\n") s.append(contents) @@ -701,19 +731,23 @@ self.extract()
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.__str__(encoding, True) + return self.encode(encoding, True)
- def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): + return self.decodeContents(prettyPrint, indentLevel).encode(encoding) + + def decodeContents(self, prettyPrint=False, indentLevel=0, + eventualEncoding=DEFAULT_OUTPUT_ENCODING): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" s=[] for c in self: text = None if isinstance(c, NavigableString): - text = c.__str__(encoding) + text = c.decodeGivenEventualEncoding(eventualEncoding) elif isinstance(c, Tag): - s.append(c.__str__(encoding, prettyPrint, indentLevel)) + s.append(c.decode(prettyPrint, indentLevel, eventualEncoding)) if text and prettyPrint: text = text.strip() if text: @@ -754,7 +788,7 @@ return self._findAll(name, attrs, text, limit, generator, **kwargs) findChildren = findAll
- # Pre-3.x compatibility methods + # Pre-3.x compatibility methods. Will go away in 4.0. first = find fetch = findAll
@@ -764,6 +798,15 @@ def firstText(self, text=None, recursive=True): return self.find(text=text, recursive=recursive)
+ # 3.x compatibility methods. Will go away in 4.0. + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if encoding is None: + return self.decodeContents(prettyPrint, indentLevel, encoding) + else: + return self.encodeContents(encoding, prettyPrint, indentLevel) + + #Private methods
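(Usage sketch, not part of the diff itself.) Taken together, the hunks above replace the old __str__(encoding) API: decode()/decodeContents() return Unicode, encode()/encodeContents() return byte strings, __str__/__unicode__ delegate to them, and renderContents() stays as a 3.x shim:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(u"<p>caf\u00e9</p>")
    tag = soup.p
    assert isinstance(tag.decode(), unicode)                   # Unicode rendering
    assert str(tag) == tag.encode()                            # UTF-8 bytes by default
    assert unicode(tag) == tag.decode()
    assert tag.renderContents(None) == tag.decodeContents()    # compatibility shim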
def _getAttrMap(self): @@ -776,26 +819,24 @@ return self.attrMap
#Generator methods + def recursiveChildGenerator(self): + if not len(self.contents): + raise StopIteration + stopNode = self._lastRecursiveChild().next + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next + def childGenerator(self): - for i in range(0, len(self.contents)): - yield self.contents[i] + if not len(self.contents): + raise StopIteration + current = self.contents[0] + while current: + yield current + current = current.nextSibling raise StopIteration
- def recursiveChildGenerator(self): - stack = [(self, 0)] - while stack: - tag, start = stack.pop() - if isinstance(tag, Tag): - for i in range(start, len(tag.contents)): - a = tag.contents[i] - yield a - if isinstance(a, Tag) and tag.contents: - if i < len(tag.contents) - 1: - stack.append((tag, i+1)) - stack.append((a, 0)) - break - raise StopIteration - # Next, a couple classes to represent queries and their results. class SoupStrainer: """Encapsulates a number of ways of matching a markup element (tag or @@ -896,13 +937,14 @@ #other ways of matching match the tag name as a string. if isinstance(markup, Tag): markup = markup.name - if markup and not isString(markup): + if markup is not None and not isString(markup): markup = unicode(markup) #Now we know that chunk is either a string, or None. if hasattr(matchAgainst, 'match'): # It's a regexp object. result = markup and matchAgainst.search(markup) - elif isList(matchAgainst): + elif (isList(matchAgainst) + and (markup is not None or not isString(matchAgainst))): result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): result = markup.has_key(matchAgainst) @@ -928,8 +970,8 @@ def isList(l): """Convenience method that works with all 2.x versions of Python to determine whether or not something is listlike.""" - return hasattr(l, '__iter__') \ - or (type(l) in (types.ListType, types.TupleType)) + return ((hasattr(l, '__iter__') and not isString(l)) + or (type(l) in (types.ListType, types.TupleType)))
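(Usage sketch, not part of the diff.) The rewritten generators walk the tree through the next/nextSibling pointers instead of recursing over contents lists; the calling code stays the same:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup("<div><p>one</p><p>two</p></div>")
    # Direct children of the <div> only:
    print [unicode(c) for c in soup.div.childGenerator()]
    # Every descendant, in document order:
    print [unicode(c) for c in soup.div.recursiveChildGenerator()]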
def isString(s): """Convenience method that works with all 2.x versions of Python @@ -949,7 +991,7 @@ #It's a map. Merge it. for k,v in portion.items(): built[k] = v - elif isList(portion): + elif isList(portion) and not isString(portion): #It's a list. Map each item to the default. for k in portion: built[k] = default @@ -960,8 +1002,123 @@
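(Illustrative, not in the commit.) The isList()/buildTagMap() tweaks make sure a plain string is never mistaken for a list of candidate names, which is the distinction searches like these rely on:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup("<a>1</a><b>2</b><i>3</i>")
    print soup.findAll(['a', 'b'])   # a real list matches any of the listed names
    print soup.findAll('b')          # a bare string matches only that tag name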
# Now, the parser classes.
-class BeautifulStoneSoup(Tag, SGMLParser): +class HTMLParserBuilder(HTMLParser):
+ def __init__(self, soup): + HTMLParser.__init__(self) + self.soup = soup + + # We inherit feed() and reset(). + + def handle_starttag(self, name, attrs): + if name == 'meta': + self.soup.extractCharsetFromMeta(attrs) + else: + self.soup.unknown_starttag(name, attrs) + + def handle_endtag(self, name): + self.soup.unknown_endtag(name) + + def handle_data(self, content): + self.soup.handle_data(content) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.handle_data(text) + self.soup.endData(subclass) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." + if self.soup.convertEntities: + data = unichr(int(ref)) + else: + data = '&#%s;' % ref + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML and/or XML entity references to the corresponding Unicode + characters.""" + data = None + if self.soup.convertHTMLEntities: + try: + data = unichr(name2codepoint[ref]) + except KeyError: + pass + + if not data and self.soup.convertXMLEntities: + data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + + if not data and self.soup.convertHTMLEntities and \ + not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. + # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref + if not data: + # This case is different from the one above, because we + # haven't already gone through a supposedly comprehensive + # mapping of entities to Unicode characters. We might not + # have gone through any mapping at all. So the chances are + # very high that this is a real entity, and not a + # misplaced ampersand. + data = "&%s;" % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. 
Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = HTMLParser.parse_declaration(self, i) + except HTMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + + +class BeautifulStoneSoup(Tag): + """This class contains the basic parser and search code. It defines a parser that knows nothing about tag behavior except for the following: @@ -982,6 +1139,7 @@ NESTABLE_TAGS = {} RESET_NESTING_TAGS = {} QUOTE_TAGS = {} + PRESERVE_WHITESPACE_TAGS = []
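(Hedged sketch, not in the commit; assumes the convertEntities plumbing outside this hunk is unchanged.) Entity and character-reference handling now lives on HTMLParserBuilder and consults the soup's convert* flags:

    from BeautifulSoup import BeautifulSoup

    print BeautifulSoup("&eacute; &amp; &#233;",
                        convertEntities=BeautifulSoup.HTML_ENTITIES)
    # expected: the references come out as the characters they name
    print BeautifulSoup("&eacute;")
    # expected: &eacute; is passed through untouched by default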
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), lambda x: x.group(1) + ' />'), @@ -1005,14 +1163,15 @@
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, - convertEntities=None, selfClosingTags=None): + convertEntities=None, selfClosingTags=None, isHTML=False, + builder=HTMLParserBuilder): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.
- sgmllib will process most bad HTML, and the BeautifulSoup + HTMLParser will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills - sgmllib, but Beautiful Soup can nonetheless choke or lose data + HTMLParser, but Beautiful Soup can nonetheless choke or lose data if your data uses self-closing tags or declarations incorrectly.
@@ -1022,7 +1181,7 @@ you'll get better performance.
The default parser massage techniques fix the two most common - instances of invalid HTML that choke sgmllib: + instances of invalid HTML that choke HTMLParser:
<br/> (No space between name of closing tag and tag close) <! --Comment--> (Extraneous whitespace in declaration) @@ -1060,29 +1219,21 @@ self.escapeUnrecognizedEntities = False
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) - SGMLParser.__init__(self) + self.builder = builder(self) + self.reset()
if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() self.markup = markup self.markupMassage = markupMassage try: - self._feed() + self._feed(isHTML=isHTML) except StopParsing: pass - self.markup = None # The markup can now be GCed + self.markup = None # The markup can now be GCed. + self.builder = None # So can the builder.
- def convert_charref(self, name): - """This method fixes a bug in Python's SGMLParser.""" - try: - n = int(name) - except ValueError: - return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 - return - return self.convert_codepoint(n) - - def _feed(self, inDocumentEncoding=None): + def _feed(self, inDocumentEncoding=None, isHTML=False): # Convert the document to Unicode. markup = self.markup if isinstance(markup, unicode): @@ -1091,9 +1242,10 @@ else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo) + smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding + self.declaredHTMLEncoding = dammit.declaredHTMLEncoding if markup: if self.markupMassage: if not isList(self.markupMassage): @@ -1106,27 +1258,14 @@ # was relying on the existence of markupMassage, this # might cause problems. del(self.markupMassage) - self.reset() + self.builder.reset()
- SGMLParser.feed(self, markup) + self.builder.feed(markup) # Close out any unfinished strings and close all the open tags. self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: self.popTag()
- def __getattr__(self, methodName): - """This method routes method call requests to either the SGMLParser - superclass or the Tag superclass, depending on the method name.""" - #print "__getattr__ called on %s.%s" % (self.__class__, methodName) - - if methodName.find('start_') == 0 or methodName.find('end_') == 0 \ - or methodName.find('do_') == 0: - return SGMLParser.__getattr__(self, methodName) - elif methodName.find('__') != 0: - return Tag.__getattr__(self, methodName) - else: - raise AttributeError - def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" @@ -1136,7 +1275,7 @@ def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) self.hidden = 1 - SGMLParser.reset(self) + self.builder.reset() self.currentData = [] self.currentTag = None self.tagStack = [] @@ -1166,8 +1305,10 @@
def endData(self, containerClass=NavigableString): if self.currentData: - currentData = ''.join(self.currentData) - if not currentData.translate(self.STRIP_ASCII_SPACES): + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not set([tag.name for tag in self.tagStack]).intersection( + self.PRESERVE_WHITESPACE_TAGS)): if '\n' in currentData: currentData = '\n' else: @@ -1299,100 +1440,10 @@ def handle_data(self, data): self.currentData.append(data)
- def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.endData() - self.handle_data(text) - self.endData(subclass) + def extractCharsetFromMeta(self, attrs): + self.unknown_starttag('meta', attrs)
- def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" - if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" - self._toStringSubclass(text, ProcessingInstruction)
- def handle_comment(self, text): - "Handle comments as Comment objects." - self._toStringSubclass(text, Comment) - - def handle_charref(self, ref): - "Handle character references as data." - if self.convertEntities: - data = unichr(int(ref)) - else: - data = '&#%s;' % ref - self.handle_data(data) - - def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" - data = None - if self.convertHTMLEntities: - try: - data = unichr(name2codepoint[ref]) - except KeyError: - pass - - if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) - - if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref - if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. - data = "&%s;" % ref - self.handle_data(data) - - def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." - self._toStringSubclass(data, Declaration) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i+9] == '<![CDATA[': - k = self.rawdata.find(']]>', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML: @@ -1444,12 +1495,15 @@ def __init__(self, *args, **kwargs): if not kwargs.has_key('smartQuotesTo'): kwargs['smartQuotesTo'] = self.HTML_ENTITIES + kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None, ['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'])
+ PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + QUOTE_TAGS = {'script' : None, 'textarea' : None}
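(Illustrative sketch, not in the commit.) PRESERVE_WHITESPACE_TAGS feeds the endData() change earlier in the diff: whitespace-only strings are kept when they sit inside <pre> or <textarea>:

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup("<pre>  \n  </pre><p>  \n  </p>")
    print repr(soup.pre.string)   # expected: u'  \n  '  (preserved)
    print repr(soup.p.string)     # expected: u'\n'      (collapsed, as before)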
#According to the HTML standard, each of these inline tags can @@ -1494,9 +1548,9 @@ NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
# Used to detect the charset in a META tag; see start_meta - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)") + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
- def start_meta(self, attrs): + def extractCharsetFromMeta(self, attrs): """Beautiful Soup can detect a charset included in a META tag, try to convert the document to that charset, and re-parse the document from the beginning.""" @@ -1517,29 +1571,33 @@ if httpEquiv and contentType: # It's an interesting meta tag. match = self.CHARSET_RE.search(contentType) if match: - if getattr(self, 'declaredHTMLEncoding') or \ - (self.originalEncoding == self.fromEncoding): - # This is our second pass through the document, or - # else an encoding was specified explicitly and it - # worked. Rewrite the meta tag. - newAttr = self.CHARSET_RE.sub\ - (lambda(match):match.group(1) + - "%SOUP-ENCODING%", contentType) + if (self.declaredHTMLEncoding is not None or + self.originalEncoding == self.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + newAttr = self.CHARSET_RE.sub(rewrite, contentType) attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], newAttr) tagNeedsEncodingSubstitution = True else: # This is our first pass through the document. - # Go through it again with the new information. + # Go through it again with the encoding information. newCharset = match.group(3) if newCharset and newCharset != self.originalEncoding: self.declaredHTMLEncoding = newCharset self._feed(self.declaredHTMLEncoding) raise StopParsing + pass tag = self.unknown_starttag("meta", attrs) if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True
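(Illustrative sketch, not in the commit.) start_meta() is now extractCharsetFromMeta(), invoked by the builder; once an encoding has been sniffed, the charset slot of the meta tag is rewritten to %SOUP-ENCODING% and filled in with the output encoding when the tree is rendered:

    from BeautifulSoup import BeautifulSoup

    html = ('<meta http-equiv="Content-Type" '
            'content="text/html; charset=ISO-8859-1">caf\xe9')
    soup = BeautifulSoup(html)
    print soup.originalEncoding   # expected: iso-8859-1 (sniffed from the meta tag)
    print soup.meta               # expected: charset rewritten to utf-8 on output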
+ class StopParsing(Exception): pass
@@ -1687,9 +1745,10 @@ "x-sjis" : "shift-jis" }
def __init__(self, markup, overrideEncodings=[], - smartQuotesTo='xml'): + smartQuotesTo='xml', isHTML=False): + self.declaredHTMLEncoding = None self.markup, documentEncoding, sniffedEncoding = \ - self._detectEncoding(markup) + self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): @@ -1715,18 +1774,22 @@ for proposed_encoding in ("utf-8", "windows-1252"): u = self._convertFrom(proposed_encoding) if u: break + self.unicode = u if not u: self.originalEncoding = None
- def _subMSChar(self, orig): + def _subMSChar(self, match): """Changes a MS smart quote character to an XML or HTML entity.""" + orig = match.group(1) sub = self.MS_CHARS.get(orig) if type(sub) == types.TupleType: if self.smartQuotesTo == 'xml': - sub = '&#x%s;' % sub[1] + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() else: - sub = '&%s;' % sub[0] + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() return sub
def _convertFrom(self, proposed): @@ -1741,9 +1804,9 @@ if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): - markup = re.compile("([\x80-\x9f])").sub \ - (lambda(x): self._subMSChar(x.group(1)), - markup) + smart_quotes_re = "([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._subMSChar, markup)
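(Illustrative sketch, not in the commit.) _subMSChar() now receives the match object directly; the visible behaviour of the smart-quote substitution should be unchanged:

    from BeautifulSoup import UnicodeDammit

    dammit = UnicodeDammit('Hello \x93world\x94', ['windows-1252'])
    print dammit.unicode            # expected: Hello &#x201C;world&#x201D;
    print dammit.originalEncoding   # expected: windows-1252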
try: # print "Trying to convert document to %s" % proposed @@ -1782,7 +1845,7 @@ newdata = unicode(data, encoding) return newdata
- def _detectEncoding(self, xml_data): + def _detectEncoding(self, xml_data, isHTML=False): """Given a document, tries to detect its XML encoding.""" xml_encoding = sniffed_xml_encoding = None try: @@ -1830,13 +1893,19 @@ else: sniffed_xml_encoding = 'ascii' pass - xml_encoding_match = re.compile \ - ('^<?.*encoding=['"](.*?)['"].*?>')\ - .match(xml_data) except: xml_encoding_match = None - if xml_encoding_match: - xml_encoding = xml_encoding_match.groups()[0].lower() + xml_encoding_re = '^<?.*encoding=['"](.*?)['"].*?>'.encode() + xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) + if not xml_encoding_match and isHTML: + meta_re = '<\s*meta[^>]+charset=([^>]*?)[;'">]'.encode() + regexp = re.compile(meta_re, re.I) + xml_encoding_match = regexp.search(xml_data) + if xml_encoding_match is not None: + xml_encoding = xml_encoding_match.groups()[0].decode( + 'ascii').lower() + if isHTML: + self.declaredHTMLEncoding = xml_encoding if sniffed_xml_encoding and \ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', @@ -1927,5 +1996,5 @@ #By default, act as an HTML pretty-printer. if __name__ == '__main__': import sys - soup = BeautifulSoup(sys.stdin.read()) + soup = BeautifulSoup(sys.stdin) print soup.prettify()
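(Usage sketch, not part of the diff.) The __main__ hook now hands sys.stdin straight to the constructor, which works because the constructor calls .read() on any file-like object; the same pattern applies to ordinary files (page.html is a hypothetical name):

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(open('page.html'))   # file object, not a string
    print soup.prettify()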