Revision: 6858
Author: nicdumz
Date: 2009-05-08 15:23:29 +0000 (Fri, 08 May 2009)
Log Message:
-----------
Upgrade the bundled BeautifulSoup to 3.1.0.1, plus the corresponding wikipedia.py fix
Modified Paths:
--------------
trunk/pywikipedia/BeautifulSoup.py
trunk/pywikipedia/wikipedia.py
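
For callers of the bundled module, the headline change in 3.1.0.1 is that rendering moves from the old __str__(encoding) entry point to explicit decode()/encode() methods, with renderContents() kept as a pre-3.1 compatibility shim (see the BeautifulSoup.py diff below). A minimal sketch of the resulting call pattern; the sample markup is invented, and the import assumes the bundled module is importable as BeautifulSoup from trunk/pywikipedia:

    # Sketch only: exercises the renamed rendering API from the diff below.
    # The markup is invented; the method names (decode, encode,
    # renderContents, prettify) are taken from the 3.1.0.1 code in this commit.
    from BeautifulSoup import BeautifulSoup

    markup = '<html><body><p>Hello &amp; goodbye</p></body></html>'
    soup = BeautifulSoup(markup)

    as_unicode = soup.decode()       # Unicode rendering (replaces __str__(None))
    as_bytes = soup.encode('utf-8')  # byte string in an explicit encoding
    legacy = soup.renderContents()   # pre-3.1 name, kept as a compatibility shim
    print soup.prettify()
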
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py 2009-05-08 07:52:49 UTC (rev 6857)
+++ trunk/pywikipedia/BeautifulSoup.py 2009-05-08 15:23:29 UTC (rev 6858)
@@ -42,7 +42,7 @@
Here, have some legalese:
-Copyright (c) 2004-2007, Leonard Richardson
+Copyright (c) 2004-2009, Leonard Richardson
All rights reserved.
@@ -79,27 +79,38 @@
from __future__ import generators
__author__ = "Leonard Richardson (leonardr(a)segfault.org)"
-__version__ = "3.0.6"
-__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
+__version__ = "3.1.0.1"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
__license__ = "New-style BSD"
-from sgmllib import SGMLParser, SGMLParseError
import codecs
+import markupbase
import types
import re
-import sgmllib
+from HTMLParser import HTMLParser, HTMLParseError
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
+try:
+ set
+except NameError:
+ from sets import Set as set
-#This hack makes Beautiful Soup able to parse XML with namespaces
-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+#These hacks make Beautiful Soup able to parse XML with namespaces
+markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
# First, the classes that represent markup elements.
+def sob(unicode, encoding):
+ """Returns either the given Unicode string or its encoding."""
+ if encoding is None:
+ return unicode
+ else:
+ return unicode.encode(encoding)
+
class PageElement:
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -391,8 +402,20 @@
class NavigableString(unicode, PageElement):
+ def __new__(cls, value):
+ """Create a new NavigableString.
+
+ When unpickling a NavigableString, this method is called with
+ the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
+ passed in to the superclass's __new__ or the superclass won't know
+ how to handle non-ASCII characters.
+ """
+ if isinstance(value, unicode):
+ return unicode.__new__(cls, value)
+ return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+
def __getnewargs__(self):
- return (NavigableString.__str__(self),)
+ return (unicode(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -403,34 +426,32 @@
else:
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
- def __unicode__(self):
- return str(self).decode(DEFAULT_OUTPUT_ENCODING)
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.decode().encode(encoding)
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- if encoding:
- return self.encode(encoding)
- else:
- return self
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return self
class CData(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<![CDATA[' + self + u']]>'
class ProcessingInstruction(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+
+ def decodeGivenEventualEncoding(self, eventualEncoding):
output = self
- if "%SOUP-ENCODING%" in output:
- output = self.substituteEncoding(output, encoding)
- return "<?%s?>" % self.toEncoding(output, encoding)
+ if u'%SOUP-ENCODING%' in output:
+ output = self.substituteEncoding(output, eventualEncoding)
+ return u'<?' + output + u'?>'
class Comment(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<!--%s-->" % NavigableString.__str__(self, encoding)
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!--' + self + u'-->'
class Declaration(NavigableString):
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<!%s>" % NavigableString.__str__(self, encoding)
+ def decodeGivenEventualEncoding(self, eventualEncoding):
+ return u'<!' + self + u'>'
class Tag(PageElement):
@@ -496,11 +517,13 @@
self.convertXMLEntities = parser.convertXMLEntities
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
- # Convert any HTML, XML, or numeric entities in the attribute values.
- convert = lambda(k, val): (k,
- re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
- self._convertEntities,
- val))
+ def convert(kval):
+ "Converts HTML, XML and numeric entities in the attribute value."
+ k, val = kval
+ if val is None:
+ return kval
+ return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities, val))
self.attrs = map(convert, self.attrs)
def get(self, key, default=None):
@@ -591,11 +614,8 @@
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
"""Renders this tag as a string."""
- return self.__str__(encoding)
+ return self.decode(eventualEncoding=encoding)
- def __unicode__(self):
- return self.__str__(None)
-
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ ")")
@@ -605,24 +625,30 @@
appropriate XML entity for an XML special character."""
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
- def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
- prettyPrint=False, indentLevel=0):
- """Returns a string or Unicode representation of this tag and
- its contents. To get Unicode, pass None for encoding.
+ def __unicode__(self):
+ return self.decode()
- NOTE: since Python's HTML parser consumes whitespace, this
- method is not certain to reproduce the whitespace present in
- the original string."""
+ def __str__(self):
+ return self.encode()
- encodedName = self.toEncoding(self.name, encoding)
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+ def decode(self, prettyPrint=False, indentLevel=0,
+ eventualEncoding=DEFAULT_OUTPUT_ENCODING):
+ """Returns a string or Unicode representation of this tag and
+ its contents. To get Unicode, pass None for encoding."""
+
attrs = []
if self.attrs:
for key, val in self.attrs:
fmt = '%s="%s"'
if isString(val):
- if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
- val = self.substituteEncoding(val, encoding)
+ if (self.containsSubstitutions
+ and eventualEncoding is not None
+ and '%SOUP-ENCODING%' in val):
+ val = self.substituteEncoding(val, eventualEncoding)
# The attribute value either:
#
@@ -651,22 +677,26 @@
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
-
- attrs.append(fmt % (self.toEncoding(key, encoding),
- self.toEncoding(val, encoding)))
+ if val is None:
+ # Handle boolean attributes.
+ decoded = key
+ else:
+ decoded = fmt % (key, val)
+ attrs.append(decoded)
close = ''
closeTag = ''
if self.isSelfClosing:
close = ' /'
else:
- closeTag = '</%s>' % encodedName
+ closeTag = '</%s>' % self.name
indentTag, indentContents = 0, 0
if prettyPrint:
indentTag = indentLevel
space = (' ' * (indentTag-1))
indentContents = indentTag + 1
- contents = self.renderContents(encoding, prettyPrint, indentContents)
+ contents = self.decodeContents(prettyPrint, indentContents,
+ eventualEncoding)
if self.hidden:
s = contents
else:
@@ -676,7 +706,7 @@
attributeString = ' ' + ' '.join(attrs)
if prettyPrint:
s.append(space)
- s.append('<%s%s%s>' % (encodedName, attributeString, close))
+ s.append('<%s%s%s>' % (self.name, attributeString, close))
if prettyPrint:
s.append("\n")
s.append(contents)
@@ -701,19 +731,23 @@
self.extract()
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.__str__(encoding, True)
+ return self.encode(encoding, True)
- def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
+ return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
+
+ def decodeContents(self, prettyPrint=False, indentLevel=0,
+ eventualEncoding=DEFAULT_OUTPUT_ENCODING):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string."""
s=[]
for c in self:
text = None
if isinstance(c, NavigableString):
- text = c.__str__(encoding)
+ text = c.decodeGivenEventualEncoding(eventualEncoding)
elif isinstance(c, Tag):
- s.append(c.__str__(encoding, prettyPrint, indentLevel))
+ s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
if text and prettyPrint:
text = text.strip()
if text:
@@ -754,7 +788,7 @@
return self._findAll(name, attrs, text, limit, generator, **kwargs)
findChildren = findAll
- # Pre-3.x compatibility methods
+ # Pre-3.x compatibility methods. Will go away in 4.0.
first = find
fetch = findAll
@@ -764,6 +798,15 @@
def firstText(self, text=None, recursive=True):
return self.find(text=text, recursive=recursive)
+ # 3.x compatibility methods. Will go away in 4.0.
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ if encoding is None:
+ return self.decodeContents(prettyPrint, indentLevel, encoding)
+ else:
+ return self.encodeContents(encoding, prettyPrint, indentLevel)
+
+
#Private methods
def _getAttrMap(self):
@@ -776,26 +819,24 @@
return self.attrMap
#Generator methods
+ def recursiveChildGenerator(self):
+ if not len(self.contents):
+ raise StopIteration
+ stopNode = self._lastRecursiveChild().next
+ current = self.contents[0]
+ while current is not stopNode:
+ yield current
+ current = current.next
+
def childGenerator(self):
- for i in range(0, len(self.contents)):
- yield self.contents[i]
+ if not len(self.contents):
+ raise StopIteration
+ current = self.contents[0]
+ while current:
+ yield current
+ current = current.nextSibling
raise StopIteration
- def recursiveChildGenerator(self):
- stack = [(self, 0)]
- while stack:
- tag, start = stack.pop()
- if isinstance(tag, Tag):
- for i in range(start, len(tag.contents)):
- a = tag.contents[i]
- yield a
- if isinstance(a, Tag) and tag.contents:
- if i < len(tag.contents) - 1:
- stack.append((tag, i+1))
- stack.append((a, 0))
- break
- raise StopIteration
-
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
"""Encapsulates a number of ways of matching a markup element (tag or
@@ -896,13 +937,14 @@
#other ways of matching match the tag name as a string.
if isinstance(markup, Tag):
markup = markup.name
- if markup and not isString(markup):
+ if markup is not None and not isString(markup):
markup = unicode(markup)
#Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif isList(matchAgainst):
+ elif (isList(matchAgainst)
+ and (markup is not None or not isString(matchAgainst))):
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst)
@@ -928,8 +970,8 @@
def isList(l):
"""Convenience method that works with all 2.x versions of Python
to determine whether or not something is listlike."""
- return hasattr(l, '__iter__') \
- or (type(l) in (types.ListType, types.TupleType))
+ return ((hasattr(l, '__iter__') and not isString(l))
+ or (type(l) in (types.ListType, types.TupleType)))
def isString(s):
"""Convenience method that works with all 2.x versions of Python
@@ -949,7 +991,7 @@
#It's a map. Merge it.
for k,v in portion.items():
built[k] = v
- elif isList(portion):
+ elif isList(portion) and not isString(portion):
#It's a list. Map each item to the default.
for k in portion:
built[k] = default
@@ -960,8 +1002,123 @@
# Now, the parser classes.
-class BeautifulStoneSoup(Tag, SGMLParser):
+class HTMLParserBuilder(HTMLParser):
+ def __init__(self, soup):
+ HTMLParser.__init__(self)
+ self.soup = soup
+
+ # We inherit feed() and reset().
+
+ def handle_starttag(self, name, attrs):
+ if name == 'meta':
+ self.soup.extractCharsetFromMeta(attrs)
+ else:
+ self.soup.unknown_starttag(name, attrs)
+
+ def handle_endtag(self, name):
+ self.soup.unknown_endtag(name)
+
+ def handle_data(self, content):
+ self.soup.handle_data(content)
+
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.soup.endData()
+ self.handle_data(text)
+ self.soup.endData(subclass)
+
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.soup.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '&#%s;' % ref
+ self.handle_data(data)
+
+ def handle_entityref(self, ref):
+ """Handle entity references as data, possibly converting known
+ HTML and/or XML entity references to the corresponding Unicode
+ characters."""
+ data = None
+ if self.soup.convertHTMLEntities:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ pass
+
+ if not data and self.soup.convertXMLEntities:
+ data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.soup.convertHTMLEntities and \
+ not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&%s" % ref
+ if not data:
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
+ self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as a CData object."""
+ j = None
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
+ else:
+ try:
+ j = HTMLParser.parse_declaration(self, i)
+ except HTMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
+
+class BeautifulStoneSoup(Tag):
+
"""This class contains the basic parser and search code. It defines
a parser that knows nothing about tag behavior except for the
following:
@@ -982,6 +1139,7 @@
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}
+ PRESERVE_WHITESPACE_TAGS = []
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
lambda x: x.group(1) + ' />'),
@@ -1005,14 +1163,15 @@
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
- convertEntities=None, selfClosingTags=None):
+ convertEntities=None, selfClosingTags=None, isHTML=False,
+ builder=HTMLParserBuilder):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser.
- sgmllib will process most bad HTML, and the BeautifulSoup
+ HTMLParser will process most bad HTML, and the BeautifulSoup
class has some tricks for dealing with some HTML that kills
- sgmllib, but Beautiful Soup can nonetheless choke or lose data
+ HTMLParser, but Beautiful Soup can nonetheless choke or lose data
if your data uses self-closing tags or declarations
incorrectly.
@@ -1022,7 +1181,7 @@
you'll get better performance.
The default parser massage techniques fix the two most common
- instances of invalid HTML that choke sgmllib:
+ instances of invalid HTML that choke HTMLParser:
<br/> (No space between name of closing tag and tag close)
<! --Comment--> (Extraneous whitespace in declaration)
@@ -1060,29 +1219,21 @@
self.escapeUnrecognizedEntities = False
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
- SGMLParser.__init__(self)
+ self.builder = builder(self)
+ self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
self.markup = markup
self.markupMassage = markupMassage
try:
- self._feed()
+ self._feed(isHTML=isHTML)
except StopParsing:
pass
- self.markup = None # The markup can now be GCed
+ self.markup = None # The markup can now be GCed.
+ self.builder = None # So can the builder.
- def convert_charref(self, name):
- """This method fixes a bug in Python's SGMLParser."""
- try:
- n = int(name)
- except ValueError:
- return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
- return
- return self.convert_codepoint(n)
-
- def _feed(self, inDocumentEncoding=None):
+ def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
if isinstance(markup, unicode):
@@ -1091,9 +1242,10 @@
else:
dammit = UnicodeDammit\
(markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo)
+ smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
+ self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
if markup:
if self.markupMassage:
if not isList(self.markupMassage):
@@ -1106,27 +1258,14 @@
# was relying on the existence of markupMassage, this
# might cause problems.
del(self.markupMassage)
- self.reset()
+ self.builder.reset()
- SGMLParser.feed(self, markup)
+ self.builder.feed(markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
- def __getattr__(self, methodName):
- """This method routes method call requests to either the SGMLParser
- superclass or the Tag superclass, depending on the method name."""
- #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
-
- if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
- or methodName.find('do_') == 0:
- return SGMLParser.__getattr__(self, methodName)
- elif methodName.find('__') != 0:
- return Tag.__getattr__(self, methodName)
- else:
- raise AttributeError
-
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
@@ -1136,7 +1275,7 @@
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
self.hidden = 1
- SGMLParser.reset(self)
+ self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
@@ -1166,8 +1305,10 @@
def endData(self, containerClass=NavigableString):
if self.currentData:
- currentData = ''.join(self.currentData)
- if not currentData.translate(self.STRIP_ASCII_SPACES):
+ currentData = u''.join(self.currentData)
+ if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+ not set([tag.name for tag in self.tagStack]).intersection(
+ self.PRESERVE_WHITESPACE_TAGS)):
if '\n' in currentData:
currentData = '\n'
else:
@@ -1299,100 +1440,10 @@
def handle_data(self, data):
self.currentData.append(data)
- def _toStringSubclass(self, text, subclass):
- """Adds a certain piece of text to the tree as a NavigableString
- subclass."""
- self.endData()
- self.handle_data(text)
- self.endData(subclass)
+ def extractCharsetFromMeta(self, attrs):
+ self.unknown_starttag('meta', attrs)
- def handle_pi(self, text):
- """Handle a processing instruction as a ProcessingInstruction
- object, possibly one with a %SOUP-ENCODING% slot into which an
- encoding will be plugged later."""
- if text[:3] == "xml":
- text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
- self._toStringSubclass(text, ProcessingInstruction)
- def handle_comment(self, text):
- "Handle comments as Comment objects."
- self._toStringSubclass(text, Comment)
-
- def handle_charref(self, ref):
- "Handle character references as data."
- if self.convertEntities:
- data = unichr(int(ref))
- else:
- data = '&#%s;' % ref
- self.handle_data(data)
-
- def handle_entityref(self, ref):
- """Handle entity references as data, possibly converting known
- HTML and/or XML entity references to the corresponding Unicode
- characters."""
- data = None
- if self.convertHTMLEntities:
- try:
- data = unichr(name2codepoint[ref])
- except KeyError:
- pass
-
- if not data and self.convertXMLEntities:
- data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
-
- if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
- # TODO: We've got a problem here. We're told this is
- # an entity reference, but it's not an XML entity
- # reference or an HTML entity reference. Nonetheless,
- # the logical thing to do is to pass it through as an
- # unrecognized entity reference.
- #
- # Except: when the input is "&carol;" this function
- # will be called with input "carol". When the input is
- # "AT&T", this function will be called with input
- # "T". We have no way of knowing whether a semicolon
- # was present originally, so we don't know whether
- # this is an unknown entity or just a misplaced
- # ampersand.
- #
- # The more common case is a misplaced ampersand, so I
- # escape the ampersand and omit the trailing semicolon.
- data = "&%s" % ref
- if not data:
- # This case is different from the one above, because we
- # haven't already gone through a supposedly comprehensive
- # mapping of entities to Unicode characters. We might not
- # have gone through any mapping at all. So the chances are
- # very high that this is a real entity, and not a
- # misplaced ampersand.
- data = "&%s;" % ref
- self.handle_data(data)
-
- def handle_decl(self, data):
- "Handle DOCTYPEs and the like as Declaration objects."
- self._toStringSubclass(data, Declaration)
-
- def parse_declaration(self, i):
- """Treat a bogus SGML declaration as raw data. Treat a CDATA
- declaration as a CData object."""
- j = None
- if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
- else:
- try:
- j = SGMLParser.parse_declaration(self, i)
- except SGMLParseError:
- toHandle = self.rawdata[i:]
- self.handle_data(toHandle)
- j = i + len(toHandle)
- return j
-
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1444,12 +1495,15 @@
def __init__(self, *args, **kwargs):
if not kwargs.has_key('smartQuotesTo'):
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
+ kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
QUOTE_TAGS = {'script' : None, 'textarea' : None}
#According to the HTML standard, each of these inline tags can
@@ -1494,9 +1548,9 @@
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
# Used to detect the charset in a META tag; see start_meta
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
+ CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
- def start_meta(self, attrs):
+ def extractCharsetFromMeta(self, attrs):
"""Beautiful Soup can detect a charset included in a META tag,
try to convert the document to that charset, and re-parse the
document from the beginning."""
@@ -1517,29 +1571,33 @@
if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
- if getattr(self, 'declaredHTMLEncoding') or \
- (self.originalEncoding == self.fromEncoding):
- # This is our second pass through the document, or
- # else an encoding was specified explicitly and it
- # worked. Rewrite the meta tag.
- newAttr = self.CHARSET_RE.sub\
- (lambda(match):match.group(1) +
- "%SOUP-ENCODING%", contentType)
+ if (self.declaredHTMLEncoding is not None or
+ self.originalEncoding == self.fromEncoding):
+ # An HTML encoding was sniffed while converting
+ # the document to Unicode, or an HTML encoding was
+ # sniffed during a previous pass through the
+ # document, or an encoding was specified
+ # explicitly and it worked. Rewrite the meta tag.
+ def rewrite(match):
+ return match.group(1) + "%SOUP-ENCODING%"
+ newAttr = self.CHARSET_RE.sub(rewrite, contentType)
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
newAttr)
tagNeedsEncodingSubstitution = True
else:
# This is our first pass through the document.
- # Go through it again with the new information.
+ # Go through it again with the encoding information.
newCharset = match.group(3)
if newCharset and newCharset != self.originalEncoding:
self.declaredHTMLEncoding = newCharset
self._feed(self.declaredHTMLEncoding)
raise StopParsing
+ pass
tag = self.unknown_starttag("meta", attrs)
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
+
class StopParsing(Exception):
pass
@@ -1687,9 +1745,10 @@
"x-sjis" : "shift-jis" }
def __init__(self, markup, overrideEncodings=[],
- smartQuotesTo='xml'):
+ smartQuotesTo='xml', isHTML=False):
+ self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1715,18 +1774,22 @@
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convertFrom(proposed_encoding)
if u: break
+
self.unicode = u
if not u: self.originalEncoding = None
- def _subMSChar(self, orig):
+ def _subMSChar(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity."""
+ orig = match.group(1)
sub = self.MS_CHARS.get(orig)
if type(sub) == types.TupleType:
if self.smartQuotesTo == 'xml':
- sub = '&#x%s;' % sub[1]
+ sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
- sub = '&%s;' % sub[0]
+ sub = '&'.encode() + sub[0].encode() + ';'.encode()
+ else:
+ sub = sub.encode()
return sub
def _convertFrom(self, proposed):
@@ -1741,9 +1804,9 @@
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- markup = re.compile("([\x80-\x9f])").sub \
- (lambda(x): self._subMSChar(x.group(1)),
- markup)
+ smart_quotes_re = "([\x80-\x9f])"
+ smart_quotes_compiled = re.compile(smart_quotes_re)
+ markup = smart_quotes_compiled.sub(self._subMSChar, markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1782,7 +1845,7 @@
newdata = unicode(data, encoding)
return newdata
- def _detectEncoding(self, xml_data):
+ def _detectEncoding(self, xml_data, isHTML=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
@@ -1830,13 +1893,19 @@
else:
sniffed_xml_encoding = 'ascii'
pass
- xml_encoding_match = re.compile \
- ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
- .match(xml_data)
except:
xml_encoding_match = None
- if xml_encoding_match:
- xml_encoding = xml_encoding_match.groups()[0].lower()
+ xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
+ xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+ if not xml_encoding_match and isHTML:
+ meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
+ regexp = re.compile(meta_re, re.I)
+ xml_encoding_match = regexp.search(xml_data)
+ if xml_encoding_match is not None:
+ xml_encoding = xml_encoding_match.groups()[0].decode(
+ 'ascii').lower()
+ if isHTML:
+ self.declaredHTMLEncoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
@@ -1927,5 +1996,5 @@
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
- soup = BeautifulSoup(sys.stdin.read())
+ soup = BeautifulSoup(sys.stdin)
print soup.prettify()
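
One more behavioural detail from the hunks above: UnicodeDammit now accepts an isHTML flag, and a charset sniffed from a META tag is exposed on the soup as declaredHTMLEncoding. A minimal sketch, assuming an invented Latin-1 document; the attribute names come from the diff, and the values noted in the comments are only what the new code should typically report:

    # Sketch only: the isHTML / declaredHTMLEncoding plumbing from the hunks above.
    # The markup and its encoding are invented for illustration.
    from BeautifulSoup import BeautifulSoup

    markup = ('<html><head><meta http-equiv="Content-Type" '
              'content="text/html; charset=iso-8859-1"></head>'
              '<body>caf\xe9</body></html>')
    soup = BeautifulSoup(markup)      # BeautifulSoup passes isHTML=True itself

    print soup.originalEncoding       # typically 'iso-8859-1': what the bytes were decoded from
    print soup.declaredHTMLEncoding   # typically 'iso-8859-1': sniffed from the META tag
    print repr(soup.body.decode())    # the body rendered back as a Unicode string
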
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-05-08 07:52:49 UTC (rev 6857)
+++ trunk/pywikipedia/wikipedia.py 2009-05-08 15:23:29 UTC (rev 6858)
@@ -4995,7 +4995,7 @@
else:
tree = BeautifulStoneSoup(xml)
self._mediawiki_messages = _dict([(tag.get('name').lower(), html2unicode(tag.string))
- for tag in tree.findAll('message')])
+ for tag in tree.findAll('message') if tag.string])
if not self._mediawiki_messages:
# No messages could be added.
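
The guard added above skips message tags whose .string is None (an empty <message/> element, for instance), so only real message text reaches html2unicode. A self-contained sketch of the guarded comprehension; the XML sample is invented, and plain dict()/unicode() stand in for wikipedia.py's _dict and html2unicode helpers:

    # Sketch only: the guarded comprehension added to wikipedia.py above.
    # The XML sample is invented; dict() and unicode() stand in for the
    # real _dict() and html2unicode() helpers used in wikipedia.py.
    from BeautifulSoup import BeautifulStoneSoup

    xml = ('<messages>'
           '<message name="aboutsite">About {{SITENAME}}</message>'
           '<message name="emptymsg"></message>'
           '</messages>')
    tree = BeautifulStoneSoup(xml)
    messages = dict([(tag.get('name').lower(), unicode(tag.string))
                     for tag in tree.findAll('message') if tag.string])
    print messages    # the empty message is skipped; only 'aboutsite' remains
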