[Pywikipedia-l] SVN: [4779] trunk/pywikipedia/BeautifulSoup.py
rotem at svn.wikimedia.org
rotem at svn.wikimedia.org
Sat Dec 29 17:48:36 UTC 2007
Revision: 4779
Author: rotem
Date: 2007-12-29 17:48:36 +0000 (Sat, 29 Dec 2007)
Log Message:
-----------
Update Beautiful Soup to version 3.0.5, per http://lists.wikimedia.org/pipermail/pywikipedia-l/2007-December/001627.html .
Modified Paths:
--------------
trunk/pywikipedia/BeautifulSoup.py
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py 2007-12-29 15:50:13 UTC (rev 4778)
+++ trunk/pywikipedia/BeautifulSoup.py 2007-12-29 17:48:36 UTC (rev 4779)
@@ -11,7 +11,7 @@
structure. An ill-formed XML/HTML document yields a correspondingly
ill-formed data structure. If your document is only locally
well-formed, you can use this library to find and process the
-well-formed part of it. The BeautifulSoup class
+well-formed part of it.
Beautiful Soup works with Python 2.2 and up. It has no external
dependencies, but you'll have more success at converting data to UTF-8
@@ -24,7 +24,7 @@
http://cjkpython.i18n.org/
Beautiful Soup defines classes for two main parsing strategies:
-
+
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
language that kind of looks like XML.
@@ -40,13 +40,48 @@
documentation:
http://www.crummy.com/software/BeautifulSoup/documentation.html
+Here, have some legalese:
+
+Copyright (c) 2004-2007, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Neither the name of the the Beautiful Soup Consortium and All
+ Night Kosher Bakery nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
"""
from __future__ import generators
__author__ = "Leonard Richardson (leonardr at segfault.org)"
-__version__ = "3.0.4"
+__version__ = "3.0.5"
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
-__license__ = "PSF"
+__license__ = "New-style BSD"
from sgmllib import SGMLParser, SGMLParseError
import codecs
@@ -71,7 +106,7 @@
def setup(self, parent=None, previous=None):
"""Sets up the initial relations between this element and
- other elements."""
+ other elements."""
self.parent = parent
self.previous = previous
self.next = None
@@ -81,7 +116,7 @@
self.previousSibling = self.parent.contents[-1]
self.previousSibling.nextSibling = self
- def replaceWith(self, replaceWith):
+ def replaceWith(self, replaceWith):
oldParent = self.parent
myIndex = self.parent.contents.index(self)
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
@@ -92,11 +127,11 @@
# means that when we extract it, the index of this
# element will change.
myIndex = myIndex - 1
- self.extract()
+ self.extract()
oldParent.insert(myIndex, replaceWith)
-
+
def extract(self):
- """Destructively rips this element out of the tree."""
+ """Destructively rips this element out of the tree."""
if self.parent:
try:
self.parent.contents.remove(self)
@@ -105,7 +140,7 @@
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
- #the two.
+ #the two.
lastChild = self._lastRecursiveChild()
nextElement = lastChild.next
@@ -116,12 +151,12 @@
self.previous = None
lastChild.next = None
- self.parent = None
+ self.parent = None
if self.previousSibling:
self.previousSibling.nextSibling = self.nextSibling
if self.nextSibling:
self.nextSibling.previousSibling = self.previousSibling
- self.previousSibling = self.nextSibling = None
+ self.previousSibling = self.nextSibling = None
def _lastRecursiveChild(self):
"Finds the last element beneath this object to be parsed."
@@ -134,12 +169,12 @@
if (isinstance(newChild, basestring)
or isinstance(newChild, unicode)) \
and not isinstance(newChild, NavigableString):
- newChild = NavigableString(newChild)
+ newChild = NavigableString(newChild)
position = min(position, len(self.contents))
if hasattr(newChild, 'parent') and newChild.parent != None:
# We're 'inserting' an element that's already one
- # of this object's children.
+ # of this object's children.
if newChild.parent == self:
index = self.find(newChild)
if index and index < position:
@@ -149,7 +184,7 @@
# will jump down one.
position = position - 1
newChild.extract()
-
+
newChild.parent = self
previousChild = None
if position == 0:
@@ -161,13 +196,13 @@
newChild.previousSibling.nextSibling = newChild
newChild.previous = previousChild._lastRecursiveChild()
if newChild.previous:
- newChild.previous.next = newChild
+ newChild.previous.next = newChild
newChildsLastElement = newChild._lastRecursiveChild()
if position >= len(self.contents):
newChild.nextSibling = None
-
+
parent = self
parentsNextSibling = None
while not parentsNextSibling:
@@ -180,8 +215,8 @@
else:
newChildsLastElement.next = None
else:
- nextChild = self.contents[position]
- newChild.nextSibling = nextChild
+ nextChild = self.contents[position]
+ newChild.nextSibling = nextChild
if newChild.nextSibling:
newChild.nextSibling.previousSibling = newChild
newChildsLastElement.next = nextChild
@@ -190,6 +225,10 @@
newChildsLastElement.next.previous = newChildsLastElement
self.contents.insert(position, newChild)
+ def append(self, tag):
+ """Appends the given tag to the contents of this tag."""
+ self.insert(len(self.contents), tag)
+
def findNext(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears after this Tag in the document."""
@@ -269,7 +308,7 @@
if l:
r = l[0]
return r
-
+
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
@@ -294,7 +333,7 @@
return results
#These Generators can be used to navigate starting from both
- #NavigableStrings and Tags.
+ #NavigableStrings and Tags.
def nextGenerator(self):
i = self
while i:
@@ -328,7 +367,7 @@
# Utility methods
def substituteEncoding(self, str, encoding=None):
encoding = encoding or "utf-8"
- return str.replace("%SOUP-ENCODING%", encoding)
+ return str.replace("%SOUP-ENCODING%", encoding)
def toEncoding(self, s, encoding=None):
"""Encodes an object to a string in some encoding, or to Unicode.
@@ -350,6 +389,9 @@
class NavigableString(unicode, PageElement):
+ def __getnewargs__(self):
+ return (NavigableString.__str__(self),)
+
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
compatibility for Navigable*String, but for CData* it lets you
@@ -360,14 +402,14 @@
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
def __unicode__(self):
- return self.__str__(None)
+ return unicode(str(self))
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
if encoding:
return self.encode(encoding)
else:
return self
-
+
class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
@@ -382,22 +424,56 @@
class Comment(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<!--%s-->" % NavigableString.__str__(self, encoding)
+ return "<!--%s-->" % NavigableString.__str__(self, encoding)
class Declaration(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return "<!%s>" % NavigableString.__str__(self, encoding)
+ return "<!%s>" % NavigableString.__str__(self, encoding)
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
- XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
- '"' : "quote",
- "&" : "amp",
- "<" : "lt",
- ">" : "gt" }
+ def _invert(h):
+ "Cheap function to invert a hash."
+ i = {}
+ for k,v in h.items():
+ i[v] = k
+ return i
+ XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
+ "quot" : '"',
+ "amp" : "&",
+ "lt" : "<",
+ "gt" : ">" }
+
+ XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
+
+ def _convertEntities(self, match):
+ """Used in a call to re.sub to replace HTML, XML, and numeric
+ entities with the appropriate Unicode characters. If HTML
+ entities are being converted, any unrecognized entities are
+ escaped."""
+ x = match.group(1)
+ if self.convertHTMLEntities and x in name2codepoint:
+ return unichr(name2codepoint[x])
+ elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
+ if self.convertXMLEntities:
+ return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
+ else:
+ return u'&%s;' % x
+ elif len(x) > 0 and x[0] == '#':
+ # Handle numeric entities
+ if len(x) > 1 and x[1] == 'x':
+ return unichr(int(x[2:], 16))
+ else:
+ return unichr(int(x[1:]))
+
+ elif self.escapeUnrecognizedEntities:
+ return u'&amp;%s;' % x
+ else:
+ return u'&%s;' % x
+
def __init__(self, parser, name, attrs=None, parent=None,
previous=None):
"Basic constructor."
@@ -414,12 +490,22 @@
self.setup(parent, previous)
self.hidden = False
self.containsSubstitutions = False
+ self.convertHTMLEntities = parser.convertHTMLEntities
+ self.convertXMLEntities = parser.convertXMLEntities
+ self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
+ # Convert any HTML, XML, or numeric entities in the attribute values.
+ convert = lambda(k, val): (k,
+ re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities,
+ val))
+ self.attrs = map(convert, self.attrs)
+
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
attribute."""
- return self._getAttrMap().get(key, default)
+ return self._getAttrMap().get(key, default)
def has_key(self, key):
return self._getAttrMap().has_key(key)
@@ -444,7 +530,7 @@
"A tag is non-None even if it has no contents."
return True
- def __setitem__(self, key, value):
+ def __setitem__(self, key, value):
"""Setting tag[key] sets the value of the 'key' attribute for the
tag."""
self._getAttrMap()
@@ -481,6 +567,7 @@
return self.find(tag[:-3])
elif tag.find('__') != 0:
return self.find(tag)
+ raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
@@ -507,6 +594,15 @@
def __unicode__(self):
return self.__str__(None)
+ BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ + ")")
+
+ def _sub_entity(self, x):
+ """Used with a regular expression to substitute the
+ appropriate XML entity for an XML special character."""
+ return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
+
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
"""Returns a string or Unicode representation of this tag and
@@ -522,7 +618,7 @@
if self.attrs:
for key, val in self.attrs:
fmt = '%s="%s"'
- if isString(val):
+ if isString(val):
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
val = self.substituteEncoding(val, encoding)
@@ -543,19 +639,17 @@
# embedded single quotes to XML entities.
if '"' in val:
fmt = "%s='%s'"
- # This can't happen naturally, but it can happen
- # if you modify an attribute value after parsing.
if "'" in val:
+ # TODO: replace with apos when
+ # appropriate.
val = val.replace("'", "&squot;")
# Now we're okay w/r/t quotes. But the attribute
# value might also contain angle brackets, or
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
- val = re.sub("([<>]|&(?![^\s]+;))",
- lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
- val)
-
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+
attrs.append(fmt % (self.toEncoding(key, encoding),
self.toEncoding(val, encoding)))
close = ''
@@ -577,7 +671,7 @@
s = []
attributeString = ''
if attrs:
- attributeString = ' ' + ' '.join(attrs)
+ attributeString = ' ' + ' '.join(attrs)
if prettyPrint:
s.append(space)
s.append('<%s%s%s>' % (encodedName, attributeString, close))
@@ -609,14 +703,14 @@
elif isinstance(c, Tag):
s.append(c.__str__(encoding, prettyPrint, indentLevel))
if text and prettyPrint:
- text = text.strip()
+ text = text.strip()
if text:
if prettyPrint:
s.append(" " * (indentLevel-1))
s.append(text)
if prettyPrint:
s.append("\n")
- return ''.join(s)
+ return ''.join(s)
#Soup methods
@@ -651,19 +745,13 @@
# Pre-3.x compatibility methods
first = find
fetch = findAll
-
+
def fetchText(self, text=None, recursive=True, limit=None):
return self.findAll(text=text, recursive=recursive, limit=limit)
def firstText(self, text=None, recursive=True):
return self.find(text=text, recursive=recursive)
-
- #Utility methods
- def append(self, tag):
- """Appends the given tag to the contents of this tag."""
- self.contents.append(tag)
-
#Private methods
def _getAttrMap(self):
@@ -672,7 +760,7 @@
if not getattr(self, 'attrMap'):
self.attrMap = {}
for (key, value) in self.attrs:
- self.attrMap[key] = value
+ self.attrMap[key] = value
return self.attrMap
#Generator methods
@@ -680,12 +768,12 @@
for i in range(0, len(self.contents)):
yield self.contents[i]
raise StopIteration
-
+
def recursiveChildGenerator(self):
stack = [(self, 0)]
while stack:
tag, start = stack.pop()
- if isinstance(tag, Tag):
+ if isinstance(tag, Tag):
for i in range(start, len(tag.contents)):
a = tag.contents[i]
yield a
@@ -720,7 +808,7 @@
return self.text
else:
return "%s|%s" % (self.name, self.attrs)
-
+
def searchTag(self, markupName=None, markupAttrs={}):
found = None
markup = None
@@ -737,7 +825,7 @@
if callFunctionWithTagData:
match = self.name(markupName, markupAttrs)
else:
- match = True
+ match = True
markupAttrMap = None
for attr, matchAgainst in self.attrs.items():
if not markupAttrMap:
@@ -762,7 +850,7 @@
#print 'looking for %s in %s' % (self, markup)
found = None
# If given a list of items, scan it for a text element that
- # matches.
+ # matches.
if isList(markup) and not isinstance(markup, Tag):
for element in markup:
if isinstance(element, NavigableString) \
@@ -783,8 +871,8 @@
raise Exception, "I don't know how to match against a %s" \
% markup.__class__
return found
-
- def _matches(self, markup, matchAgainst):
+
+ def _matches(self, markup, matchAgainst):
#print "Matching %s against %s" % (markup, matchAgainst)
result = False
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
@@ -835,7 +923,7 @@
"""Convenience method that works with all 2.x versions of Python
to determine whether or not something is stringlike."""
try:
- return isinstance(s, unicode) or isintance(s, basestring)
+ return isinstance(s, unicode) or isinstance(s, basestring)
except NameError:
return isinstance(s, str)
@@ -865,7 +953,7 @@
"""This class contains the basic parser and search code. It defines
a parser that knows nothing about tag behavior except for the
following:
-
+
You can't close a tag without closing all the tags it encloses.
That is, "<foo><bar></foo>" actually means
"<foo><bar></bar></foo>".
@@ -878,10 +966,6 @@
or when BeautifulSoup makes an assumption counter to what you were
expecting."""
- XML_ENTITY_LIST = {}
- for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
- XML_ENTITY_LIST[i] = True
-
SELF_CLOSING_TAGS = {}
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
@@ -897,13 +981,22 @@
HTML_ENTITIES = "html"
XML_ENTITIES = "xml"
+ XHTML_ENTITIES = "xhtml"
+ # TODO: This only exists for backwards-compatibility
+ ALL_ENTITIES = XHTML_ENTITIES
+ # Used when determining whether a text node is all whitespace and
+ # can be replaced with a single space. A text node that contains
+ # fancy Unicode spaces (usually non-breaking) should be left
+ # alone.
+ STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
convertEntities=None, selfClosingTags=None):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
- is fed into the underlying parser.
+ is fed into the underlying parser.
sgmllib will process most bad HTML, and the BeautifulSoup
class has some tricks for dealing with some HTML that kills
@@ -930,14 +1023,33 @@
self.fromEncoding = fromEncoding
self.smartQuotesTo = smartQuotesTo
self.convertEntities = convertEntities
+ # Set the rules for how we'll deal with the entities we
+ # encounter
if self.convertEntities:
# It doesn't make sense to convert encoded characters to
# entities even while you're converting entities to Unicode.
# Just convert it all to Unicode.
self.smartQuotesTo = None
+ if convertEntities == self.HTML_ENTITIES:
+ self.convertXMLEntities = False
+ self.convertHTMLEntities = True
+ self.escapeUnrecognizedEntities = True
+ elif convertEntities == self.XHTML_ENTITIES:
+ self.convertXMLEntities = True
+ self.convertHTMLEntities = True
+ self.escapeUnrecognizedEntities = False
+ elif convertEntities == self.XML_ENTITIES:
+ self.convertXMLEntities = True
+ self.convertHTMLEntities = False
+ self.escapeUnrecognizedEntities = False
+ else:
+ self.convertXMLEntities = False
+ self.convertHTMLEntities = False
+ self.escapeUnrecognizedEntities = False
+
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
SGMLParser.__init__(self)
-
+
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
self.markup = markup
@@ -947,7 +1059,17 @@
except StopParsing:
pass
self.markup = None # The markup can now be GCed
-
+
+ def convert_charref(self, name):
+ """This method fixes a bug in Python's SGMLParser."""
+ try:
+ n = int(name)
+ except ValueError:
+ return
+ if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ return
+ return self.convert_codepoint(n)
+
def _feed(self, inDocumentEncoding=None):
# Convert the document to Unicode.
markup = self.markup
@@ -963,9 +1085,15 @@
if markup:
if self.markupMassage:
if not isList(self.markupMassage):
- self.markupMassage = self.MARKUP_MASSAGE
+ self.markupMassage = self.MARKUP_MASSAGE
for fix, m in self.markupMassage:
markup = fix.sub(m, markup)
+ # TODO: We get rid of markupMassage so that the
+ # soup object can be deepcopied later on. Some
+ # Python installations can't copy regexes. If anyone
+ # was relying on the existence of markupMassage, this
+ # might cause problems.
+ del(self.markupMassage)
self.reset()
SGMLParser.feed(self, markup)
@@ -992,7 +1120,7 @@
self-closing tag according to this parser."""
return self.SELF_CLOSING_TAGS.has_key(name) \
or self.instanceSelfClosingTags.has_key(name)
-
+
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
self.hidden = 1
@@ -1002,7 +1130,7 @@
self.tagStack = []
self.quoteStack = []
self.pushTag(self)
-
+
def popTag(self):
tag = self.tagStack.pop()
# Tags with just one string-owning child get the child as a
@@ -1020,14 +1148,14 @@
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
- self.currentTag.append(tag)
+ self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = ''.join(self.currentData)
- if not currentData.strip():
+ if not currentData.translate(self.STRIP_ASCII_SPACES):
if '\n' in currentData:
currentData = '\n'
else:
@@ -1052,7 +1180,7 @@
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
- return
+ return
numPops = 0
mostRecentTag = None
@@ -1065,7 +1193,7 @@
for i in range(0, numPops):
mostRecentTag = self.popTag()
- return mostRecentTag
+ return mostRecentTag
def _smartPop(self, name):
@@ -1076,10 +1204,9 @@
comes between this tag and the previous tag of this type.
Examples:
- <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
- <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
- <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
- <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
+ <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
+ <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
+ <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
@@ -1102,7 +1229,7 @@
and p.name in nestingResetTriggers) \
or (nestingResetTriggers == None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
-
+
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
#that causes nesting to reset, pop up to but not
@@ -1121,7 +1248,7 @@
#print "<%s> is not real!" % name
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
self.handle_data('<%s%s>' % (name, attrs))
- return
+ return
self.endData()
if not self.isSelfClosingTag(name) and not selfClosing:
@@ -1137,7 +1264,7 @@
self.previous = tag
self.pushTag(tag)
if selfClosing or self.isSelfClosingTag(name):
- self.popTag()
+ self.popTag()
if name in self.QUOTE_TAGS:
#print "Beginning quote (%s)" % name
self.quoteStack.append(name)
@@ -1172,7 +1299,7 @@
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later."""
if text[:3] == "xml":
- text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
@@ -1181,8 +1308,7 @@
def handle_charref(self, ref):
"Handle character references as data."
- if self.convertEntities in [self.HTML_ENTITIES,
- self.XML_ENTITIES]:
+ if self.convertEntities:
data = unichr(int(ref))
else:
data = '&#%s;' % ref
@@ -1190,20 +1316,47 @@
def handle_entityref(self, ref):
"""Handle entity references as data, possibly converting known
- HTML entity references to the corresponding Unicode
+ HTML and/or XML entity references to the corresponding Unicode
characters."""
data = None
- if self.convertEntities == self.HTML_ENTITIES or \
- (self.convertEntities == self.XML_ENTITIES and \
- self.XML_ENTITY_LIST.get(ref)):
+ if self.convertHTMLEntities:
try:
data = unichr(name2codepoint[ref])
except KeyError:
pass
+
+ if not data and self.convertXMLEntities:
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.convertHTMLEntities and \
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&amp;%s" % ref
if not data:
- data = '&%s;' % ref
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
self.handle_data(data)
-
+
def handle_decl(self, data):
"Handle DOCTYPEs and the like as Declaration objects."
self._toStringSubclass(data, Declaration)
@@ -1285,8 +1438,8 @@
['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
- QUOTE_TAGS = {'script': None}
-
+ QUOTE_TAGS = {'script' : None, 'textarea' : None}
+
#According to the HTML standard, each of these inline tags can
#contain another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
@@ -1298,7 +1451,7 @@
#to actually use these tags this way.
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
- #Lists can contain other lists, but there are restrictions.
+ #Lists can contain other lists, but there are restrictions.
NESTABLE_LIST_TAGS = { 'ol' : [],
'ul' : [],
'li' : ['ul', 'ol'],
@@ -1306,8 +1459,8 @@
'dd' : ['dl'],
'dt' : ['dl'] }
- #Tables can contain other tables, but there are restrictions.
- NESTABLE_TABLE_TAGS = {'table' : [],
+ #Tables can contain other tables, but there are restrictions.
+ NESTABLE_TABLE_TAGS = {'table' : [],
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
'td' : ['tr'],
'th' : ['tr'],
@@ -1377,7 +1530,7 @@
class StopParsing(Exception):
pass
-
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
@@ -1423,7 +1576,7 @@
This also makes it better for subclassing than BeautifulStoneSoup
or BeautifulSoup."""
-
+
RESET_NESTING_TAGS = buildTagMap('noscript')
NESTABLE_TAGS = {}
@@ -1453,7 +1606,7 @@
parent = self.tagStack[-2]
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
- isinstance(tag.contents[0], NavigableString) and
+ isinstance(tag.contents[0], NavigableString) and
not parent.attrMap.has_key(tag.name)):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
@@ -1463,7 +1616,7 @@
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
-#"RobustParser.py" (or, in cases of extreme enterprisness,
+#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
@@ -1530,9 +1683,9 @@
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
self.originalEncoding = None
- self.unicode = unicode(markup)
+ self.unicode = unicode(markup)
return
-
+
u = None
for proposedEncoding in overrideEncodings:
u = self._convertFrom(proposedEncoding)
@@ -1541,7 +1694,7 @@
for proposedEncoding in (documentEncoding, sniffedEncoding):
u = self._convertFrom(proposedEncoding)
if u: break
-
+
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
@@ -1563,9 +1716,9 @@
sub = '&#x%s;' % sub[1]
else:
sub = '&%s;' % sub[0]
- return sub
+ return sub
- def _convertFrom(self, proposed):
+ def _convertFrom(self, proposed):
proposed = self.find_codec(proposed)
if not proposed or proposed in self.triedEncodings:
return None
@@ -1584,12 +1737,12 @@
try:
# print "Trying to convert document to %s" % proposed
u = self._toUnicode(markup, proposed)
- self.markup = u
+ self.markup = u
self.originalEncoding = proposed
except Exception, e:
# print "That didn't work!"
# print e
- return None
+ return None
#print "Correct encoding: %s" % proposed
return self.markup
@@ -1617,7 +1770,7 @@
data = data[4:]
newdata = unicode(data, encoding)
return newdata
-
+
def _detectEncoding(self, xml_data):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
@@ -1689,12 +1842,12 @@
or charset
def _codec(self, charset):
- if not charset: return charset
+ if not charset: return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
- except LookupError:
+ except (LookupError, ValueError):
pass
return codec
More information about the Pywikipedia-l
mailing list