http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9172
Revision: 9172
Author: a_engels
Date: 2011-04-15 06:07:00 +0000 (Fri, 15 Apr 2011)
Log Message:
-----------
Deal with the case where self.originPage is None (I think this occurs when the original page is affected by bug #3081100, but I'm not sure about that).
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2011-04-13 19:57:44 UTC (rev 9171)
+++ trunk/pywikipedia/interwiki.py 2011-04-15 06:07:00 UTC (rev 9172)
@@ -1725,24 +1725,25 @@
raise "Bugcheck: finish called before done"
if not self.workonme:
return
- if self.forcedStop: # autonomous with problem
- pywikibot.output(u"======Aborted processing %s======" % self.originPage.aslink(True))
- return
if self.originPage:
if self.originPage.isRedirectPage():
return
if self.originPage.isCategoryRedirect():
return
+ else:
+ return
if not self.untranslated and globalvar.untranslatedonly:
return
+ if self.forcedStop: # autonomous with problem
+ pywikibot.output(u"======Aborted processing %s======" % self.originPage.aslink(True))
+ return
# The following check is not always correct and thus disabled.
# self.done might contain no interwiki links because of the -neverlink
# argument or because of disambiguation conflicts.
# if len(self.done) == 1:
# # No interwiki at all
# return
- if self.originPage:
- pywikibot.output(u"======Post-processing %s======" % self.originPage.aslink(True))
+ pywikibot.output(u"======Post-processing %s======" % self.originPage.aslink(True))
# Assemble list of accepted interwiki links
new = self.assemble()
if new is None: # User said give up
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9171
Revision: 9171
Author: shizhao
Date: 2011-04-13 19:57:44 +0000 (Wed, 13 Apr 2011)
Log Message:
-----------
Update BeautifulSoup to 3.2.0.
Modified Paths:
--------------
trunk/pywikipedia/BeautifulSoup.py
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py 2011-04-13 04:41:33 UTC (rev 9170)
+++ trunk/pywikipedia/BeautifulSoup.py 2011-04-13 19:57:44 UTC (rev 9171)
@@ -42,7 +42,7 @@
Here, have some legalese:
-Copyright (c) 2004-2009, Leonard Richardson
+Copyright (c) 2004-2010, Leonard Richardson
All rights reserved.
@@ -79,39 +79,38 @@
from __future__ import generators
__author__ = "Leonard Richardson (leonardr(a)segfault.org)"
-__version__ = "3.1.0.1"
-__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__version__ = "3.2.0"
+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
__license__ = "New-style BSD"
+from sgmllib import SGMLParser, SGMLParseError
import codecs
import markupbase
import types
import re
-from HTMLParser import HTMLParser, HTMLParseError
+import sgmllib
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
try:
set
except NameError:
from sets import Set as set
#These hacks make Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
DEFAULT_OUTPUT_ENCODING = "utf-8"
+def _match_css_class(str):
+ """Build a RE to match the given CSS class."""
+ return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
# First, the classes that represent markup elements.
-def sob(unicode, encoding):
- """Returns either the given Unicode string or its encoding."""
- if encoding is None:
- return unicode
- else:
- return unicode.encode(encoding)
-
-class PageElement:
+class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -129,10 +128,11 @@
def replaceWith(self, replaceWith):
oldParent = self.parent
- myIndex = self.parent.contents.index(self)
- if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
+ myIndex = self.parent.index(self)
+ if hasattr(replaceWith, "parent")\
+ and replaceWith.parent is self.parent:
# We're replacing this element with one of its siblings.
- index = self.parent.contents.index(replaceWith)
+ index = replaceWith.parent.index(replaceWith)
if index and index < myIndex:
# Furthermore, it comes before this element. That
# means that when we extract it, the index of this
@@ -141,11 +141,20 @@
self.extract()
oldParent.insert(myIndex, replaceWith)
+ def replaceWithChildren(self):
+ myParent = self.parent
+ myIndex = self.parent.index(self)
+ self.extract()
+ reversedChildren = list(self.contents)
+ reversedChildren.reverse()
+ for child in reversedChildren:
+ myParent.insert(myIndex, child)
+
def extract(self):
"""Destructively rips this element out of the tree."""
if self.parent:
try:
- self.parent.contents.remove(self)
+ del self.parent.contents[self.parent.index(self)]
except ValueError:
pass
@@ -178,18 +187,17 @@
return lastChild
def insert(self, position, newChild):
- if (isinstance(newChild, basestring)
- or isinstance(newChild, unicode)) \
+ if isinstance(newChild, basestring) \
and not isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
position = min(position, len(self.contents))
- if hasattr(newChild, 'parent') and newChild.parent != None:
+ if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
- if newChild.parent == self:
- index = self.find(newChild)
- if index and index < position:
+ if newChild.parent is self:
+ index = self.index(newChild)
+ if index > position:
# Furthermore we're moving it further down the
# list of this object's children. That means that
# when we extract this element, our target index
@@ -327,8 +335,21 @@
if isinstance(name, SoupStrainer):
strainer = name
+ # (Possibly) special case some findAll*(...) searches
+ elif text is None and not limit and not attrs and not kwargs:
+ # findAll*(True)
+ if name is True:
+ return [element for element in generator()
+ if isinstance(element, Tag)]
+ # findAll*('tag-name')
+ elif isinstance(name, basestring):
+ return [element for element in generator()
+ if isinstance(element, Tag) and
+ element.name == name]
+ else:
+ strainer = SoupStrainer(name, attrs, text, **kwargs)
+ # Build a SoupStrainer
else:
- # Build a SoupStrainer
strainer = SoupStrainer(name, attrs, text, **kwargs)
results = ResultSet(strainer)
g = generator()
@@ -349,31 +370,31 @@
#NavigableStrings and Tags.
def nextGenerator(self):
i = self
- while i:
+ while i is not None:
i = i.next
yield i
def nextSiblingGenerator(self):
i = self
- while i:
+ while i is not None:
i = i.nextSibling
yield i
def previousGenerator(self):
i = self
- while i:
+ while i is not None:
i = i.previous
yield i
def previousSiblingGenerator(self):
i = self
- while i:
+ while i is not None:
i = i.previousSibling
yield i
def parentGenerator(self):
i = self
- while i:
+ while i is not None:
i = i.parent
yield i
@@ -415,7 +436,7 @@
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __getnewargs__(self):
- return (unicode(self),)
+ return (NavigableString.__str__(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -426,32 +447,34 @@
else:
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
- def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.decode().encode(encoding)
+ def __unicode__(self):
+ return str(self).decode(DEFAULT_OUTPUT_ENCODING)
- def decodeGivenEventualEncoding(self, eventualEncoding):
- return self
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ if encoding:
+ return self.encode(encoding)
+ else:
+ return self
class CData(NavigableString):
- def decodeGivenEventualEncoding(self, eventualEncoding):
- return u'<![CDATA[' + self + u']]>'
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
class ProcessingInstruction(NavigableString):
-
- def decodeGivenEventualEncoding(self, eventualEncoding):
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
output = self
- if u'%SOUP-ENCODING%' in output:
- output = self.substituteEncoding(output, eventualEncoding)
- return u'<?' + output + u'?>'
+ if "%SOUP-ENCODING%" in output:
+ output = self.substituteEncoding(output, encoding)
+ return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
- def decodeGivenEventualEncoding(self, eventualEncoding):
- return u'<!--' + self + u'-->'
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return "<!--%s-->" % NavigableString.__str__(self, encoding)
class Declaration(NavigableString):
- def decodeGivenEventualEncoding(self, eventualEncoding):
- return u'<!' + self + u'>'
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return "<!%s>" % NavigableString.__str__(self, encoding)
class Tag(PageElement):
@@ -506,8 +529,10 @@
self.parserClass = parser.__class__
self.isSelfClosing = parser.isSelfClosingTag(name)
self.name = name
- if attrs == None:
+ if attrs is None:
attrs = []
+ elif isinstance(attrs, dict):
+ attrs = attrs.items()
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
@@ -517,21 +542,56 @@
self.convertXMLEntities = parser.convertXMLEntities
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
- def convert(kval):
- "Converts HTML, XML and numeric entities in the attribute value."
- k, val = kval
- if val is None:
- return kval
- return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
- self._convertEntities, val))
+ # Convert any HTML, XML, or numeric entities in the attribute values.
+ convert = lambda(k, val): (k,
+ re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+ self._convertEntities,
+ val))
self.attrs = map(convert, self.attrs)
+ def getString(self):
+ if (len(self.contents) == 1
+ and isinstance(self.contents[0], NavigableString)):
+ return self.contents[0]
+
+ def setString(self, string):
+ """Replace the contents of the tag with a string"""
+ self.clear()
+ self.append(string)
+
+ string = property(getString, setString)
+
+ def getText(self, separator=u""):
+ if not len(self.contents):
+ return u""
+ stopNode = self._lastRecursiveChild().next
+ strings = []
+ current = self.contents[0]
+ while current is not stopNode:
+ if isinstance(current, NavigableString):
+ strings.append(current.strip())
+ current = current.next
+ return separator.join(strings)
+
+ text = property(getText)
+
def get(self, key, default=None):
"""Returns the value of the 'key' attribute for the tag, or
the value given for 'default' if it doesn't have that
attribute."""
return self._getAttrMap().get(key, default)
+ def clear(self):
+ """Extract all children."""
+ for child in self.contents[:]:
+ child.extract()
+
+ def index(self, element):
+ for i, child in enumerate(self.contents):
+ if child is element:
+ return i
+ raise ValueError("Tag.index: element not in tag")
+
def has_key(self, key):
return self._getAttrMap().has_key(key)
@@ -600,6 +660,8 @@
NOTE: right now this will return false if two tags have the
same attributes in a different order. Should this be fixed?"""
+ if other is self:
+ return True
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
return False
for i in range(0, len(self.contents)):
@@ -614,8 +676,11 @@
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
"""Renders this tag as a string."""
- return self.decode(eventualEncoding=encoding)
+ return self.__str__(encoding)
+ def __unicode__(self):
+ return self.__str__(None)
+
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ ")")
@@ -625,30 +690,24 @@
appropriate XML entity for an XML special character."""
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
- def __unicode__(self):
- return self.decode()
+ def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ prettyPrint=False, indentLevel=0):
+ """Returns a string or Unicode representation of this tag and
+ its contents. To get Unicode, pass None for encoding.
- def __str__(self):
- return self.encode()
+ NOTE: since Python's HTML parser consumes whitespace, this
+ method is not certain to reproduce the whitespace present in
+ the original string."""
- def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
- prettyPrint=False, indentLevel=0):
- return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+ encodedName = self.toEncoding(self.name, encoding)
- def decode(self, prettyPrint=False, indentLevel=0,
- eventualEncoding=DEFAULT_OUTPUT_ENCODING):
- """Returns a string or Unicode representation of this tag and
- its contents. To get Unicode, pass None for encoding."""
-
attrs = []
if self.attrs:
for key, val in self.attrs:
fmt = '%s="%s"'
- if isString(val):
- if (self.containsSubstitutions
- and eventualEncoding is not None
- and '%SOUP-ENCODING%' in val):
- val = self.substituteEncoding(val, eventualEncoding)
+ if isinstance(val, basestring):
+ if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+ val = self.substituteEncoding(val, encoding)
# The attribute value either:
#
@@ -677,26 +736,22 @@
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
- if val is None:
- # Handle boolean attributes.
- decoded = key
- else:
- decoded = fmt % (key, val)
- attrs.append(decoded)
+
+ attrs.append(fmt % (self.toEncoding(key, encoding),
+ self.toEncoding(val, encoding)))
close = ''
closeTag = ''
if self.isSelfClosing:
close = ' /'
else:
- closeTag = '</%s>' % self.name
+ closeTag = '</%s>' % encodedName
indentTag, indentContents = 0, 0
if prettyPrint:
indentTag = indentLevel
space = (' ' * (indentTag-1))
indentContents = indentTag + 1
- contents = self.decodeContents(prettyPrint, indentContents,
- eventualEncoding)
+ contents = self.renderContents(encoding, prettyPrint, indentContents)
if self.hidden:
s = contents
else:
@@ -706,7 +761,7 @@
attributeString = ' ' + ' '.join(attrs)
if prettyPrint:
s.append(space)
- s.append('<%s%s%s>' % (self.name, attributeString, close))
+ s.append('<%s%s%s>' % (encodedName, attributeString, close))
if prettyPrint:
s.append("\n")
s.append(contents)
@@ -722,32 +777,35 @@
def decompose(self):
"""Recursively destroys the contents of this tree."""
- contents = [i for i in self.contents]
- for i in contents:
- if isinstance(i, Tag):
- i.decompose()
- else:
- i.extract()
self.extract()
+ if len(self.contents) == 0:
+ return
+ current = self.contents[0]
+ while current is not None:
+ next = current.next
+ if isinstance(current, Tag):
+ del current.contents[:]
+ current.parent = None
+ current.previous = None
+ current.previousSibling = None
+ current.next = None
+ current.nextSibling = None
+ current = next
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.encode(encoding, True)
+ return self.__str__(encoding, True)
- def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
- return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
-
- def decodeContents(self, prettyPrint=False, indentLevel=0,
- eventualEncoding=DEFAULT_OUTPUT_ENCODING):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string.."""
s=[]
for c in self:
text = None
if isinstance(c, NavigableString):
- text = c.decodeGivenEventualEncoding(eventualEncoding)
+ text = c.__str__(encoding)
elif isinstance(c, Tag):
- s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
+ s.append(c.__str__(encoding, prettyPrint, indentLevel))
if text and prettyPrint:
text = text.strip()
if text:
@@ -788,7 +846,7 @@
return self._findAll(name, attrs, text, limit, generator, **kwargs)
findChildren = findAll
- # Pre-3.x compatibility methods. Will go away in 4.0.
+ # Pre-3.x compatibility methods
first = find
fetch = findAll
@@ -798,15 +856,6 @@
def firstText(self, text=None, recursive=True):
return self.find(text=text, recursive=recursive)
- # 3.x compatibility methods. Will go away in 4.0.
- def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
- prettyPrint=False, indentLevel=0):
- if encoding is None:
- return self.decodeContents(prettyPrint, indentLevel, encoding)
- else:
- return self.encodeContents(encoding, prettyPrint, indentLevel)
-
-
#Private methods
def _getAttrMap(self):
@@ -819,6 +868,10 @@
return self.attrMap
#Generator methods
+ def childGenerator(self):
+ # Just use the iterator from the contents
+ return iter(self.contents)
+
def recursiveChildGenerator(self):
if not len(self.contents):
raise StopIteration
@@ -828,14 +881,6 @@
yield current
current = current.next
- def childGenerator(self):
- if not len(self.contents):
- raise StopIteration
- current = self.contents[0]
- while current:
- yield current
- current = current.nextSibling
- raise StopIteration
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
@@ -844,8 +889,8 @@
def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = name
- if isString(attrs):
- kwargs['class'] = attrs
+ if isinstance(attrs, basestring):
+ kwargs['class'] = _match_css_class(attrs)
attrs = None
if kwargs:
if attrs:
@@ -904,7 +949,8 @@
found = None
# If given a list of items, scan it for a text element that
# matches.
- if isList(markup) and not isinstance(markup, Tag):
+ if hasattr(markup, "__iter__") \
+ and not isinstance(markup, Tag):
for element in markup:
if isinstance(element, NavigableString) \
and self.search(element):
@@ -917,7 +963,7 @@
found = self.searchTag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
- isString(markup):
+ isinstance(markup, basestring):
if self._matches(markup, self.text):
found = markup
else:
@@ -928,8 +974,8 @@
def _matches(self, markup, matchAgainst):
#print "Matching %s against %s" % (markup, matchAgainst)
result = False
- if matchAgainst == True and type(matchAgainst) == types.BooleanType:
- result = markup != None
+ if matchAgainst is True:
+ result = markup is not None
elif callable(matchAgainst):
result = matchAgainst(markup)
else:
@@ -937,18 +983,17 @@
#other ways of matching match the tag name as a string.
if isinstance(markup, Tag):
markup = markup.name
- if markup is not None and not isString(markup):
+ if markup and not isinstance(markup, basestring):
markup = unicode(markup)
#Now we know that chunk is either a string, or None.
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif (isList(matchAgainst)
- and (markup is not None or not isString(matchAgainst))):
+ elif hasattr(matchAgainst, '__iter__'): # list-like
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
result = markup.has_key(matchAgainst)
- elif matchAgainst and isString(markup):
+ elif matchAgainst and isinstance(markup, basestring):
if isinstance(markup, unicode):
matchAgainst = unicode(matchAgainst)
else:
@@ -967,20 +1012,6 @@
# Now, some helper functions.
-def isList(l):
- """Convenience method that works with all 2.x versions of Python
- to determine whether or not something is listlike."""
- return ((hasattr(l, '__iter__') and not isString(l))
- or (type(l) in (types.ListType, types.TupleType)))
-
-def isString(s):
- """Convenience method that works with all 2.x versions of Python
- to determine whether or not something is stringlike."""
- try:
- return isinstance(s, unicode) or isinstance(s, basestring)
- except NameError:
- return isinstance(s, str)
-
def buildTagMap(default, *args):
"""Turns a list of maps, lists, or scalars into a single map.
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
@@ -991,7 +1022,7 @@
#It's a map. Merge it.
for k,v in portion.items():
built[k] = v
- elif isList(portion) and not isString(portion):
+ elif hasattr(portion, '__iter__'): # is a list
#It's a list. Map each item to the default.
for k in portion:
built[k] = default
@@ -1002,123 +1033,8 @@
# Now, the parser classes.
-class HTMLParserBuilder(HTMLParser):
+class BeautifulStoneSoup(Tag, SGMLParser):
- def __init__(self, soup):
- HTMLParser.__init__(self)
- self.soup = soup
-
- # We inherit feed() and reset().
-
- def handle_starttag(self, name, attrs):
- if name == 'meta':
- self.soup.extractCharsetFromMeta(attrs)
- else:
- self.soup.unknown_starttag(name, attrs)
-
- def handle_endtag(self, name):
- self.soup.unknown_endtag(name)
-
- def handle_data(self, content):
- self.soup.handle_data(content)
-
- def _toStringSubclass(self, text, subclass):
- """Adds a certain piece of text to the tree as a NavigableString
- subclass."""
- self.soup.endData()
- self.handle_data(text)
- self.soup.endData(subclass)
-
- def handle_pi(self, text):
- """Handle a processing instruction as a ProcessingInstruction
- object, possibly one with a %SOUP-ENCODING% slot into which an
- encoding will be plugged later."""
- if text[:3] == "xml":
- text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
- self._toStringSubclass(text, ProcessingInstruction)
-
- def handle_comment(self, text):
- "Handle comments as Comment objects."
- self._toStringSubclass(text, Comment)
-
- def handle_charref(self, ref):
- "Handle character references as data."
- if self.soup.convertEntities:
- data = unichr(int(ref))
- else:
- data = '&#%s;' % ref
- self.handle_data(data)
-
- def handle_entityref(self, ref):
- """Handle entity references as data, possibly converting known
- HTML and/or XML entity references to the corresponding Unicode
- characters."""
- data = None
- if self.soup.convertHTMLEntities:
- try:
- data = unichr(name2codepoint[ref])
- except KeyError:
- pass
-
- if not data and self.soup.convertXMLEntities:
- data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
-
- if not data and self.soup.convertHTMLEntities and \
- not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
- # TODO: We've got a problem here. We're told this is
- # an entity reference, but it's not an XML entity
- # reference or an HTML entity reference. Nonetheless,
- # the logical thing to do is to pass it through as an
- # unrecognized entity reference.
- #
- # Except: when the input is "&carol;" this function
- # will be called with input "carol". When the input is
- # "AT&T", this function will be called with input
- # "T". We have no way of knowing whether a semicolon
- # was present originally, so we don't know whether
- # this is an unknown entity or just a misplaced
- # ampersand.
- #
- # The more common case is a misplaced ampersand, so I
- # escape the ampersand and omit the trailing semicolon.
- data = "&%s" % ref
- if not data:
- # This case is different from the one above, because we
- # haven't already gone through a supposedly comprehensive
- # mapping of entities to Unicode characters. We might not
- # have gone through any mapping at all. So the chances are
- # very high that this is a real entity, and not a
- # misplaced ampersand.
- data = "&%s;" % ref
- self.handle_data(data)
-
- def handle_decl(self, data):
- "Handle DOCTYPEs and the like as Declaration objects."
- self._toStringSubclass(data, Declaration)
-
- def parse_declaration(self, i):
- """Treat a bogus SGML declaration as raw data. Treat a CDATA
- declaration as a CData object."""
- j = None
- if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
- else:
- try:
- j = HTMLParser.parse_declaration(self, i)
- except HTMLParseError:
- toHandle = self.rawdata[i:]
- self.handle_data(toHandle)
- j = i + len(toHandle)
- return j
-
-
-class BeautifulStoneSoup(Tag):
-
"""This class contains the basic parser and search code. It defines
a parser that knows nothing about tag behavior except for the
following:
@@ -1163,15 +1079,14 @@
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
- convertEntities=None, selfClosingTags=None, isHTML=False,
- builder=HTMLParserBuilder):
+ convertEntities=None, selfClosingTags=None, isHTML=False):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser.
- HTMLParser will process most bad HTML, and the BeautifulSoup
+ sgmllib will process most bad HTML, and the BeautifulSoup
class has some tricks for dealing with some HTML that kills
- HTMLParser, but Beautiful Soup can nonetheless choke or lose data
+ sgmllib, but Beautiful Soup can nonetheless choke or lose data
if your data uses self-closing tags or declarations
incorrectly.
@@ -1181,7 +1096,7 @@
you'll get better performance.
The default parser massage techniques fix the two most common
- instances of invalid HTML that choke HTMLParser:
+ instances of invalid HTML that choke sgmllib:
<br/> (No space between name of closing tag and tag close)
<! --Comment--> (Extraneous whitespace in declaration)
@@ -1219,8 +1134,7 @@
self.escapeUnrecognizedEntities = False
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
- self.builder = builder(self)
- self.reset()
+ SGMLParser.__init__(self)
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
@@ -1230,9 +1144,18 @@
self._feed(isHTML=isHTML)
except StopParsing:
pass
- self.markup = None # The markup can now be GCed.
- self.builder = None # So can the builder.
+ self.markup = None # The markup can now be GCed
+ def convert_charref(self, name):
+ """This method fixes a bug in Python's SGMLParser."""
+ try:
+ n = int(name)
+ except ValueError:
+ return
+ if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ return
+ return self.convert_codepoint(n)
+
def _feed(self, inDocumentEncoding=None, isHTML=False):
# Convert the document to Unicode.
markup = self.markup
@@ -1248,7 +1171,7 @@
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
if markup:
if self.markupMassage:
- if not isList(self.markupMassage):
+ if not hasattr(self.markupMassage, "__iter__"):
self.markupMassage = self.MARKUP_MASSAGE
for fix, m in self.markupMassage:
markup = fix.sub(m, markup)
@@ -1258,14 +1181,27 @@
# was relying on the existence of markupMassage, this
# might cause problems.
del(self.markupMassage)
- self.builder.reset()
+ self.reset()
- self.builder.feed(markup)
+ SGMLParser.feed(self, markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
+ def __getattr__(self, methodName):
+ """This method routes method call requests to either the SGMLParser
+ superclass or the Tag superclass, depending on the method name."""
+ #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+
+ if methodName.startswith('start_') or methodName.startswith('end_') \
+ or methodName.startswith('do_'):
+ return SGMLParser.__getattr__(self, methodName)
+ elif not methodName.startswith('__'):
+ return Tag.__getattr__(self, methodName)
+ else:
+ raise AttributeError
+
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
@@ -1275,7 +1211,7 @@
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
self.hidden = 1
- self.builder.reset()
+ SGMLParser.reset(self)
self.currentData = []
self.currentTag = None
self.tagStack = []
@@ -1284,12 +1220,6 @@
def popTag(self):
tag = self.tagStack.pop()
- # Tags with just one string-owning child get the child as a
- # 'string' property, so that soup.tag.string is shorthand for
- # soup.tag.contents[0]
- if len(self.currentTag.contents) == 1 and \
- isinstance(self.currentTag.contents[0], NavigableString):
- self.currentTag.string = self.currentTag.contents[0]
#print "Pop", tag.name
if self.tagStack:
@@ -1378,9 +1308,9 @@
#last occurance.
popTo = name
break
- if (nestingResetTriggers != None
+ if (nestingResetTriggers is not None
and p.name in nestingResetTriggers) \
- or (nestingResetTriggers == None and isResetNesting
+ or (nestingResetTriggers is None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
#If we encounter one of the nesting reset triggers
@@ -1399,7 +1329,7 @@
if self.quoteStack:
#This is not a real tag.
#print "<%s> is not real!" % name
- attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
+ attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
self.handle_data('<%s%s>' % (name, attrs))
return
self.endData()
@@ -1440,10 +1370,100 @@
def handle_data(self, data):
self.currentData.append(data)
- def extractCharsetFromMeta(self, attrs):
- self.unknown_starttag('meta', attrs)
+ def _toStringSubclass(self, text, subclass):
+ """Adds a certain piece of text to the tree as a NavigableString
+ subclass."""
+ self.endData()
+ self.handle_data(text)
+ self.endData(subclass)
+ def handle_pi(self, text):
+ """Handle a processing instruction as a ProcessingInstruction
+ object, possibly one with a %SOUP-ENCODING% slot into which an
+ encoding will be plugged later."""
+ if text[:3] == "xml":
+ text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+ self._toStringSubclass(text, ProcessingInstruction)
+ def handle_comment(self, text):
+ "Handle comments as Comment objects."
+ self._toStringSubclass(text, Comment)
+
+ def handle_charref(self, ref):
+ "Handle character references as data."
+ if self.convertEntities:
+ data = unichr(int(ref))
+ else:
+ data = '&#%s;' % ref
+ self.handle_data(data)
+
+ def handle_entityref(self, ref):
+ """Handle entity references as data, possibly converting known
+ HTML and/or XML entity references to the corresponding Unicode
+ characters."""
+ data = None
+ if self.convertHTMLEntities:
+ try:
+ data = unichr(name2codepoint[ref])
+ except KeyError:
+ pass
+
+ if not data and self.convertXMLEntities:
+ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+ if not data and self.convertHTMLEntities and \
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ # TODO: We've got a problem here. We're told this is
+ # an entity reference, but it's not an XML entity
+ # reference or an HTML entity reference. Nonetheless,
+ # the logical thing to do is to pass it through as an
+ # unrecognized entity reference.
+ #
+ # Except: when the input is "&carol;" this function
+ # will be called with input "carol". When the input is
+ # "AT&T", this function will be called with input
+ # "T". We have no way of knowing whether a semicolon
+ # was present originally, so we don't know whether
+ # this is an unknown entity or just a misplaced
+ # ampersand.
+ #
+ # The more common case is a misplaced ampersand, so I
+ # escape the ampersand and omit the trailing semicolon.
+ data = "&%s" % ref
+ if not data:
+ # This case is different from the one above, because we
+ # haven't already gone through a supposedly comprehensive
+ # mapping of entities to Unicode characters. We might not
+ # have gone through any mapping at all. So the chances are
+ # very high that this is a real entity, and not a
+ # misplaced ampersand.
+ data = "&%s;" % ref
+ self.handle_data(data)
+
+ def handle_decl(self, data):
+ "Handle DOCTYPEs and the like as Declaration objects."
+ self._toStringSubclass(data, Declaration)
+
+ def parse_declaration(self, i):
+ """Treat a bogus SGML declaration as raw data. Treat a CDATA
+ declaration as a CData object."""
+ j = None
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k+3
+ self._toStringSubclass(data, CData)
+ else:
+ try:
+ j = SGMLParser.parse_declaration(self, i)
+ except SGMLParseError:
+ toHandle = self.rawdata[i:]
+ self.handle_data(toHandle)
+ j = i + len(toHandle)
+ return j
+
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1499,8 +1519,8 @@
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
- ['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ ('br' , 'hr', 'input', 'img', 'meta',
+ 'spacer', 'link', 'frame', 'base', 'col'))
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
@@ -1509,13 +1529,13 @@
#According to the HTML standard, each of these inline tags can
#contain another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
- NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
- 'center']
+ NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+ 'center')
#According to the HTML standard, these block tags can contain
#another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
- NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+ NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
#Lists can contain other lists, but there are restrictions.
NESTABLE_LIST_TAGS = { 'ol' : [],
@@ -1535,7 +1555,7 @@
'tfoot' : ['table'],
}
- NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
+ NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
#If one of these tags is encountered, all tags up to the next tag of
#this type are popped.
@@ -1550,7 +1570,7 @@
# Used to detect the charset in a META tag; see start_meta
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
- def extractCharsetFromMeta(self, attrs):
+ def start_meta(self, attrs):
"""Beautiful Soup can detect a charset included in a META tag,
try to convert the document to that charset, and re-parse the
document from the beginning."""
@@ -1597,7 +1617,6 @@
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
-
class StopParsing(Exception):
pass
@@ -1627,11 +1646,11 @@
wouldn't be."""
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
- ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
+ ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
- 'big']
+ 'big')
- I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
+ I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
@@ -1778,18 +1797,15 @@
self.unicode = u
if not u: self.originalEncoding = None
- def _subMSChar(self, match):
+ def _subMSChar(self, orig):
"""Changes a MS smart quote character to an XML or HTML
entity."""
- orig = match.group(1)
sub = self.MS_CHARS.get(orig)
- if type(sub) == types.TupleType:
+ if isinstance(sub, tuple):
if self.smartQuotesTo == 'xml':
- sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+ sub = '&#x%s;' % sub[1]
else:
- sub = '&'.encode() + sub[0].encode() + ';'.encode()
- else:
- sub = sub.encode()
+ sub = '&%s;' % sub[0]
return sub
def _convertFrom(self, proposed):
@@ -1804,9 +1820,9 @@
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- smart_quotes_re = "([\x80-\x9f])"
- smart_quotes_compiled = re.compile(smart_quotes_re)
- markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+ markup = re.compile("([\x80-\x9f])").sub \
+ (lambda(x): self._subMSChar(x.group(1)),
+ markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1895,15 +1911,13 @@
pass
except:
xml_encoding_match = None
- xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
- xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
+ xml_encoding_match = re.compile(
+ '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
if not xml_encoding_match and isHTML:
- meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
- regexp = re.compile(meta_re, re.I)
+ regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
xml_encoding_match = regexp.search(xml_data)
if xml_encoding_match is not None:
- xml_encoding = xml_encoding_match.groups()[0].decode(
- 'ascii').lower()
+ xml_encoding = xml_encoding_match.groups()[0].lower()
if isHTML:
self.declaredHTMLEncoding = xml_encoding
if sniffed_xml_encoding and \
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9170
Revision: 9170
Author: xqt
Date: 2011-04-13 04:41:33 +0000 (Wed, 13 Apr 2011)
Log Message:
-----------
Do not putSpacesInLists inside template namespaces (update from trunk r9169)
Modified Paths:
--------------
branches/rewrite/scripts/cosmetic_changes.py
Modified: branches/rewrite/scripts/cosmetic_changes.py
===================================================================
--- branches/rewrite/scripts/cosmetic_changes.py 2011-04-13 04:38:23 UTC (rev 9169)
+++ branches/rewrite/scripts/cosmetic_changes.py 2011-04-13 04:41:33 UTC (rev 9170)
@@ -369,13 +369,17 @@
For better readability of bullet list and enumeration wiki source code,
puts a space between the * or # and the text.
- NOTE: This space is recommended in the syntax help on the English, German,
- and French Wikipedia. It might be that it is not wanted on other wikis.
- If there are any complaints, please file a bug report.
+ NOTE: This space is recommended in the syntax help on the English,
+ German, and French Wikipedia. It might be that it is not wanted on other
+ wikis. If there are any complaints, please file a bug report.
"""
exceptions = ['comment', 'math', 'nowiki', 'pre', 'source', 'timeline']
- if not self.redirect and pywikibot.calledModuleName() <> 'capitalize_redirects':
- text = pywikibot.replaceExcept(text, r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', '\g<bullet> \g<char>', exceptions)
+ if not (self.redirect or self.template) and \
+ pywikibot.calledModuleName() != 'capitalize_redirects':
+ text = pywikibot.replaceExcept(
+ text,
+ r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', '\g<bullet> \g<char>',
+ exceptions)
return text
def replaceDeprecatedTemplates(self, text):
http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9169
Revision: 9169
Author: xqt
Date: 2011-04-13 04:38:23 +0000 (Wed, 13 Apr 2011)
Log Message:
-----------
Do not putSpacesInLists inside template namespaces like http://hsb.wikipedia.org/w/index.php?title=P%C5%99ed%C5%82oha%3AInfoka%C5%A…
Modified Paths:
--------------
trunk/pywikipedia/cosmetic_changes.py
Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================
--- trunk/pywikipedia/cosmetic_changes.py 2011-04-12 23:08:51 UTC (rev 9168)
+++ trunk/pywikipedia/cosmetic_changes.py 2011-04-13 04:38:23 UTC (rev 9169)
@@ -637,13 +637,17 @@
For better readability of bullet list and enumeration wiki source code,
puts a space between the * or # and the text.
- NOTE: This space is recommended in the syntax help on the English, German,
- and French Wikipedia. It might be that it is not wanted on other wikis.
- If there are any complaints, please file a bug report.
+ NOTE: This space is recommended in the syntax help on the English,
+ German, and French Wikipedia. It might be that it is not wanted on other
+ wikis. If there are any complaints, please file a bug report.
"""
exceptions = ['comment', 'math', 'nowiki', 'pre', 'source', 'timeline']
- if not self.redirect and pywikibot.calledModuleName() <> 'capitalize_redirects':
- text = pywikibot.replaceExcept(text, r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', '\g<bullet> \g<char>', exceptions)
+ if not (self.redirect or self.template) and \
+ pywikibot.calledModuleName() != 'capitalize_redirects':
+ text = pywikibot.replaceExcept(
+ text,
+ r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', '\g<bullet> \g<char>',
+ exceptions)
return text
def replaceDeprecatedTemplates(self, text):