http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11156
Revision: 11156
Author: xqt
Date: 2013-03-02 12:57:54 +0000 (Sat, 02 Mar 2013)
Log Message:
-----------
some PEP8 changes
Modified Paths:
--------------
    trunk/pywikipedia/BeautifulSoup.py
Modified: trunk/pywikipedia/BeautifulSoup.py =================================================================== --- trunk/pywikipedia/BeautifulSoup.py 2013-03-02 10:39:02 UTC (rev 11155) +++ trunk/pywikipedia/BeautifulSoup.py 2013-03-02 12:57:54 UTC (rev 11156) @@ -89,9 +89,9 @@ import re import sgmllib try: - from htmlentitydefs import name2codepoint + from htmlentitydefs import name2codepoint except ImportError: - name2codepoint = {} + name2codepoint = {} try: set except NameError: @@ -103,12 +103,13 @@
DEFAULT_OUTPUT_ENCODING = "utf-8"
+ def _match_css_class(str): """Build a RE to match the given CSS class.""" return re.compile(r"(^|.*\s)%s($|\s)" % str)
+ # First, the classes that represent markup elements. - class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -128,8 +129,8 @@ def replaceWith(self, replaceWith): oldParent = self.parent myIndex = self.parent.index(self) - if hasattr(replaceWith, "parent")\ - and replaceWith.parent is self.parent: + if hasattr(replaceWith, "parent") and \ + replaceWith.parent is self.parent: # We're replacing this element with one of its siblings. index = replaceWith.parent.index(replaceWith) if index and index < myIndex: @@ -186,11 +187,11 @@ return lastChild
def insert(self, position, newChild): - if isinstance(newChild, basestring) \ - and not isinstance(newChild, NavigableString): + if isinstance(newChild, basestring) and not \ + isinstance(newChild, NavigableString): newChild = NavigableString(newChild)
- position = min(position, len(self.contents)) + position = min(position, len(self.contents)) if hasattr(newChild, 'parent') and newChild.parent is not None: # We're 'inserting' an element that's already one # of this object's children. @@ -227,7 +228,7 @@ while not parentsNextSibling: parentsNextSibling = parent.nextSibling parent = parent.parent - if not parent: # This is the last element in the document. + if not parent: # This is the last element in the document. break if parentsNextSibling: newChildsLastElement.next = parentsNextSibling @@ -272,8 +273,9 @@ criteria and appear after this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.nextSiblingGenerator, **kwargs) - fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x + def findPrevious(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and appears before this Tag in the document.""" @@ -284,8 +286,8 @@ """Returns all items that match the given criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousGenerator, - **kwargs) - fetchPrevious = findAllPrevious # Compatibility with pre-3.x + **kwargs) + fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): """Returns the closest sibling to this Tag that matches the @@ -299,7 +301,7 @@ criteria and appear before this Tag in the document.""" return self._findAll(name, attrs, text, limit, self.previousSiblingGenerator, **kwargs) - fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x + fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs): """Returns the closest parent of this Tag that matches the given @@ -318,8 +320,9 @@
return self._findAll(name, attrs, None, limit, self.parentGenerator, **kwargs) - fetchParents = findParents # Compatibility with pre-3.x
+ fetchParents = findParents # Compatibility with pre-3.x + #These methods do the real heavy lifting.
def _findOne(self, method, name, attrs, text, **kwargs): @@ -415,11 +418,12 @@ s = unicode(s) else: if encoding: - s = self.toEncoding(str(s), encoding) + s = self.toEncoding(str(s), encoding) else: s = unicode(s) return s
+ class NavigableString(unicode, PageElement):
def __new__(cls, value): @@ -444,7 +448,8 @@ if attr == 'string': return self else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) + raise AttributeError("'%s' object has no attribute '%s'" + % (self.__class__.__name__, attr))
def __unicode__(self): return str(self).decode(DEFAULT_OUTPUT_ENCODING) @@ -455,11 +460,13 @@ else: return self
+ class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+ class ProcessingInstruction(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): output = self @@ -467,14 +474,17 @@ output = self.substituteEncoding(output, encoding) return "<?%s?>" % self.toEncoding(output, encoding)
+ class Comment(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<!--%s-->" % NavigableString.__str__(self, encoding)
+ class Declaration(NavigableString): def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): return "<!%s>" % NavigableString.__str__(self, encoding)
+ class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents.""" @@ -482,15 +492,15 @@ def _invert(h): "Cheap function to invert a hash." i = {} - for k,v in h.items(): + for k, v in h.items(): i[v] = k return i
- XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", - "quot" : '"', - "amp" : "&", - "lt" : "<", - "gt" : ">" } + XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'", + "quot": '"', + "amp": "&", + "lt": "<", + "gt": ">"}
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
@@ -549,8 +559,8 @@ self.attrs = map(convert, self.attrs)
def getString(self): - if (len(self.contents) == 1 - and isinstance(self.contents[0], NavigableString)): + if (len(self.contents) == 1 and isinstance(self.contents[0], + NavigableString)): return self.contents[0]
def setString(self, string): @@ -592,7 +602,7 @@ raise ValueError("Tag.index: element not in tag")
def has_key(self, key): - return self._getAttrMap().has_key(key) + return key in self._getAttrMap()
def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, @@ -636,7 +646,7 @@ #We don't break because bad HTML can define the same #attribute multiple times. self._getAttrMap() - if self.attrMap.has_key(key): + if key in self.attrMap: del self.attrMap[key]
def __call__(self, *args, **kwargs): @@ -651,7 +661,8 @@ return self.find(tag[:-3]) elif tag.find('__') != 0: return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) + raise AttributeError("'%s' object has no attribute '%s'" + % (self.__class__, tag))
def __eq__(self, other): """Returns true iff this tag has the same name, the same attributes, @@ -661,7 +672,9 @@ same attributes in a different order. Should this be fixed?""" if other is self: return True - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): + if not hasattr(other, 'name') or not hasattr(other, 'attrs') or \ + not hasattr(other, 'contents') or self.name != other.name or \ + self.attrs != other.attrs or len(self) != len(other): return False for i in range(0, len(self.contents)): if self.contents[i] != other.contents[i]: @@ -734,7 +747,8 @@ # value might also contain angle brackets, or # ampersands that aren't part of entities. We need # to escape those to XML entities too. - val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) + val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, + val)
attrs.append(fmt % (self.toEncoding(key, encoding), self.toEncoding(val, encoding))) @@ -798,7 +812,7 @@ prettyPrint=False, indentLevel=0): """Renders the contents of this tag as a string in the given encoding. If encoding is None, returns a Unicode string..""" - s=[] + s = [] for c in self: text = None if isinstance(c, NavigableString): @@ -912,13 +926,13 @@ if isinstance(markupName, Tag): markup = markupName markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) + callFunctionWithTagData = callable(self.name) and \ + not isinstance(markupName, Tag)
if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): + or callFunctionWithTagData \ + or (markup and self._matches(markup, self.name)) \ + or (not markup and self._matches(markupName, self.name)): if callFunctionWithTagData: match = self.name(markupName, markupAttrs) else: @@ -926,11 +940,11 @@ markupAttrMap = None for attr, matchAgainst in self.attrs.items(): if not markupAttrMap: - if hasattr(markupAttrs, 'get'): + if hasattr(markupAttrs, 'get'): markupAttrMap = markupAttrs - else: + else: markupAttrMap = {} - for k,v in markupAttrs: + for k, v in markupAttrs: markupAttrMap[k] = v attrValue = markupAttrMap.get(attr) if not self._matches(attrValue, matchAgainst): @@ -948,11 +962,10 @@ found = None # If given a list of items, scan it for a text element that # matches. - if hasattr(markup, "__iter__") \ - and not isinstance(markup, Tag): + if hasattr(markup, "__iter__") and not isinstance(markup, Tag): for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): + if isinstance(element, NavigableString) and \ + self.search(element): found = element break # If it's a Tag, make sure its name or attributes match. @@ -961,13 +974,13 @@ if not self.text: found = self.searchTag(markup) # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + elif isinstance(markup, NavigableString) or isinstance(markup, + basestring): if self._matches(markup, self.text): found = markup else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ + raise Exception("I don't know how to match against a %s" + % markup.__class__) return found
def _matches(self, markup, matchAgainst): @@ -988,10 +1001,10 @@ if hasattr(matchAgainst, 'match'): # It's a regexp object. result = markup and matchAgainst.search(markup) - elif hasattr(matchAgainst, '__iter__'): # list-like + elif hasattr(matchAgainst, '__iter__'): # list-like result = markup in matchAgainst elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) + result = matchAgainst in markup elif matchAgainst and isinstance(markup, basestring): if isinstance(markup, unicode): matchAgainst = unicode(matchAgainst) @@ -1002,6 +1015,7 @@ result = matchAgainst == markup return result
+ class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" @@ -1009,6 +1023,7 @@ list.__init__([]) self.source = source
+ # Now, some helper functions.
def buildTagMap(default, *args): @@ -1019,9 +1034,9 @@ for portion in args: if hasattr(portion, 'items'): #It's a map. Merge it. - for k,v in portion.items(): + for k, v in portion.items(): built[k] = v - elif hasattr(portion, '__iter__'): # is a list + elif hasattr(portion, '__iter__'): # is a list #It's a list. Map each item to the default. for k in portion: built[k] = default @@ -1030,6 +1045,7 @@ built[portion] = default return built
+ # Now, the parser classes.
class BeautifulStoneSoup(Tag, SGMLParser): @@ -1074,7 +1090,7 @@ # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left # alone. - STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, markupMassage=True, smartQuotesTo=XML_ENTITIES, @@ -1151,7 +1167,7 @@ n = int(name) except ValueError: return - if not 0 <= n <= 127 : # ASCII ends at 127, not 255 + if not 0 <= n <= 127: # ASCII ends at 127, not 255 return return self.convert_codepoint(n)
@@ -1162,9 +1178,10 @@ if not hasattr(self, 'originalEncoding'): self.originalEncoding = None else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) + dammit = UnicodeDammit(markup, + [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.smartQuotesTo, + isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding @@ -1194,7 +1211,7 @@ #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \ - or methodName.startswith('do_'): + or methodName.startswith('do_'): return SGMLParser.__getattr__(self, methodName) elif not methodName.startswith('__'): return Tag.__getattr__(self, methodName) @@ -1204,8 +1221,8 @@ def isSelfClosingTag(self, name): """Returns true iff the given string is the name of a self-closing tag according to this parser.""" - return self.SELF_CLOSING_TAGS.has_key(name) \ - or self.instanceSelfClosingTags.has_key(name) + return name in self.SELF_CLOSING_TAGS or \ + name in self.instanceSelfClosingTags
def reset(self): Tag.__init__(self, self, self.ROOT_TAG_NAME) @@ -1244,8 +1261,8 @@ currentData = ' ' self.currentData = [] if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): + (not self.parseOnlyThese.text or not + self.parseOnlyThese.search(currentData)): return o = containerClass(currentData) o.setup(self.currentTag, self.previous) @@ -1254,7 +1271,6 @@ self.previous = o self.currentTag.contents.append(o)
- def _popToTag(self, name, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag @@ -1296,8 +1312,8 @@ """
nestingResetTriggers = self.NESTABLE_TAGS.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.RESET_NESTING_TAGS.has_key(name) + isNestable = nestingResetTriggers is not None + isResetNesting = name in self.RESET_NESTING_TAGS popTo = None inclusive = True for i in range(len(self.tagStack)-1, 0, -1): @@ -1310,7 +1326,7 @@ if (nestingResetTriggers is not None and p.name in nestingResetTriggers) \ or (nestingResetTriggers is None and isResetNesting - and self.RESET_NESTING_TAGS.has_key(p.name)): + and p.name in self.RESET_NESTING_TAGS):
#If we encounter one of the nesting reset triggers #peculiar to this tag, or we encounter another tag @@ -1337,7 +1353,8 @@ self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \ - and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): + and (self.parseOnlyThese.text or + not self.parseOnlyThese.searchTag(name, attrs)): return
tag = Tag(self, name, attrs, self.currentTag, self.previous) @@ -1411,7 +1428,7 @@ data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): # TODO: We've got a problem here. We're told this is # an entity reference, but it's not an XML entity # reference or an HTML entity reference. Nonetheless, @@ -1448,12 +1465,12 @@ declaration as a CData object.""" j = None if self.rawdata[i:i+9] == '<![CDATA[': - k = self.rawdata.find(']]>', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k + 3 + self._toStringSubclass(data, CData) else: try: j = SGMLParser.parse_declaration(self, i) @@ -1463,6 +1480,7 @@ j = i + len(toHandle) return j
+ class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML: @@ -1512,18 +1530,18 @@ BeautifulStoneSoup before writing your own subclass."""
def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): + if not 'smartQuotesTo' in kwargs: kwargs['smartQuotesTo'] = self.HTML_ENTITIES kwargs['isHTML'] = True BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None, - ('br' , 'hr', 'input', 'img', 'meta', + ('br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base', 'col'))
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
- QUOTE_TAGS = {'script' : None, 'textarea' : None} + QUOTE_TAGS = {'script': None, 'textarea': None}
#According to the HTML standard, each of these inline tags can #contain another tag of the same type. Furthermore, it's common @@ -1537,21 +1555,21 @@ NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
#Lists can contain other lists, but there are restrictions. - NESTABLE_LIST_TAGS = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } + NESTABLE_LIST_TAGS = {'ol': [], + 'ul': [], + 'li': ['ul', 'ol'], + 'dl': [], + 'dd': ['dl'], + 'dt': ['dl']}
#Tables can contain other tables, but there are restrictions. - NESTABLE_TABLE_TAGS = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - 'thead' : ['table'], - 'tbody' : ['table'], - 'tfoot' : ['table'], + NESTABLE_TABLE_TAGS = {'table': [], + 'tr': ['table', 'tbody', 'tfoot', 'thead'], + 'td': ['tr'], + 'th': ['tr'], + 'thead': ['table'], + 'tbody': ['table'], + 'tfoot': ['table'], }
NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') @@ -1587,11 +1605,11 @@ contentType = value contentTypeIndex = i
- if httpEquiv and contentType: # It's an interesting meta tag. + if httpEquiv and contentType: # It's an interesting meta tag. match = self.CHARSET_RE.search(contentType) if match: if (self.declaredHTMLEncoding is not None or - self.originalEncoding == self.fromEncoding): + self.originalEncoding == self.fromEncoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the @@ -1616,9 +1634,11 @@ if tag and tagNeedsEncodingSubstitution: tag.containsSubstitutions = True
+ class StopParsing(Exception): pass
+ class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over @@ -1644,10 +1664,10 @@ it's valid HTML and BeautifulSoup screwed up by assuming it wouldn't be."""
- I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ - ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big') + I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = ( + 'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite', + 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big' + )
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
@@ -1655,6 +1675,7 @@ I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+ class MinimalSoup(BeautifulSoup): """The MinimalSoup class is for parsing HTML that contains pathologically bad markup. It makes no assumptions about tag @@ -1668,6 +1689,7 @@ RESET_NESTING_TAGS = buildTagMap('noscript') NESTABLE_TAGS = {}
+ class BeautifulSOAP(BeautifulStoneSoup): """This class will push a tag with only a single string child into the tag's parent as an attribute. The attribute's name is the tag @@ -1695,10 +1717,11 @@ parent._getAttrMap() if (isinstance(tag, Tag) and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString) and - not parent.attrMap.has_key(tag.name)): + not tag.name in parent.attrMap): parent[tag.name] = tag.contents[0] BeautifulStoneSoup.popTag(self)
+ #Enterprise class names! It has come to our attention that some people #think the names of the Beautiful Soup parser classes are too silly #and "unprofessional" for use in enterprise screen-scraping. We feel @@ -1749,6 +1772,7 @@ except ImportError: pass
+ class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is @@ -1759,14 +1783,14 @@ # meta tags to the corresponding Python codec names. It only covers # values that aren't in Python's aliases and can't be determined # by the heuristics in find_codec. - CHARSET_ALIASES = { "macintosh" : "mac-roman", - "x-sjis" : "shift-jis" } + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"}
def __init__(self, markup, overrideEncodings=[], smartQuotesTo='xml', isHTML=False): self.declaredHTMLEncoding = None self.markup, documentEncoding, sniffedEncoding = \ - self._detectEncoding(markup, isHTML) + self._detectEncoding(markup, isHTML) self.smartQuotesTo = smartQuotesTo self.triedEncodings = [] if markup == '' or isinstance(markup, unicode): @@ -1819,9 +1843,8 @@ if self.smartQuotesTo and proposed.lower() in("windows-1252", "iso-8859-1", "iso-8859-2"): - markup = re.compile("([\x80-\x9f])").sub \ - (lambda(x): self._subMSChar(x.group(1)), - markup) + markup = re.compile("([\x80-\x9f])").sub( + lambda(x): self._subMSChar(x.group(1)), markup)
try: # print "Trying to convert document to %s" % proposed @@ -1841,11 +1864,11 @@
# strip Byte Order Mark (if present) if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): + and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): + elif (len(data) >= 4) and \ + (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'): encoding = 'utf-16le' data = data[2:] elif data[:3] == '\xef\xbb\xbf': @@ -1871,8 +1894,8 @@ # UTF-16BE sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ - and (xml_data[2:4] != '\x00\x00'): + elif (len(xml_data) >= 4) and \ + (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): # UTF-16BE with BOM sniffed_xml_encoding = 'utf-16be' xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') @@ -1881,7 +1904,7 @@ sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ - (xml_data[2:4] != '\x00\x00'): + (xml_data[2:4] != '\x00\x00'): # UTF-16LE with BOM sniffed_xml_encoding = 'utf-16le' xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') @@ -1927,7 +1950,6 @@ xml_encoding = sniffed_xml_encoding return xml_data, xml_encoding, sniffed_xml_encoding
- def find_codec(self, charset): return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ or (charset and self._codec(charset.replace("-", ""))) \ @@ -1945,63 +1967,70 @@ return codec
EBCDIC_TO_ASCII_MAP = None + def _ebcdic_to_ascii(self, s): c = self.__class__ if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) + emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, + 29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, + 138, 139, 140, 5, 6, 7, 144, 145, 22, 147, 148, 149, 150, 4, + 152, 153, 154, 155, 20, 21, 158, 26, 32, 160, 161, 162, 163, + 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, 38, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, + 94, 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, + 37, 95, 62, 63, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 96, 58, 35, 64, 39, 61, 34, 195, 97, 98, 99, 100, 101, 102, + 103, 104, 105, 196, 197, 198, 199, 200, 201, 202, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 203, 204, 205, 206, 207, + 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210, + 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, + 223, 224, 225, 226, 227, 228, 229, 
230, 231, 123, 65, 66, + 67, 68, 69, 70, 71, 72, 73, 232, 233, 234, 235, 236, 237, + 125, 74, 75, 76, 77, 78, 79, 80, 81, 82, 238, 239, 240, 241, + 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, 90, 244, 245, + 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + 250, 251, 252, 253, 254, 255) import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + c.EBCDIC_TO_ASCII_MAP = string.maketrans(''.join(map(chr, + range(256))), + ''.join(map(chr, emap))) return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'), - '\x81' : ' ', - '\x82' : ('sbquo', '201A'), - '\x83' : ('fnof', '192'), - '\x84' : ('bdquo', '201E'), - '\x85' : ('hellip', '2026'), - '\x86' : ('dagger', '2020'), - '\x87' : ('Dagger', '2021'), - '\x88' : ('circ', '2C6'), - '\x89' : ('permil', '2030'), - '\x8A' : ('Scaron', '160'), - '\x8B' : ('lsaquo', '2039'), - '\x8C' : ('OElig', '152'), - '\x8D' : '?', - '\x8E' : ('#x17D', '17D'), - '\x8F' : '?', - '\x90' : '?', - '\x91' : ('lsquo', '2018'), - '\x92' : ('rsquo', '2019'), - '\x93' : ('ldquo', '201C'), - '\x94' : ('rdquo', '201D'), - '\x95' : ('bull', '2022'), - '\x96' : ('ndash', '2013'), - '\x97' : ('mdash', '2014'), - '\x98' : ('tilde', '2DC'), - '\x99' : ('trade', '2122'), - '\x9a' : ('scaron', '161'), - '\x9b' : ('rsaquo', '203A'), - '\x9c' : ('oelig', '153'), - '\x9d' : '?', - '\x9e' : ('#x17E', '17E'), - '\x9f' : ('Yuml', ''),} + MS_CHARS = { + '\x80': ('euro', '20AC'), + '\x81': ' ', + '\x82': ('sbquo', '201A'), + '\x83': ('fnof', '192'), + '\x84': ('bdquo', '201E'), + '\x85': ('hellip', '2026'), + '\x86': ('dagger', '2020'), + '\x87': ('Dagger', '2021'), + '\x88': ('circ', '2C6'), + '\x89': ('permil', '2030'), + '\x8A': ('Scaron', '160'), + '\x8B': ('lsaquo', '2039'), + '\x8C': ('OElig', '152'), + '\x8D': '?', + '\x8E': ('#x17D', '17D'), + '\x8F': '?', + '\x90': '?', + '\x91': ('lsquo', '2018'), + '\x92': ('rsquo', '2019'), + '\x93': ('ldquo', '201C'), + '\x94': ('rdquo', '201D'), + '\x95': ('bull', '2022'), + '\x96': ('ndash', '2013'), + '\x97': ('mdash', '2014'), + '\x98': ('tilde', '2DC'), + '\x99': ('trade', '2122'), + '\x9a': ('scaron', '161'), + '\x9b': ('rsaquo', '203A'), + '\x9c': ('oelig', '153'), + '\x9d': '?', + '\x9e': ('#x17E', '17E'), + '\x9f': ('Yuml', ''), + }
#######################################################################