Revision: 6568 Author: nicdumz Date: 2009-04-01 11:18:59 +0000 (Wed, 01 Apr 2009)
Log Message: ----------- Cleaning up the previous commit: * Cleaned up the scrubxml() implementation * scrubxml() is now applied AFTER the string has been decoded to unicode
Modified Paths: -------------- trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2009-04-01 10:45:17 UTC (rev 6567) +++ trunk/pywikipedia/wikipedia.py 2009-04-01 11:18:59 UTC (rev 6568) @@ -4893,23 +4893,15 @@ # Token not found output(u'WARNING: Token not found on %s. You will not be able to edit any page.' % self)
- def scrubxml(self,xml): + def scrubxml(self, xml): """scrub the start of xml input, to make things work, even - when crap is inserted ahead of the actual xml data. (such as when php reports strict - warnings)""" - xml2="" - start=False - warn=False - for line in xml.split("\n"): - if line.startswith("<?xml"): - start=True - else: - warn=True - if start: - xml2+=line+"\n" - if warn==True: - pass #TODO: we could issue a warning for broken xml - return xml2 + when crap is inserted ahead of the actual xml data. + (such as when php reports strict warnings)""" + start = xml.find('<?xml') + if start < 0: + # '<?xml' not found ? Should not happen. + return "" + return xml[start:]
def mediawiki_message(self, key): """Return the MediaWiki message text for key "key" """ @@ -4957,7 +4949,6 @@ else: xml = self.getUrl(self.get_address("Special:Allmessages") + "&ot=xml") - xml=self.scrubxml(xml) # xml structure is : # <messages lang="fr"> # <message name="about">À propos</message> @@ -4965,7 +4956,8 @@ # </messages> if elementtree: decode = xml.encode(self.encoding()) - tree = XML(decode) + clean = self.scrubxml(decode) + tree = XML(clean) self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.text) for tag in tree.getiterator('message')]) else: