[Pywikipedia-l] SVN: [6568] trunk/pywikipedia/wikipedia.py

Wed Apr 1 11:18:59 UTC 2009

Revision: 6568
Author:   nicdumz
Date:     2009-04-01 11:18:59 +0000 (Wed, 01 Apr 2009)

Log Message:
-----------
Cleaning up the previous commit:
* Cleaning up the scrubxml() implementation
* Applying scrubxml() AFTER decoding the string to unicode

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/wikipedia.py
===================================================================

--- trunk/pywikipedia/wikipedia.py	2009-04-01 10:45:17 UTC (rev 6567)
+++ trunk/pywikipedia/wikipedia.py	2009-04-01 11:18:59 UTC (rev 6568)
@@ -4893,23 +4893,15 @@
                 # Token not found
                 output(u'WARNING: Token not found on %s. You will not be able to edit any page.' % self)
 
-    def scrubxml(self,xml):
+    def scrubxml(self, xml):
         """scrub the start of xml input, to make things work, even
-        when crap is inserted ahead of the actual xml data. (such as when php reports strict
-        warnings)"""
-        xml2=""
-        start=False
-        warn=False
-        for line in xml.split("\n"):
-            if line.startswith("<?xml"):
-                start=True
-            else:
-		 warn=True
-            if start:
-                xml2+=line+"\n"
-        if warn==True:
-            pass    #TODO: we could issue a warning for broken xml
-        return xml2
+        when crap is inserted ahead of the actual xml data. 
+        (such as when php reports strict warnings)"""
+        start = xml.find('<?xml')
+        if start < 0:
+            # '<?xml' not found ? Should not happen.
+            return ""
+        return xml[start:]
 
     def mediawiki_message(self, key):
         """Return the MediaWiki message text for key "key" """
@@ -4957,7 +4949,6 @@
                 else:
                     xml = self.getUrl(self.get_address("Special:Allmessages")
                                         + "&ot=xml")
-                    xml=self.scrubxml(xml)
                     # xml structure is :
                     # <messages lang="fr">
                     #    <message name="about">À propos</message>
@@ -4965,7 +4956,8 @@
                     # </messages>
                     if elementtree:
                         decode = xml.encode(self.encoding())
-                        tree = XML(decode)
+                        clean = self.scrubxml(decode)
+                        tree = XML(clean)
                         self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.text)
                                 for tag in tree.getiterator('message')])
                     else: