[Pywikipedia-svn] SVN: [6739] trunk/pywikipedia/xmlreader.py

27 Apr 2009

Revision: 6739
Author:   nicdumz
Date:     2009-04-27 15:21:01 +0000 (Mon, 27 Apr 2009)
Log Message:
-----------
#2679114 : "xmlreader.xmldump iterparse all revisions"
Adding an allrevisions parameter to the xmlreader.XmlDump constructor to 
be able to fetch all revisions from a dump when needed.
Kudos to Johan Euphrosine for the original patch :)
Modified Paths:
--------------
    trunk/pywikipedia/xmlreader.py
Modified: trunk/pywikipedia/xmlreader.py
===================================================================

--- trunk/pywikipedia/xmlreader.py	2009-04-27 14:32:03 UTC (rev 6738)
+++ trunk/pywikipedia/xmlreader.py	2009-04-27 15:21:01 UTC (rev 6739)
@@ -237,9 +237,18 @@
     NOTE: This used to be done by a SAX parser, but the solution with regular
     expressions is about 10 to 20 times faster. The cElementTree version is
     again much, much faster than the regex solution.
+
+    @param allrevisions: boolean
+        Only available for cElementTree version:
+        If True, parse all revisions instead of only the latest one.
+        Default: False.
     """
-    def __init__(self, filename):
+    def __init__(self, filename, allrevisions=False):
         self.filename = filename
+        if allrevisions:
+            self._parse = self._parse_all
+        else:
+            self._parse = self._parse_only_latest
 
     def parse(self):
         """Return a generator that will yield XmlEntry objects"""
@@ -261,38 +270,60 @@
             # assume it's an uncompressed XML file
             source = open(self.filename)
         context = iterparse(source, events=("start", "end", "start-ns"))
-        root = None
+        self.root = None
 
         for event, elem in context:
             if event == "start-ns" and elem[0] == "":
-                uri = elem[1]
+                self.uri = elem[1]
                 continue
-            if event == "start" and root is None:
-                root = elem
+            if event == "start" and self.root is None:
+                self.root = elem
                 continue
-            if event == "end" and elem.tag == "{%s}page" % uri:
-                title = elem.findtext("{%s}title" % uri)
-                pageid = elem.findtext("{%s}id" % uri)
-                restrictions = elem.findtext("{%s}restrictions" % uri)
-                revision = elem.find("{%s}revision" % uri)
-                revisionid = revision.findtext("{%s}id" % uri)
-                timestamp = revision.findtext("{%s}timestamp" % uri)
-                contributor = revision.find("{%s}contributor" % uri)
-                ipeditor = contributor.findtext("{%s}ip" % uri)
-                username = ipeditor or contributor.findtext("{%s}username" % uri)
-                # could get comment, minor as well
-                text = revision.findtext("{%s}text" % uri)
-                editRestriction, moveRestriction \
-                        = parseRestrictions(restrictions)
-                yield XmlEntry(title=title, id=pageid, text=text or u'',
-                               username=username, ipedit=bool(ipeditor),
-                               timestamp=timestamp,
-                               editRestriction=editRestriction,
-                               moveRestriction=moveRestriction,
-                               revisionid=revisionid
-                              )
-                root.clear()
+            for rev in self._parse(event, elem):
+                yield rev
+    def _parse_only_latest(self, event, elem):
+        """Parser that yields only the latest revision"""
+        if event == "end" and elem.tag == "{%s}page" % self.uri:
+            self._headers(elem)
+
+            revision = elem.find("{%s}revision" % self.uri)
+            yield self._create_revision(revision)
+            self.root.clear()
+
+    def _parse_all(self, event, elem):
+        """Parser that yields all revisions"""
+        if event == "start" and elem.tag == "{%s}revision" % self.uri:
+            self._headers(elem)
+
+        if event == "end" and elem.tag == "{%s}revision" % self.uri:
+            yield self._create_revision(elem)
+            self.root.clear()
+    
+    def _headers(self, elem):
+        self.title = elem.findtext("{%s}title" % self.uri)
+        self.pageid = elem.findtext("{%s}id" % self.uri)
+        self.restrictions = elem.findtext("{%s}restrictions" % self.uri)
+
+    def _create_revision(self, revision):
+        """Creates a Single revision"""
+        revisionid = revision.findtext("{%s}id" % self.uri)
+        timestamp = revision.findtext("{%s}timestamp" % self.uri)
+        contributor = revision.find("{%s}contributor" % self.uri)
+        ipeditor = contributor.findtext("{%s}ip" % self.uri)
+        username = ipeditor or contributor.findtext("{%s}username" % self.uri)
+        # could get comment, minor as well
+        text = revision.findtext("{%s}text" % self.uri)
+        editRestriction, moveRestriction \
+                = parseRestrictions(self.restrictions)
+        return XmlEntry(title=self.title, id=self.pageid, text=text or u'',
+                       username=username, ipedit=bool(ipeditor),
+                       timestamp=timestamp,
+                       editRestriction=editRestriction,
+                       moveRestriction=moveRestriction,
+                       revisionid=revisionid
+                      )
+
     def regex_parse(self):
         """
         Generator which reads some lines from the XML dump file, and

    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

[Pywikipedia-svn] SVN: [6739] trunk/pywikipedia/xmlreader.py