SVN: [10383] branches/rewrite/pywikibot/xmlreader.py - Pywikipedia-svn

20 Jun 2012

http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10383

Revision: 10383
Author:   xqt
Date:     2012-06-20 13:11:29 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
xmlreader from trunk r10259

Added Paths:
-----------
    branches/rewrite/pywikibot/xmlreader.py

Copied: branches/rewrite/pywikibot/xmlreader.py (from rev 10375,
trunk/pywikipedia/xmlreader.py)
===================================================================

--- branches/rewrite/pywikibot/xmlreader.py	                        (rev 0)
+++ branches/rewrite/pywikibot/xmlreader.py	2012-06-20 13:11:29 UTC (rev 10383)
@@ -0,0 +1,181 @@
+# -*- coding: utf-8  -*-
+
+"""
+Each XmlEntry object represents a page, as read from an XML source.
+
+The XmlDump class reads a pages_current XML dump (like the ones offered on
+http://download.wikimedia.org/wikipedia/de/) and offers a generator over
+XmlEntry objects which can be used by other bots.
+"""
+#
+# (C) Pywikipedia bot team, 2005-2012
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id$'
+#
+
+import threading
+import codecs, re
+from xml.etree.cElementTree import iterparse
+import pywikibot
+
+def parseRestrictions(restrictions):
+    """
+    Split the content of a restrictions tag into edit and move levels.
+
+    Two forms are understood: the keyed form, in which 'edit=' and 'move='
+    are each followed by a group name that runs up to the next colon, and
+    the bare string 'sysop', which restricts both actions to sysops.
+
+    @param restrictions: text of the restrictions tag; may be None or empty
+    @return: tuple (editRestriction, moveRestriction); an item is None
+        when no restriction was found for that action
+    """
+    if not restrictions:
+        return None, None
+    if restrictions == 'sysop':
+        # Bare value without keys: applies to editing and moving alike.
+        return 'sysop', 'sysop'
+    levels = []
+    for pattern in ('edit=([^:]*)', 'move=([^:]*)'):
+        found = re.search(pattern, restrictions)
+        if found:
+            levels.append(found.group(1))
+        else:
+            levels.append(None)
+    return levels[0], levels[1]
+
+
+class XmlEntry:
+    """
+    Plain record holding one page revision as read from an XML dump.
+
+    Every constructor argument is stored under an attribute of the same
+    name, with two exceptions: username is stored with surrounding
+    whitespace stripped, and redirect is stored as isredirect.
+    """
+    def __init__(self, title, ns, id, text, username, ipedit, timestamp,
+                 editRestriction, moveRestriction, revisionid, comment,
+                 redirect):
+        # TODO: there are more tags we can read.
+
+        # Data describing the page as a whole.
+        self.title, self.ns, self.id = title, ns, id
+        self.isredirect = redirect
+        self.editRestriction = editRestriction
+        self.moveRestriction = moveRestriction
+
+        # Data describing this particular revision.
+        self.revisionid = revisionid
+        self.timestamp = timestamp
+        self.comment = comment
+        self.text = text
+        self.username = username.strip()
+        self.ipedit = ipedit
+
+
+class XmlParserThread(threading.Thread):
+    """
+    This XML parser will run as a single thread. This allows the XmlDump
+    generator to yield pages before the parser has finished reading the
+    entire dump.
+
+    There surely are more elegant ways to do this.
+
+    @param filename: handed to xml.sax.parse as the source to read
+    @param handler: handed to xml.sax.parse as the content handler
+    """
+    def __init__(self, filename, handler):
+        threading.Thread.__init__(self)
+        self.filename = filename
+        self.handler = handler
+
+    def run(self):
+        """Parse self.filename with SAX, sending the events to self.handler."""
+        # xml.sax is not imported at the top of this module (only iterparse
+        # is pulled in from xml.etree), so bind it here; without this the
+        # call below fails with a NameError.
+        import xml.sax
+        xml.sax.parse(self.filename, self.handler)
+
+
+class XmlDump(object):
+    """
+    Represents an XML dump file and offers access to its XmlEntries via a
+    generator.
+
+    Nothing is read at initialization; the file is opened and parsed
+    incrementally while the generator returned by parse() is consumed.
+
+    @param filename: path of the dump. Names ending in .bz2, .gz or .7z
+        are decompressed on the fly (.7z through the external 7za
+        program); any other name is read as uncompressed XML.
+    @param allrevisions: boolean
+        If True, parse all revisions instead of only the latest one.
+        Default: False.
+    """
+    def __init__(self, filename, allrevisions=False):
+        self.filename = filename
+        # <page> element whose headers _parse_all has not read yet
+        self._page = None
+        if allrevisions:
+            self._parse = self._parse_all
+        else:
+            self._parse = self._parse_only_latest
+
+    def parse(self):
+        """Generator using cElementTree iterparse function.
+
+        Yields the XmlEntry objects produced by the parser selected in
+        __init__. The input is closed once the generator is exhausted or
+        closed.
+        """
+        source, process = self._open_source()
+        try:
+            context = iterparse(source, events=("start", "end", "start-ns"))
+            self.root = None
+            self._page = None
+
+            for event, elem in context:
+                if event == "start-ns" and elem[0] == "":
+                    # default namespace; every tag name is qualified with it
+                    self.uri = elem[1]
+                    continue
+                if event == "start" and self.root is None:
+                    # the first element started is the document root
+                    self.root = elem
+                    continue
+                for rev in self._parse(event, elem):
+                    yield rev
+        finally:
+            source.close()
+            if process is not None:
+                # reap the 7za child so that it does not linger as a zombie
+                process.wait()
+
+    def _open_source(self):
+        """Open self.filename for reading, decompressing by file extension.
+
+        @return: tuple (source, process). source is a file-like object
+            delivering the XML data; process is the 7za child process for
+            .7z input and None otherwise.
+        """
+        if self.filename.endswith('.bz2'):
+            import bz2
+            return bz2.BZ2File(self.filename), None
+        if self.filename.endswith('.gz'):
+            import gzip
+            return gzip.open(self.filename), None
+        if self.filename.endswith('.7z'):
+            import os
+            import subprocess
+            # Pass the arguments as a list rather than a shell command
+            # line, so that spaces or shell metacharacters in the filename
+            # are not interpreted. stderr is sent to the null device, which
+            # is what "2>/dev/null" did in the shell version.
+            devnull = open(os.devnull, 'wb')
+            try:
+                process = subprocess.Popen(
+                    ['7za', 'e', '-bd', '-so', self.filename],
+                    stdout=subprocess.PIPE,
+                    stderr=devnull,
+                    bufsize=65535)
+            finally:
+                # the child holds its own copy of the descriptor
+                devnull.close()
+            return process.stdout, process
+        # assume it's an uncompressed XML file; binary mode leaves decoding
+        # to the XML parser, which honours the declared encoding
+        return open(self.filename, 'rb'), None
+
+    def _parse_only_latest(self, event, elem):
+        """Parser that yields only the latest revision.
+
+        Acts once a page element is complete and uses its first revision
+        child.
+        NOTE(review): "latest" therefore relies on the dump holding a
+        single revision per page -- confirm for the dumps in use.
+        """
+        if event == "end" and elem.tag == "{%s}page" % self.uri:
+            self._headers(elem)
+            revision = elem.find("{%s}revision" % self.uri)
+            yield self._create_revision(revision)
+            # free the memory held by the elements already processed
+            elem.clear()
+            self.root.clear()
+
+    def _parse_all(self, event, elem):
+        """Parser that yields all revisions"""
+        if event == "start" and elem.tag == "{%s}page" % self.uri:
+            # At a "start" event the children of the element are not
+            # guaranteed to have been parsed, so only remember the page
+            # here; its headers are read when the first revision ends.
+            # NOTE(review): this assumes the header tags precede the first
+            # revision inside a page, as in the MediaWiki export format.
+            self._page = elem
+        if event == "end" and elem.tag == "{%s}revision" % self.uri:
+            if self._page is not None:
+                self._headers(self._page)
+                self._page = None
+            yield self._create_revision(elem)
+            # free the memory held by the elements already processed
+            elem.clear()
+            self.root.clear()
+
+    def _headers(self, elem):
+        """Store the page-level fields of a page element on self.
+
+        Sets title, ns, pageid, restrictions, isredirect, editRestriction
+        and moveRestriction.
+        """
+        self.title = elem.findtext("{%s}title" % self.uri)
+        self.ns = elem.findtext("{%s}ns" % self.uri)
+        self.pageid = elem.findtext("{%s}id" % self.uri)
+        self.restrictions = elem.findtext("{%s}restrictions" % self.uri)
+        # findtext returns None only if there is no redirect child at all
+        self.isredirect = elem.findtext("{%s}redirect" % self.uri) is not None
+        self.editRestriction, self.moveRestriction \
+                              = parseRestrictions(self.restrictions)
+
+    def _create_revision(self, revision):
+        """Creates a Single revision
+
+        Combines the fields of the given revision element with the page
+        headers stored by _headers().
+
+        @param revision: a revision element
+        @return: XmlEntry
+        """
+        revisionid = revision.findtext("{%s}id" % self.uri)
+        timestamp = revision.findtext("{%s}timestamp" % self.uri)
+        comment = revision.findtext("{%s}comment" % self.uri)
+        contributor = revision.find("{%s}contributor" % self.uri)
+        ipeditor = contributor.findtext("{%s}ip" % self.uri)
+        # for anonymous edits the IP address doubles as the username
+        username = ipeditor or contributor.findtext("{%s}username" % self.uri)
+        # could get minor as well
+        text = revision.findtext("{%s}text" % self.uri)
+        return XmlEntry(title=self.title,
+                        ns=self.ns,
+                        id=self.pageid,
+                        text=text or u'',
+                        username=username or u'', #username might be deleted
+                        ipedit=bool(ipeditor),
+                        timestamp=timestamp,
+                        editRestriction=self.editRestriction,
+                        moveRestriction=self.moveRestriction,
+                        revisionid=revisionid,
+                        comment=comment,
+                        redirect=self.isredirect
+                       )