http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10383
Revision: 10383
Author: xqt
Date: 2012-06-20 13:11:29 +0000 (Wed, 20 Jun 2012)
Log Message:
-----------
xmlreader from trunk r10259
Added Paths:
-----------
branches/rewrite/pywikibot/xmlreader.py
Copied: branches/rewrite/pywikibot/xmlreader.py (from rev 10375,
trunk/pywikipedia/xmlreader.py)
===================================================================
--- branches/rewrite/pywikibot/xmlreader.py (rev 0)
+++ branches/rewrite/pywikibot/xmlreader.py 2012-06-20 13:11:29 UTC (rev 10383)
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+
+"""
+Each XmlEntry object represents a page, as read from an XML source
+
+The XmlDump class reads a pages_current XML dump (like the ones offered on
+http://download.wikimedia.org/wikipedia/de/) and offers a generator over
+XmlEntry objects which can be used by other bots.
+"""
+#
+# (C) Pywikipedia bot team, 2005-2012
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id$'
+#
+
+import threading
+import codecs, re
+from xml.etree.cElementTree import iterparse
+import pywikibot
+
+def parseRestrictions(restrictions):
+    """
+    Split the text of a restrictions tag into its edit and move parts.
+
+    @param restrictions: content of the restrictions tag, e.g.
+        'edit=sysop:move=sysop', the bare word 'sysop', or None.
+    @return: tuple (editRestriction, moveRestriction). Each item is the
+        user group named after 'edit=' / 'move=' (up to the next colon),
+        or None if that kind of restriction is not mentioned.
+    """
+    if not restrictions:
+        # missing or empty tag: nothing is restricted
+        return None, None
+    if restrictions == 'sysop':
+        # the bare word applies to both editing and moving
+        return 'sysop', 'sysop'
+    groups = []
+    for pattern in ('edit=([^:]*)', 'move=([^:]*)'):
+        found = re.search(pattern, restrictions)
+        groups.append(found.group(1) if found else None)
+    return tuple(groups)
+
+
+class XmlEntry:
+    """
+    Plain value holder for one revision of a page read from an XML source.
+
+    All constructor arguments are stored unchanged under the same name,
+    except that username is stripped of surrounding whitespace and
+    redirect is stored as the attribute isredirect.
+    """
+    def __init__(self, title, ns, id, text, username, ipedit, timestamp,
+                 editRestriction, moveRestriction, revisionid, comment,
+                 redirect):
+        # TODO: there are more tags we can read.
+
+        # data describing the page as a whole
+        self.title = title
+        self.ns = ns
+        self.id = id
+        self.isredirect = redirect
+        self.editRestriction = editRestriction
+        self.moveRestriction = moveRestriction
+
+        # data describing this particular revision
+        self.revisionid = revisionid
+        self.timestamp = timestamp
+        self.username = username.strip()
+        self.ipedit = ipedit
+        self.comment = comment
+        self.text = text
+
+
+class XmlParserThread(threading.Thread):
+    """
+    Thread that feeds a file through a SAX parser.
+
+    Running the parser in its own thread allows a consumer to pick up
+    pages from the handler before the parser has finished reading the
+    entire dump.
+
+    There surely are more elegant ways to do this.
+
+    @param filename: the source handed to xml.sax.parse()
+    @param handler: the SAX content handler that receives the events
+    """
+    def __init__(self, filename, handler):
+        threading.Thread.__init__(self)
+        self.filename = filename
+        self.handler = handler
+
+    def run(self):
+        """Parse self.filename, reporting all events to self.handler."""
+        # The module header does not import xml.sax, so it is imported
+        # here; without it this method fails with a NameError.
+        import xml.sax
+        xml.sax.parse(self.filename, self.handler)
+
+
+class XmlDump(object):
+    """
+    Represents an XML dump file.
+
+    Nothing is read at initialization. The parse() generator opens the
+    file and yields XmlEntry objects while the dump is being read.
+
+    @param filename: path of the dump. Names ending in .bz2, .gz or .7z
+        are decompressed on the fly; anything else is read as plain XML.
+    @param allrevisions: boolean
+        If True, parse all revisions instead of only the latest one.
+        Default: False.
+    """
+    def __init__(self, filename, allrevisions=False):
+        self.filename = filename
+        # <page> element whose header fields still have to be read;
+        # only used by _parse_all
+        self._page = None
+        if allrevisions:
+            self._parse = self._parse_all
+        else:
+            self._parse = self._parse_only_latest
+
+    def _open_source(self):
+        """
+        Open the dump for reading, decompressing it if necessary.
+
+        @return: tuple (source, process); source is a file-like object
+            delivering the XML, process is the 7za subprocess for .7z
+            files and None otherwise.
+        """
+        process = None
+        if self.filename.endswith('.bz2'):
+            import bz2
+            source = bz2.BZ2File(self.filename)
+        elif self.filename.endswith('.gz'):
+            import gzip
+            source = gzip.open(self.filename)
+        elif self.filename.endswith('.7z'):
+            import os
+            import subprocess
+            # The file name is passed as a separate argument instead of
+            # being pasted into a shell command line, so blanks or shell
+            # metacharacters in it are harmless. Messages from 7za are
+            # discarded.
+            devnull = open(os.devnull, 'wb')
+            try:
+                process = subprocess.Popen(
+                    ['7za', 'e', '-bd', '-so', self.filename],
+                    stdout=subprocess.PIPE,
+                    stderr=devnull,
+                    bufsize=65535)
+            finally:
+                # the child keeps its own copy of the descriptor
+                devnull.close()
+            source = process.stdout
+        else:
+            # assume it's an uncompressed XML file; read bytes and let
+            # the XML parser do the decoding
+            source = open(self.filename, 'rb')
+        return source, process
+
+    def parse(self):
+        """
+        Generator using cElementTree iterparse function
+
+        Yields one XmlEntry per page, or one per revision if the object
+        was created with allrevisions=True. The source is closed when
+        the generator is exhausted or closed.
+        """
+        source, process = self._open_source()
+        try:
+            context = iterparse(source, events=("start", "end", "start-ns"))
+            self.root = None
+            self._page = None
+
+            for event, elem in context:
+                if event == "start-ns" and elem[0] == "":
+                    # default namespace; all tags are qualified with it
+                    self.uri = elem[1]
+                    continue
+                if event == "start" and self.root is None:
+                    # keep the root element so that finished pages can
+                    # be dropped from it to save memory
+                    self.root = elem
+                    continue
+                for rev in self._parse(event, elem):
+                    yield rev
+        finally:
+            source.close()
+            if process is not None:
+                process.wait()
+
+    def _parse_only_latest(self, event, elem):
+        """
+        Parser that yields only the latest revision
+
+        Acts on the end of a page element and creates the entry from
+        the first revision element found in it.
+        """
+        if event == "end" and elem.tag == "{%s}page" % self.uri:
+            self._headers(elem)
+            revision = elem.find("{%s}revision" % self.uri)
+            yield self._create_revision(revision)
+            elem.clear()
+            self.root.clear()
+
+    def _parse_all(self, event, elem):
+        """Parser that yields all revisions"""
+        if event == "start" and elem.tag == "{%s}page" % self.uri:
+            # At a "start" event the children of the element are not
+            # guaranteed to be parsed yet, so only remember the page
+            # here and read its header fields later.
+            self._page = elem
+        if event == "end" and elem.tag == "{%s}revision" % self.uri:
+            if self._page is not None:
+                # first revision of the page: everything preceding it
+                # inside the page element has been parsed by now
+                self._headers(self._page)
+                self._page = None
+            yield self._create_revision(elem)
+            elem.clear()
+            self.root.clear()
+
+    def _headers(self, elem):
+        """Store the page level fields of page element elem on self."""
+        self.title = elem.findtext("{%s}title" % self.uri)
+        self.ns = elem.findtext("{%s}ns" % self.uri)
+        self.pageid = elem.findtext("{%s}id" % self.uri)
+        self.restrictions = elem.findtext("{%s}restrictions" % self.uri)
+        self.isredirect = elem.findtext("{%s}redirect" % self.uri) is not None
+        self.editRestriction, self.moveRestriction \
+            = parseRestrictions(self.restrictions)
+
+    def _create_revision(self, revision):
+        """
+        Creates a Single revision
+
+        Combines the fields of the given revision element with the page
+        level fields stored by _headers() and returns them as XmlEntry.
+        """
+        revisionid = revision.findtext("{%s}id" % self.uri)
+        timestamp = revision.findtext("{%s}timestamp" % self.uri)
+        comment = revision.findtext("{%s}comment" % self.uri)
+        contributor = revision.find("{%s}contributor" % self.uri)
+        if contributor is None:
+            # no contributor element at all: treat like a deleted user
+            ipeditor = username = None
+        else:
+            ipeditor = contributor.findtext("{%s}ip" % self.uri)
+            username = ipeditor or contributor.findtext(
+                "{%s}username" % self.uri)
+        # could get minor as well
+        text = revision.findtext("{%s}text" % self.uri)
+        return XmlEntry(title=self.title,
+                        ns=self.ns,
+                        id=self.pageid,
+                        text=text or u'',
+                        username=username or u'',  # username might be deleted
+                        ipedit=bool(ipeditor),
+                        timestamp=timestamp,
+                        editRestriction=self.editRestriction,
+                        moveRestriction=self.moveRestriction,
+                        revisionid=revisionid,
+                        comment=comment,
+                        redirect=self.isredirect
+                        )