Revision: 6739
Author:   nicdumz
Date:     2009-04-27 15:21:01 +0000 (Mon, 27 Apr 2009)
Log Message:
-----------
#2679114 : "xmlreader.xmldump iterparse all revisions"
Adding an allrevisions parameter to the xmlreader.XmlDump constructor to be able to fetch all revisions from a dump when needed.
Kudos to Johan Euphrosine for the original patch :)
Modified Paths:
--------------
    trunk/pywikipedia/xmlreader.py
Modified: trunk/pywikipedia/xmlreader.py
===================================================================
--- trunk/pywikipedia/xmlreader.py	2009-04-27 14:32:03 UTC (rev 6738)
+++ trunk/pywikipedia/xmlreader.py	2009-04-27 15:21:01 UTC (rev 6739)
@@ -237,9 +237,18 @@
     NOTE: This used to be done by a SAX parser, but the solution with regular
     expressions is about 10 to 20 times faster. The cElementTree version is
     again much, much faster than the regex solution.
+
+    @param allrevisions: boolean
+        Only available for cElementTree version:
+        If True, parse all revisions instead of only the latest one.
+        Default: False.
     """
-    def __init__(self, filename):
+    def __init__(self, filename, allrevisions=False):
         self.filename = filename
+        if allrevisions:
+            self._parse = self._parse_all
+        else:
+            self._parse = self._parse_only_latest
 
     def parse(self):
         """Return a generator that will yield XmlEntry objects"""
@@ -261,38 +270,60 @@
             # assume it's an uncompressed XML file
             source = open(self.filename)
         context = iterparse(source, events=("start", "end", "start-ns"))
-        root = None
+        self.root = None
 
         for event, elem in context:
             if event == "start-ns" and elem[0] == "":
-                uri = elem[1]
+                self.uri = elem[1]
                 continue
-            if event == "start" and root is None:
-                root = elem
+            if event == "start" and self.root is None:
+                self.root = elem
                 continue
-            if event == "end" and elem.tag == "{%s}page" % uri:
-                title = elem.findtext("{%s}title" % uri)
-                pageid = elem.findtext("{%s}id" % uri)
-                restrictions = elem.findtext("{%s}restrictions" % uri)
-                revision = elem.find("{%s}revision" % uri)
-                revisionid = revision.findtext("{%s}id" % uri)
-                timestamp = revision.findtext("{%s}timestamp" % uri)
-                contributor = revision.find("{%s}contributor" % uri)
-                ipeditor = contributor.findtext("{%s}ip" % uri)
-                username = ipeditor or contributor.findtext("{%s}username" % uri)
-                # could get comment, minor as well
-                text = revision.findtext("{%s}text" % uri)
-                editRestriction, moveRestriction \
-                        = parseRestrictions(restrictions)
-                yield XmlEntry(title=title, id=pageid, text=text or u'',
-                               username=username, ipedit=bool(ipeditor),
-                               timestamp=timestamp,
-                               editRestriction=editRestriction,
-                               moveRestriction=moveRestriction,
-                               revisionid=revisionid
-                              )
-                root.clear()
+            for rev in self._parse(event, elem):
+                yield rev
 
+    def _parse_only_latest(self, event, elem):
+        """Parser that yields only the latest revision"""
+        if event == "end" and elem.tag == "{%s}page" % self.uri:
+            self._headers(elem)
+
+            revision = elem.find("{%s}revision" % self.uri)
+            yield self._create_revision(revision)
+            self.root.clear()
+
+    def _parse_all(self, event, elem):
+        """Parser that yields all revisions"""
+        if event == "start" and elem.tag == "{%s}revision" % self.uri:
+            self._headers(elem)
+
+        if event == "end" and elem.tag == "{%s}revision" % self.uri:
+            yield self._create_revision(elem)
+            self.root.clear()
+
+    def _headers(self, elem):
+        self.title = elem.findtext("{%s}title" % self.uri)
+        self.pageid = elem.findtext("{%s}id" % self.uri)
+        self.restrictions = elem.findtext("{%s}restrictions" % self.uri)
+
+    def _create_revision(self, revision):
+        """Creates a Single revision"""
+        revisionid = revision.findtext("{%s}id" % self.uri)
+        timestamp = revision.findtext("{%s}timestamp" % self.uri)
+        contributor = revision.find("{%s}contributor" % self.uri)
+        ipeditor = contributor.findtext("{%s}ip" % self.uri)
+        username = ipeditor or contributor.findtext("{%s}username" % self.uri)
+        # could get comment, minor as well
+        text = revision.findtext("{%s}text" % self.uri)
+        editRestriction, moveRestriction \
+                = parseRestrictions(self.restrictions)
+        return XmlEntry(title=self.title, id=self.pageid, text=text or u'',
+                        username=username, ipedit=bool(ipeditor),
+                        timestamp=timestamp,
+                        editRestriction=editRestriction,
+                        moveRestriction=moveRestriction,
+                        revisionid=revisionid
+                       )
+
     def regex_parse(self):
         """
         Generator which reads some lines from the XML dump file, and