Revision: 4691
Author: russblau
Date: 2007-12-10 17:29:45 +0000 (Mon, 10 Dec 2007)
Log Message:
-----------
Implement -xmlstart parameter (requested in Patch #1649080)
Modified Paths:
--------------
    trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py	2007-12-10 16:59:01 UTC (rev 4690)
+++ trunk/pywikipedia/replace.py	2007-12-10 17:29:45 UTC (rev 4691)
@@ -23,6 +23,10 @@
 -nocase           Use case insensitive regular expressions.
+-xmlstart         (Only works with -xml) Skip all articles in the XML dump
+                  before the one specified (may also be given as
+                  -xmlstart:Article).
+
 -excepttitle:XYZ  Skip pages with titles that contain XYZ. If the -regex argument
                   is given, XYZ will be regarded as a regular expression.
@@ -147,6 +151,7 @@
     These pages will be retrieved from a local XML dump file.
     Arguments:
         * xmlFilename  - The dump's path, either absolute or relative
+        * xmlStart     - Skip all articles in the dump before this one
         * replacements - A list of 2-tuples of original text (as a
                          compiled regular expression) and replacement
                          text (as a string).
@@ -155,10 +160,12 @@
                          constructor below.
     """
-    def __init__(self, xmlFilename, replacements, exceptions):
+    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
         self.xmlFilename = xmlFilename
         self.replacements = replacements
         self.exceptions = exceptions
+        self.xmlStart = xmlStart
+        self.skipping = bool(xmlStart)
         self.excsInside = []
         if self.exceptions.has_key('inside-tags'):
@@ -174,18 +181,32 @@
         return self
     def next(self):
-        while True:
+        try:
+            while True:
+                try:
+                    entry = self.parser.next()
+                except StopIteration:
+                    raise
+                if self.skipping:
+                    if entry.title != self.xmlStart:
+                        continue
+                    self.skipping = False
+                if not self.isTitleExcepted(entry.title) \
+                        and not self.isTextExcepted(entry.text):
+                    new_text = entry.text
+                    for old, new in self.replacements:
+                        new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside)
+                    if new_text != entry.text:
+                        return wikipedia.Page(self.site, entry.title)
+        except KeyboardInterrupt:
             try:
-                entry = self.parser.next()
-            except StopIteration:
-                raise
-            if not self.isTitleExcepted(entry.title) \
-                    and not self.isTextExcepted(entry.text):
-                new_text = entry.text
-                for old, new in self.replacements:
-                    new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside)
-                if new_text != entry.text:
-                    return wikipedia.Page(self.site, entry.title)
+                if not self.skipping:
+                    wikipedia.output(
+                        'To resume, use "-xmlstart:%s" on the command line.'
+                        % entry.title)
+            except NameError:
+                pass
+            raise KeyboardInterrupt
     def isTitleExcepted(self, title):
         if self.exceptions.has_key('title'):
@@ -404,6 +425,12 @@
     for arg in wikipedia.handleArgs():
         if arg == '-regex':
             regex = True
+        elif arg.startswith('-xmlstart'):
+            if len(arg) == 9:
+                xmlStart = wikipedia.input(
+                    u'Please enter the dumped article to start with:')
+            else:
+                xmlStart = arg[10:]
         elif arg.startswith('-xml'):
             if len(arg) == 4:
                 xmlFilename = wikipedia.input(
@@ -527,7 +554,12 @@
             exceptions[exceptionCategory] = patterns
     if xmlFilename:
-        gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
+        try:
+            xmlStart
+        except NameError:
+            xmlStart = None
+        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
+                                          replacements, exceptions)
     elif useSql:
         whereClause = 'WHERE (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern) for (old, new) in replacements])
         if exceptions:
@@ -553,7 +585,7 @@
         wikipedia.stopme()
         sys.exit()
     if namespaces != []:
-        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
+        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
     if xmlFilename:
         # XML parsing is slow enough that preloading would make bot even slower
         preloadingGen = gen
pywikipedia-l@lists.wikimedia.org