Revision: 6186
Author: russblau
Date: 2008-12-22 17:05:56 +0000 (Mon, 22 Dec 2008)
Log Message:
-----------
Branching replace.py for port to rewrite
Added Paths:
-----------
branches/rewrite/pywikibot/scripts/replace.py
Copied: branches/rewrite/pywikibot/scripts/replace.py (from rev 6185,
trunk/pywikipedia/replace.py)
===================================================================
--- branches/rewrite/pywikibot/scripts/replace.py (rev 0)
+++ branches/rewrite/pywikibot/scripts/replace.py 2008-12-22 17:05:56 UTC (rev 6186)
@@ -0,0 +1,710 @@
+# -*- coding: utf-8 -*-
+"""
+This bot will make direct text replacements. It will retrieve information on
+which pages might need changes either from an XML dump or a text file, or only
+change a single page.
+
+These command line parameters can be used to specify which pages to work on:
+
+&params;
+
+-xml Retrieve information from a local XML dump (pages-articles
+ or pages-meta-current, see
http://download.wikimedia.org).
+ Argument can also be given as "-xml:filename".
+
+-page Only edit a specific page.
+ Argument can also be given as "-page:pagetitle". You can
+ give this parameter multiple times to edit multiple pages.
+
+Furthermore, the following command line parameters are supported:
+
+-regex Make replacements using regular expressions. If this argument
+ isn't given, the bot will make simple text replacements.
+
+-nocase Use case insensitive regular expressions.
+
+-xmlstart (Only works with -xml) Skip all articles in the XML dump
+ before the one specified (may also be given as
+ -xmlstart:Article).
+
+-addcat:cat_name Adds "cat_name" category to every altered page.
+
+-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex
+ argument is given, XYZ will be regarded as a regular
+ expression.
+
+-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex
+ argument is given, XYZ will be regarded as a regular
+ expression.
+
+-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex
+ argument is given, XYZ will be regarded as a regular
+ expression.
+
+-exceptinside:XYZ Skip occurrences of the to-be-replaced text which lie
+ within XYZ. If the -regex argument is given, XYZ will be
+ regarded as a regular expression.
+
+-exceptinsidetag:XYZ Skip occurrences of the to-be-replaced text which lie
+ within an XYZ tag.
+
+-summary:XYZ Set the summary message text for the edit to XYZ, bypassing
+ the predefined message texts with original and replacements
+ inserted.
+
+-sleep:123 If you use -fix you can check multiple regex at the same time
+ in every page. This can lead to a great waste of CPU because
+ the bot will check every regex without waiting using all the
+ resources. This will slow it down between a regex and another
+ in order not to waste too much CPU.
+
+-fix:XYZ Perform one of the predefined replacements tasks, which are
+ given in the dictionary 'fixes' defined inside the file
+ fixes.py.
+ The -regex and -nocase argument and given replacements will
+ be ignored if you use -fix.
+ Currently available predefined fixes are:
+&fixes-help;
+
+-namespace:n Number or name of namespace to process. The parameter can be
+ used multiple times. It works in combination with all other
+ parameters, except for the -start parameter. If you e.g.
+ want to iterate over all categories starting at M, use
+ -start:Category:M.
+
+-always Don't prompt you for each replacement
+
+-recursive Recurse replacement as long as possible. Be careful, this
+ might lead to an infinite loop.
+
+-allowoverlap When occurrences of the pattern overlap, replace all of them.
+ Be careful, this might lead to an infinite loop.
+
+other: First argument is the old text, second argument is the new
+ text. If the -regex argument is given, the first argument
+ will be regarded as a regular expression, and the second
+ argument might contain expressions like \\1 or \g<name>.
+
+Examples:
+
+If you want to change templates from the old syntax, e.g. {{msg:Stub}}, to the
+new syntax, e.g. {{Stub}}, download an XML dump file (pages-articles) from
+http://download.wikimedia.org, then use this command:
+
+ python replace.py -xml -regex "{{msg:(.*?)}}" "{{\\1}}"
+
+If you have a dump called foobar.xml and want to fix typos in articles, e.g.
+Errror -> Error, use this:
+
+ python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0
+
+If you have a page called 'John Doe' and want to fix the format of ISBNs, use:
+
+ python replace.py -page:John_Doe -fix:isbn
+
+This command will change 'referer' to 'referrer', but not in pages which
+talk about HTTP, where the typo has become part of the standard:
+
+ python replace.py referer referrer -file:typos.txt -excepttext:HTTP
+"""
+#
+# (C) Daniel Herding & the Pywikipediabot Team, 2004-2008
+#
+# Distributed under the terms of the MIT license.
+#
+
+from __future__ import generators
+import sys, re, time
+import wikipedia, pagegenerators, catlib, config
+import editarticle
+import webbrowser
+
+# Imports predefined replacements tasks from fixes.py
+import fixes
+
# This is required for the text that is shown when you run this script
# with the parameter -help.
# NOTE: the '&params;' key was mojibake ('¶ms;') in the copied revision;
# it must match the marker used in the module docstring above so that
# pagegenerators.parameterHelp gets substituted into the help text.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
    '&fixes-help;': fixes.help,
}

__version__ = '$Id$'
+
# Summary messages in different languages, keyed by language code.
# Each value must contain exactly one '%s' placeholder: the code below
# applies 'msg % replacement_description' to build the edit summary.
# NOTE: Predefined replacement tasks might use their own dictionary, see
# 'fixes' below.
msg = {
    'ar':u'%s روبوت : استبدال تلقائي للنص',
    'cs':u'Robot automaticky nahradil text: %s',
    'de':u'Bot: Automatisierte Textersetzung %s',
    'el':u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
    'en':u'Robot: Automated text replacement %s',
    'es':u'Robot: Reemplazo automático de texto %s',
    'fa':u'ربات: تغییر خودکار متن %s',
    'fr':u'Bot : Remplacement de texte automatisé %s',
    'he':u'בוט: החלפת טקסט אוטומטית %s',
    'hu':u'Robot: Automatikus szövegcsere %s',
    'ia':u'Robot: Reimplaciamento automatic de texto %s',
    'id':u'Bot: Penggantian teks otomatis %s',
    'is':u'Vélmenni: breyti texta %s',
    'it':u'Bot: Sostituzione automatica %s',
    'ja':u'ロボットによる: 文字置き換え %s',
    'ka':u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
    'kk':u'Бот: Мәтінді өздікті алмастырды: %s',
    'ksh':u'Bot: hät outomatesch Täx jetuusch: %s',
    'lt':u'robotas: Automatinis teksto keitimas %s',
    'nds':u'Bot: Text automaatsch utwesselt: %s',
    'nds-nl':u'Bot: autematisch tekse vervungen %s',
    'nl':u'Bot: automatisch tekst vervangen %s',
    'nn':u'robot: automatisk teksterstatning: %s',
    'no':u'robot: automatisk teksterstatning: %s',
    'pl':u'Robot automatycznie zamienia tekst %s',
    'pt':u'Bot: Mudança automática %s',
    # BUG FIX: the 'ru' entry was missing '%s'; 'msg % change' would have
    # raised "not all arguments converted during string formatting".
    'ru':u'Робот: Автоматизированная замена текста %s',
    'sr':u'Бот: Аутоматска замена текста %s',
    'sv':u'Bot: Automatisk textersättning: %s',
    'zh': u'機器人:執行文字代換作業 %s',
}
+
+
class XmlDumpReplacePageGenerator:
    """
    Iterator that will yield Pages that might contain text to replace.

    These pages will be retrieved from a local XML dump file.
    Arguments:
        * xmlFilename  - The dump's path, either absolute or relative
        * xmlStart     - Skip all articles in the dump before this one
        * replacements - A list of 2-tuples of original text (as a
                         compiled regular expression) and replacement
                         text (as a string).
        * exceptions   - A dictionary which defines when to ignore an
                         occurrence. See docu of the ReplaceRobot
                         constructor below.

    """
    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        # When an xmlStart title is given, skip dump entries until we see it.
        self.skipping = bool(xmlStart)

        # Pre-compute the combined list of "inside" exceptions once; it is
        # passed to wikipedia.replaceExcept() for every dump entry.
        # ('in' replaces dict.has_key(), which is removed in Python 3.)
        self.excsInside = []
        if 'inside-tags' in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if 'inside' in self.exceptions:
            self.excsInside += self.exceptions['inside']
        import xmlreader
        self.site = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        """Yield a wikipedia.Page for each dump entry whose text would change."""
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if not self.isTitleExcepted(entry.title) \
                        and not self.isTextExcepted(entry.text):
                    new_text = entry.text
                    for old, new in self.replacements:
                        new_text = wikipedia.replaceExcept(
                            new_text, old, new, self.excsInside, self.site)
                    if new_text != entry.text:
                        yield wikipedia.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    wikipedia.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                # Interrupted before the first entry was read; nothing to
                # report.
                pass

    def isTitleExcepted(self, title):
        """Return True iff the title matches a 'title' exception or fails
        one of the 'require-title' patterns."""
        if 'title' in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        if 'require-title' in self.exceptions:
            for req in self.exceptions['require-title']:
                if not req.search(title):  # if not all requirements are met:
                    return True
        return False

    def isTextExcepted(self, text):
        """Return True iff the page text matches a 'text-contains' exception."""
        if 'text-contains' in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(text):
                    return True
        return False
+
+
+class ReplaceRobot:
+ """
+ A bot that can do text replacements.
+ """
+ def __init__(self, generator, replacements, exceptions={},
+ acceptall=False, allowoverlap=False, recursive=False,
+ addedCat=None, sleep=None):
+ """
+ Arguments:
+ * generator - A generator that yields Page objects.
+ * replacements - A list of 2-tuples of original text (as a
+ compiled regular expression) and replacement
+ text (as a string).
+ * exceptions - A dictionary which defines when not to change an
+ occurence. See below.
+ * acceptall - If True, the user won't be prompted before changes
+ are made.
+ * allowoverlap - If True, when matches overlap, all of them are
+ replaced.
+ * addedCat - If set to a value, add this category to every page
+ touched.
+
+ Structure of the exceptions dictionary:
+ This dictionary can have these keys:
+
+ title
+ A list of regular expressions. All pages with titles that
+ are matched by one of these regular expressions are skipped.
+ text-contains
+ A list of regular expressions. All pages with text that
+ contains a part which is matched by one of these regular
+ expressions are skipped.
+ inside
+ A list of regular expressions. All occurences are skipped which
+ lie within a text region which is matched by one of these
+ regular expressions.
+ inside-tags
+ A list of strings. These strings must be keys from the
+ exceptionRegexes dictionary in wikipedia.replaceExcept().
+
+ """
+ self.generator = generator
+ self.replacements = replacements
+ self.exceptions = exceptions
+ self.acceptall = acceptall
+ self.allowoverlap = allowoverlap
+ self.recursive = recursive
+ if addedCat:
+ site = wikipedia.getSite()
+ cat_ns = site.category_namespaces()[0]
+ self.addedCat = wikipedia.Page(site,
+ cat_ns + ':' + addedCat)
+ self.sleep = sleep
+
+ def isTitleExcepted(self, title):
+ """
+ Iff one of the exceptions applies for the given title, returns True.
+ """
+ if self.exceptions.has_key('title'):
+ for exc in self.exceptions['title']:
+ if exc.search(title):
+ return True
+ if self.exceptions.has_key('require-title'):
+ for req in self.exceptions['require-title']:
+ if not req.search(title):
+ return True
+ return False
+
+ def isTextExcepted(self, original_text):
+ """
+ Iff one of the exceptions applies for the given page contents,
+ returns True.
+ """
+ if self.exceptions.has_key('text-contains'):
+ for exc in self.exceptions['text-contains']:
+ if exc.search(original_text):
+ return True
+ return False
+
+ def doReplacements(self, original_text):
+ """
+ Returns the text which is generated by applying all replacements to
+ the given text.
+ """
+ new_text = original_text
+ exceptions = []
+ if self.exceptions.has_key('inside-tags'):
+ exceptions += self.exceptions['inside-tags']
+ if self.exceptions.has_key('inside'):
+ exceptions += self.exceptions['inside']
+ for old, new in self.replacements:
+ if self.sleep != None:
+ time.sleep(self.sleep)
+ new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
+ allowoverlap=self.allowoverlap)
+ return new_text
+
+ def run(self):
+ """
+ Starts the robot.
+ """
+ # Run the generator which will yield Pages which might need to be
+ # changed.
+ for page in self.generator:
+ if self.isTitleExcepted(page.title()):
+ wikipedia.output(
+ u'Skipping %s because the title is on the exceptions list.'
+ % page.aslink())
+ continue
+ try:
+ # Load the page's text from the wiki
+ original_text = page.get(get_redirect=True)
+ if not page.canBeEdited():
+ wikipedia.output(u"You can't edit page %s"
+ % page.aslink())
+ continue
+ except wikipedia.NoPage:
+ wikipedia.output(u'Page %s not found' % page.aslink())
+ continue
+ new_text = original_text
+ while True:
+ if self.isTextExcepted(new_text):
+ wikipedia.output(
+ u'Skipping %s because it contains text that is on the exceptions list.'
+ % page.aslink())
+ break
+ new_text = self.doReplacements(new_text)
+ if new_text == original_text:
+ wikipedia.output('No changes were necessary in %s'
+ % page.aslink())
+ break
+ if self.recursive:
+ newest_text = self.doReplacements(new_text)
+ while (newest_text!=new_text):
+ new_text = newest_text
+ newest_text = self.doReplacements(new_text)
+ if hasattr(self, "addedCat"):
+ cats = page.categories(nofollow_redirects=True)
+ if self.addedCat not in cats:
+ cats.append(self.addedCat)
+ new_text = wikipedia.replaceCategoryLinks(new_text,
+ cats)
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default}
<<<"
+ % page.title())
+ wikipedia.showDiff(original_text, new_text)
+ if self.acceptall:
+ break
+ choice = wikipedia.inputChoice(
+ u'Do you want to accept these changes?',
+ ['Yes', 'No', 'Edit', 'open in
Browser', 'All', "Quit"],
+ ['y', 'N', 'e', 'b',
'a', 'q'], 'N')
+ if choice == 'e':
+ editor = editarticle.TextEditor()
+ as_edited = editor.edit(original_text)
+ # if user didn't press Cancel
+ if as_edited and as_edited != new_text:
+ new_text = as_edited
+ continue
+ if choice == 'b':
+ webbrowser.open("http://%s%s" % (
+ page.site().hostname(),
+ page.site().nice_get_address(page.title())
+ ))
+ wikipedia.input("Press Enter when finished in browser.")
+ original_text = page.get(get_redirect=True, force=True)
+ new_text = original_text
+ continue
+ if choice == 'q':
+ return
+ if choice == 'a':
+ self.acceptall = True
+ if choice == 'y':
+ page.put_async(new_text)
+ # choice must be 'N'
+ break
+ if self.acceptall and new_text != original_text:
+ try:
+ page.put(new_text)
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict'
+ % (page.title(),))
+ except wikipedia.SpamfilterError, e:
+ wikipedia.output(
+ u'Cannot change %s because of blacklist entry %s'
+ % (page.title(), e.url))
+ except wikipedia.PageNotSaved, error:
+ wikipedia.output(u'Error putting page: %s'
+ % (error.args,))
+ except wikipedia.LockedPage:
+ wikipedia.output(u'Skipping %s (locked page)'
+ % (page.title(),))
+
def prepareRegexForMySQL(pattern):
    """Convert a Python regex pattern for use inside a MySQL RLIKE literal.

    Replaces the Python shorthands \\s, \\d and \\w with the POSIX classes
    [:space:], [:digit:] and [:alnum:], and backslash-escapes single quotes
    so the result can be embedded in a single-quoted SQL string.

    NOTE(review): MySQL only recognizes [:space:] etc. inside a bracket
    expression ([[:space:]]); the bare substitution mirrors the original
    behavior — confirm against the queries actually issued.
    """
    # Raw strings make the backslash sequences explicit (non-raw '\s' etc.
    # are invalid escape sequences in modern Python).
    pattern = pattern.replace(r'\s', '[:space:]')
    pattern = pattern.replace(r'\d', '[:digit:]')
    pattern = pattern.replace(r'\w', '[:alnum:]')
    # Escape single quotes for embedding in a single-quoted SQL literal.
    pattern = pattern.replace("'", "\\'")
    return pattern
+
+
+def main():
+ add_cat = None
+ gen = None
+ # summary message
+ summary_commandline = None
+ # Array which will collect commandline parameters.
+ # First element is original text, second element is replacement text.
+ commandline_replacements = []
+ # A list of 2-tuples of original text and replacement text.
+ replacements = []
+ # Don't edit pages which contain certain texts.
+ exceptions = {
+ 'title': [],
+ 'text-contains': [],
+ 'inside': [],
+ 'inside-tags': [],
+ 'require-title': [], # using a seperate requirements dict needs some
+ } # major refactoring of code.
+
+ # Should the elements of 'replacements' and 'exceptions' be
interpreted
+ # as regular expressions?
+ regex = False
+ # Predefined fixes from dictionary 'fixes' (see above).
+ fix = None
+ # the dump's path, either absolute or relative, which will be used
+ # if -xml flag is present
+ xmlFilename = None
+ useSql = False
+ PageTitles = []
+ # will become True when the user presses a ('yes to all') or uses the
+ # -always flag.
+ acceptall = False
+ # Will become True if the user inputs the commandline parameter -nocase
+ caseInsensitive = False
+ # Which namespaces should be processed?
+ # default to [] which means all namespaces will be processed
+ namespaces = []
+ # Do all hits when they overlap
+ allowoverlap = False
+ # Do not recurse replacement
+ recursive = False
+ # This factory is responsible for processing command line arguments
+ # that are also used by other scripts and that determine on which pages
+ # to work on.
+ genFactory = pagegenerators.GeneratorFactory()
+ # Load default summary message.
+ # BUG WARNING: This is probably incompatible with the -lang parameter.
+ wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))
+ # Between a regex and another (using -fix) sleep some time (not to waste
+ # too much CPU
+ sleep = None
+
+ # Read commandline parameters.
+ for arg in wikipedia.handleArgs():
+ if arg == '-regex':
+ regex = True
+ elif arg.startswith('-xmlstart'):
+ if len(arg) == 9:
+ xmlStart = wikipedia.input(
+ u'Please enter the dumped article to start with:')
+ else:
+ xmlStart = arg[10:]
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = wikipedia.input(
+ u'Please enter the XML dump\'s filename:')
+ else:
+ xmlFilename = arg[5:]
+ elif arg =='-sql':
+ useSql = True
+ elif arg.startswith('-page'):
+ if len(arg) == 5:
+ PageTitles.append(wikipedia.input(
+ u'Which page do you want to change?'))
+ else:
+ PageTitles.append(arg[6:])
+ elif arg.startswith('-excepttitle:'):
+ exceptions['title'].append(arg[13:])
+ elif arg.startswith('-requiretitle:'):
+ exceptions['require-title'].append(arg[14:])
+ elif arg.startswith('-excepttext:'):
+ exceptions['text-contains'].append(arg[12:])
+ elif arg.startswith('-exceptinside:'):
+ exceptions['inside'].append(arg[14:])
+ elif arg.startswith('-exceptinsidetag:'):
+ exceptions['inside-tags'].append(arg[17:])
+ elif arg.startswith('-fix:'):
+ fix = arg[5:]
+ elif arg.startswith('-sleep:'):
+ sleep = float(arg[7:])
+ elif arg == '-always':
+ acceptall = True
+ elif arg == '-recursive':
+ recursive = True
+ elif arg == '-nocase':
+ caseInsensitive = True
+ elif arg.startswith('-addcat:'):
+ add_cat = arg[8:]
+ elif arg.startswith('-namespace:'):
+ try:
+ namespaces.append(int(arg[11:]))
+ except ValueError:
+ namespaces.append(arg[11:])
+ elif arg.startswith('-summary:'):
+ wikipedia.setAction(arg[9:])
+ summary_commandline = True
+ elif arg.startswith('-allowoverlap'):
+ allowoverlap = True
+ else:
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ else:
+ commandline_replacements.append(arg)
+
+ if (len(commandline_replacements) % 2):
+ raise wikipedia.Error, 'require even number of replacements.'
+ elif (len(commandline_replacements) == 2 and fix == None):
+ replacements.append((commandline_replacements[0],
+ commandline_replacements[1]))
+ if summary_commandline == None:
+ wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg )
+ % (' (-' + commandline_replacements[0] + '
+'
+ + commandline_replacements[1] + ')'))
+ elif (len(commandline_replacements) > 1):
+ if (fix == None):
+ for i in xrange (0, len(commandline_replacements), 2):
+ replacements.append((commandline_replacements[i],
+ commandline_replacements[i + 1]))
+ if summary_commandline == None:
+ pairs = [( commandline_replacements[i],
+ commandline_replacements[i + 1] )
+ for i in range(0, len(commandline_replacements), 2)]
+ replacementsDescription = '(%s)' % ', '.join(
+ [('-' + pair[0] + ' +' + pair[1]) for pair in
pairs])
+ wikipedia.setAction(
+ wikipedia.translate(wikipedia.getSite(), msg )
+ % replacementsDescription)
+ else:
+ raise wikipedia.Error(
+ 'Specifying -fix with replacements is undefined')
+ elif fix == None:
+ old = wikipedia.input(u'Please enter the text that should be replaced:')
+ new = wikipedia.input(u'Please enter the new text:')
+ change = '(-' + old + ' +' + new
+ replacements.append((old, new))
+ while True:
+ old = wikipedia.input(
+u'Please enter another text that should be replaced, or press Enter to start:')
+ if old == '':
+ change = change + ')'
+ break
+ new = wikipedia.input(u'Please enter the new text:')
+ change = change + ' & -' + old + ' +' + new
+ replacements.append((old, new))
+ if not summary_commandline == True:
+ default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) %
change
+ wikipedia.output(u'The summary message will default to: %s'
+ % default_summary_message)
+ summary_message = wikipedia.input(
+u'Press Enter to use this default message, or enter a description of the\nchanges
your bot will make:')
+ if summary_message == '':
+ summary_message = default_summary_message
+ wikipedia.setAction(summary_message)
+
+ else:
+ # Perform one of the predefined actions.
+ try:
+ fix = fixes.fixes[fix]
+ except KeyError:
+ wikipedia.output(u'Available predefined fixes are: %s'
+ % fixes.fixes.keys())
+ return
+ if fix.has_key('regex'):
+ regex = fix['regex']
+ if fix.has_key('msg'):
+ wikipedia.setAction(
+ wikipedia.translate(wikipedia.getSite(), fix['msg']))
+ if fix.has_key('exceptions'):
+ exceptions = fix['exceptions']
+ replacements = fix['replacements']
+
+ # already compile all regular expressions here to save time later
+ for i in range(len(replacements)):
+ old, new = replacements[i]
+ if not regex:
+ old = re.escape(old)
+ if caseInsensitive:
+ oldR = re.compile(old, re.UNICODE | re.IGNORECASE)
+ else:
+ oldR = re.compile(old, re.UNICODE)
+ replacements[i] = oldR, new
+
+ for exceptionCategory in ['title', 'require-title',
'text-contains', 'inside']:
+ if exceptions.has_key(exceptionCategory):
+ patterns = exceptions[exceptionCategory]
+ if not regex:
+ patterns = [re.escape(pattern) for pattern in patterns]
+ if caseInsensitive:
+ patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE)
+ for pattern in patterns]
+ else:
+ patterns = [re.compile(pattern, re.UNICODE)
+ for pattern in patterns]
+ exceptions[exceptionCategory] = patterns
+
+ if xmlFilename:
+ try:
+ xmlStart
+ except NameError:
+ xmlStart = None
+ gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
+ replacements, exceptions)
+ elif useSql:
+ whereClause = 'WHERE (%s)' % ' OR '.join(
+ ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
+ for (old, new) in replacements])
+ if exceptions:
+ exceptClause = 'AND NOT (%s)' % ' OR '.join(
+ ["old_text RLIKE '%s'" %
prepareRegexForMySQL(exc.pattern)
+ for exc in exceptions])
+ else:
+ exceptClause = ''
+ query = u"""
+SELECT page_namespace, page_title
+FROM page
+JOIN text ON (page_id = old_id)
+%s
+%s
+LIMIT 200""" % (whereClause, exceptClause)
+ gen = pagegenerators.MySQLPageGenerator(query)
+
+ elif PageTitles:
+ pages = [wikipedia.Page(wikipedia.getSite(), PageTitle)
+ for PageTitle in PageTitles]
+ gen = iter(pages)
+
+ if not gen:
+ # syntax error, show help text from the top of this file
+ wikipedia.showHelp('replace')
+ return
+ if namespaces != []:
+ gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
+ if xmlFilename:
+ # XML parsing can be quite slow, so use smaller batches and
+ # longer lookahead.
+ preloadingGen = pagegenerators.PreloadingGenerator(gen,
+ pageNumber=20, lookahead=100)
+ else:
+ preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
+ bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap,
recursive, add_cat, sleep)
+ bot.run()
+
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always shut the framework down (wikipedia.stopme() is the
        # pywikipedia cleanup hook), even if main() raised or was aborted.
        wikipedia.stopme()