Revision: 6186 Author: russblau Date: 2008-12-22 17:05:56 +0000 (Mon, 22 Dec 2008)
Log Message: ----------- Branching replace.py for port to rewrite
Added Paths: ----------- branches/rewrite/pywikibot/scripts/replace.py
Copied: branches/rewrite/pywikibot/scripts/replace.py (from rev 6185, trunk/pywikipedia/replace.py) =================================================================== --- branches/rewrite/pywikibot/scripts/replace.py (rev 0) +++ branches/rewrite/pywikibot/scripts/replace.py 2008-12-22 17:05:56 UTC (rev 6186) @@ -0,0 +1,710 @@ +# -*- coding: utf-8 -*- +""" +This bot will make direct text replacements. It will retrieve information on +which pages might need changes either from an XML dump or a text file, or only +change a single page. + +These command line parameters can be used to specify which pages to work on: + +¶ms; + +-xml Retrieve information from a local XML dump (pages-articles + or pages-meta-current, see http://download.wikimedia.org). + Argument can also be given as "-xml:filename". + +-page Only edit a specific page. + Argument can also be given as "-page:pagetitle". You can + give this parameter multiple times to edit multiple pages. + +Furthermore, the following command line parameters are supported: + +-regex Make replacements using regular expressions. If this argument + isn't given, the bot will make simple text replacements. + +-nocase Use case insensitive regular expressions. + +-xmlstart (Only works with -xml) Skip all articles in the XML dump + before the one specified (may also be given as + -xmlstart:Article). + +-addcat:cat_name Adds "cat_name" category to every altered page. + +-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex + argument is given, XYZ will be regarded as a regular + expression. + +-requiretitle:XYZ Only do pages with titles that contain XYZ. If the -regex + argument is given, XYZ will be regarded as a regular + expression. + +-excepttext:XYZ Skip pages which contain the text XYZ. If the -regex + argument is given, XYZ will be regarded as a regular + expression. + +-exceptinside:XYZ Skip occurences of the to-be-replaced text which lie + within XYZ. If the -regex argument is given, XYZ will be + regarded as a regular expression. + +-exceptinsidetag:XYZ Skip occurences of the to-be-replaced text which lie + within an XYZ tag. + +-summary:XYZ Set the summary message text for the edit to XYZ, bypassing + the predefined message texts with original and replacements + inserted. + +-sleep:123 If you use -fix you can check multiple regex at the same time + in every page. This can lead to a great waste of CPU because + the bot will check every regex without waiting using all the + resources. This will slow it down between a regex and another + in order not to waste too much CPU. + +-fix:XYZ Perform one of the predefined replacements tasks, which are + given in the dictionary 'fixes' defined inside the file + fixes.py. + The -regex and -nocase argument and given replacements will + be ignored if you use -fix. + Currently available predefined fixes are: +&fixes-help; + +-namespace:n Number or name of namespace to process. The parameter can be + used multiple times. It works in combination with all other + parameters, except for the -start parameter. If you e.g. + want to iterate over all categories starting at M, use + -start:Category:M. + +-always Don't prompt you for each replacement + +-recursive Recurse replacement as long as possible. Be careful, this + might lead to an infinite loop. + +-allowoverlap When occurences of the pattern overlap, replace all of them. + Be careful, this might lead to an infinite loop. + +other: First argument is the old text, second argument is the new + text. If the -regex argument is given, the first argument + will be regarded as a regular expression, and the second + argument might contain expressions like \1 or \g<name>. + +Examples: + +If you want to change templates from the old syntax, e.g. {{msg:Stub}}, to the +new syntax, e.g. {{Stub}}, download an XML dump file (pages-articles) from +http://download.wikimedia.org, then use this command: + + python replace.py -xml -regex "{{msg:(.*?)}}" "{{\1}}" + +If you have a dump called foobar.xml and want to fix typos in articles, e.g. +Errror -> Error, use this: + + python replace.py -xml:foobar.xml "Errror" "Error" -namespace:0 + +If you have a page called 'John Doe' and want to fix the format of ISBNs, use: + + python replace.py -page:John_Doe -fix:isbn + +This command will change 'referer' to 'referrer', but not in pages which +talk about HTTP, where the typo has become part of the standard: + + python replace.py referer referrer -file:typos.txt -excepttext:HTTP +""" +# +# (C) Daniel Herding & the Pywikipediabot Team, 2004-2008 +# +# Distributed under the terms of the MIT license. +# + +from __future__ import generators +import sys, re, time +import wikipedia, pagegenerators, catlib, config +import editarticle +import webbrowser + +# Imports predefined replacements tasks from fixes.py +import fixes + +# This is required for the text that is shown when you run this script +# with the parameter -help. +docuReplacements = { + '¶ms;': pagegenerators.parameterHelp, + '&fixes-help;': fixes.help, +} + +__version__='$Id$' + +# Summary messages in different languages +# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes' +# below.`v +msg = { + 'ar':u'%s روبوت : استبدال تلقائي للنص', + 'cs':u'Robot automaticky nahradil text: %s', + 'de':u'Bot: Automatisierte Textersetzung %s', + 'el':u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s', + 'en':u'Robot: Automated text replacement %s', + 'es':u'Robot: Reemplazo automático de texto %s', + 'fa':u'ربات: تغییر خودکار متن %s', + 'fr':u'Bot : Remplacement de texte automatisé %s', + 'he':u'בוט: החלפת טקסט אוטומטית %s', + 'hu':u'Robot: Automatikus szövegcsere %s', + 'ia':u'Robot: Reimplaciamento automatic de texto %s', + 'id':u'Bot: Penggantian teks otomatis %s', + 'is':u'Vélmenni: breyti texta %s', + 'it':u'Bot: Sostituzione automatica %s', + 'ja':u'ロボットによる: 文字置き換え %s', + 'ka':u'რობოტი: ტექსტის ავტომატური შეცვლა %s', + 'kk':u'Бот: Мәтінді өздікті алмастырды: %s', + 'ksh':u'Bot: hät outomatesch Täx jetuusch: %s', + 'lt':u'robotas: Automatinis teksto keitimas %s', + 'nds':u'Bot: Text automaatsch utwesselt: %s', + 'nds-nl':u'Bot: autematisch tekse vervungen %s', + 'nl':u'Bot: automatisch tekst vervangen %s', + 'nn':u'robot: automatisk teksterstatning: %s', + 'no':u'robot: automatisk teksterstatning: %s', + 'pl':u'Robot automatycznie zamienia tekst %s', + 'pt':u'Bot: Mudança automática %s', + 'ru':u'Робот: Автоматизированная замена текста', + 'sr':u'Бот: Аутоматска замена текста %s', + 'sv':u'Bot: Automatisk textersättning: %s', + 'zh': u'機器人:執行文字代換作業 %s', + } + + +class XmlDumpReplacePageGenerator: + """ + Iterator that will yield Pages that might contain text to replace. + + These pages will be retrieved from a local XML dump file. + Arguments: + * xmlFilename - The dump's path, either absolute or relative + * xmlStart - Skip all articles in the dump before this one + * replacements - A list of 2-tuples of original text (as a + compiled regular expression) and replacement + text (as a string). + * exceptions - A dictionary which defines when to ignore an + occurence. See docu of the ReplaceRobot + constructor below. + + """ + def __init__(self, xmlFilename, xmlStart, replacements, exceptions): + self.xmlFilename = xmlFilename + self.replacements = replacements + self.exceptions = exceptions + self.xmlStart = xmlStart + self.skipping = bool(xmlStart) + + self.excsInside = [] + if self.exceptions.has_key('inside-tags'): + self.excsInside += self.exceptions['inside-tags'] + if self.exceptions.has_key('inside'): + self.excsInside += self.exceptions['inside'] + import xmlreader + self.site = wikipedia.getSite() + dump = xmlreader.XmlDump(self.xmlFilename) + self.parser = dump.parse() + + def __iter__(self): + try: + for entry in self.parser: + if self.skipping: + if entry.title != self.xmlStart: + continue + self.skipping = False + if not self.isTitleExcepted(entry.title) \ + and not self.isTextExcepted(entry.text): + new_text = entry.text + for old, new in self.replacements: + new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside, self.site) + if new_text != entry.text: + yield wikipedia.Page(self.site, entry.title) + except KeyboardInterrupt: + try: + if not self.skipping: + wikipedia.output( + u'To resume, use "-xmlstart:%s" on the command line.' + % entry.title) + except NameError: + pass + + def isTitleExcepted(self, title): + if self.exceptions.has_key('title'): + for exc in self.exceptions['title']: + if exc.search(title): + return True + if self.exceptions.has_key('require-title'): + for req in self.exceptions['require-title']: + if not req.search(title): # if not all requirements are met: + return True + + return False + + def isTextExcepted(self, text): + if self.exceptions.has_key('text-contains'): + for exc in self.exceptions['text-contains']: + if exc.search(text): + return True + return False + + +class ReplaceRobot: + """ + A bot that can do text replacements. + """ + def __init__(self, generator, replacements, exceptions={}, + acceptall=False, allowoverlap=False, recursive=False, + addedCat=None, sleep=None): + """ + Arguments: + * generator - A generator that yields Page objects. + * replacements - A list of 2-tuples of original text (as a + compiled regular expression) and replacement + text (as a string). + * exceptions - A dictionary which defines when not to change an + occurence. See below. + * acceptall - If True, the user won't be prompted before changes + are made. + * allowoverlap - If True, when matches overlap, all of them are + replaced. + * addedCat - If set to a value, add this category to every page + touched. + + Structure of the exceptions dictionary: + This dictionary can have these keys: + + title + A list of regular expressions. All pages with titles that + are matched by one of these regular expressions are skipped. + text-contains + A list of regular expressions. All pages with text that + contains a part which is matched by one of these regular + expressions are skipped. + inside + A list of regular expressions. All occurences are skipped which + lie within a text region which is matched by one of these + regular expressions. + inside-tags + A list of strings. These strings must be keys from the + exceptionRegexes dictionary in wikipedia.replaceExcept(). + + """ + self.generator = generator + self.replacements = replacements + self.exceptions = exceptions + self.acceptall = acceptall + self.allowoverlap = allowoverlap + self.recursive = recursive + if addedCat: + site = wikipedia.getSite() + cat_ns = site.category_namespaces()[0] + self.addedCat = wikipedia.Page(site, + cat_ns + ':' + addedCat) + self.sleep = sleep + + def isTitleExcepted(self, title): + """ + Iff one of the exceptions applies for the given title, returns True. + """ + if self.exceptions.has_key('title'): + for exc in self.exceptions['title']: + if exc.search(title): + return True + if self.exceptions.has_key('require-title'): + for req in self.exceptions['require-title']: + if not req.search(title): + return True + return False + + def isTextExcepted(self, original_text): + """ + Iff one of the exceptions applies for the given page contents, + returns True. + """ + if self.exceptions.has_key('text-contains'): + for exc in self.exceptions['text-contains']: + if exc.search(original_text): + return True + return False + + def doReplacements(self, original_text): + """ + Returns the text which is generated by applying all replacements to + the given text. + """ + new_text = original_text + exceptions = [] + if self.exceptions.has_key('inside-tags'): + exceptions += self.exceptions['inside-tags'] + if self.exceptions.has_key('inside'): + exceptions += self.exceptions['inside'] + for old, new in self.replacements: + if self.sleep != None: + time.sleep(self.sleep) + new_text = wikipedia.replaceExcept(new_text, old, new, exceptions, + allowoverlap=self.allowoverlap) + return new_text + + def run(self): + """ + Starts the robot. + """ + # Run the generator which will yield Pages which might need to be + # changed. + for page in self.generator: + if self.isTitleExcepted(page.title()): + wikipedia.output( + u'Skipping %s because the title is on the exceptions list.' + % page.aslink()) + continue + try: + # Load the page's text from the wiki + original_text = page.get(get_redirect=True) + if not page.canBeEdited(): + wikipedia.output(u"You can't edit page %s" + % page.aslink()) + continue + except wikipedia.NoPage: + wikipedia.output(u'Page %s not found' % page.aslink()) + continue + new_text = original_text + while True: + if self.isTextExcepted(new_text): + wikipedia.output( + u'Skipping %s because it contains text that is on the exceptions list.' + % page.aslink()) + break + new_text = self.doReplacements(new_text) + if new_text == original_text: + wikipedia.output('No changes were necessary in %s' + % page.aslink()) + break + if self.recursive: + newest_text = self.doReplacements(new_text) + while (newest_text!=new_text): + new_text = newest_text + newest_text = self.doReplacements(new_text) + if hasattr(self, "addedCat"): + cats = page.categories(nofollow_redirects=True) + if self.addedCat not in cats: + cats.append(self.addedCat) + new_text = wikipedia.replaceCategoryLinks(new_text, + cats) + # Show the title of the page we're working on. + # Highlight the title in purple. + wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % page.title()) + wikipedia.showDiff(original_text, new_text) + if self.acceptall: + break + choice = wikipedia.inputChoice( + u'Do you want to accept these changes?', + ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"], + ['y', 'N', 'e', 'b', 'a', 'q'], 'N') + if choice == 'e': + editor = editarticle.TextEditor() + as_edited = editor.edit(original_text) + # if user didn't press Cancel + if as_edited and as_edited != new_text: + new_text = as_edited + continue + if choice == 'b': + webbrowser.open("http://%s%s" % ( + page.site().hostname(), + page.site().nice_get_address(page.title()) + )) + wikipedia.input("Press Enter when finished in browser.") + original_text = page.get(get_redirect=True, force=True) + new_text = original_text + continue + if choice == 'q': + return + if choice == 'a': + self.acceptall = True + if choice == 'y': + page.put_async(new_text) + # choice must be 'N' + break + if self.acceptall and new_text != original_text: + try: + page.put(new_text) + except wikipedia.EditConflict: + wikipedia.output(u'Skipping %s because of edit conflict' + % (page.title(),)) + except wikipedia.SpamfilterError, e: + wikipedia.output( + u'Cannot change %s because of blacklist entry %s' + % (page.title(), e.url)) + except wikipedia.PageNotSaved, error: + wikipedia.output(u'Error putting page: %s' + % (error.args,)) + except wikipedia.LockedPage: + wikipedia.output(u'Skipping %s (locked page)' + % (page.title(),)) + +def prepareRegexForMySQL(pattern): + pattern = pattern.replace('\s', '[:space:]') + pattern = pattern.replace('\d', '[:digit:]') + pattern = pattern.replace('\w', '[:alnum:]') + + pattern = pattern.replace("'", "\" + "'") + #pattern = pattern.replace('\', '\\') + #for char in ['[', ']', "'"]: + # pattern = pattern.replace(char, '%s' % char) + return pattern + + +def main(): + add_cat = None + gen = None + # summary message + summary_commandline = None + # Array which will collect commandline parameters. + # First element is original text, second element is replacement text. + commandline_replacements = [] + # A list of 2-tuples of original text and replacement text. + replacements = [] + # Don't edit pages which contain certain texts. + exceptions = { + 'title': [], + 'text-contains': [], + 'inside': [], + 'inside-tags': [], + 'require-title': [], # using a seperate requirements dict needs some + } # major refactoring of code. + + # Should the elements of 'replacements' and 'exceptions' be interpreted + # as regular expressions? + regex = False + # Predefined fixes from dictionary 'fixes' (see above). + fix = None + # the dump's path, either absolute or relative, which will be used + # if -xml flag is present + xmlFilename = None + useSql = False + PageTitles = [] + # will become True when the user presses a ('yes to all') or uses the + # -always flag. + acceptall = False + # Will become True if the user inputs the commandline parameter -nocase + caseInsensitive = False + # Which namespaces should be processed? + # default to [] which means all namespaces will be processed + namespaces = [] + # Do all hits when they overlap + allowoverlap = False + # Do not recurse replacement + recursive = False + # This factory is responsible for processing command line arguments + # that are also used by other scripts and that determine on which pages + # to work on. + genFactory = pagegenerators.GeneratorFactory() + # Load default summary message. + # BUG WARNING: This is probably incompatible with the -lang parameter. + wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg)) + # Between a regex and another (using -fix) sleep some time (not to waste + # too much CPU + sleep = None + + # Read commandline parameters. + for arg in wikipedia.handleArgs(): + if arg == '-regex': + regex = True + elif arg.startswith('-xmlstart'): + if len(arg) == 9: + xmlStart = wikipedia.input( + u'Please enter the dumped article to start with:') + else: + xmlStart = arg[10:] + elif arg.startswith('-xml'): + if len(arg) == 4: + xmlFilename = wikipedia.input( + u'Please enter the XML dump's filename:') + else: + xmlFilename = arg[5:] + elif arg =='-sql': + useSql = True + elif arg.startswith('-page'): + if len(arg) == 5: + PageTitles.append(wikipedia.input( + u'Which page do you want to change?')) + else: + PageTitles.append(arg[6:]) + elif arg.startswith('-excepttitle:'): + exceptions['title'].append(arg[13:]) + elif arg.startswith('-requiretitle:'): + exceptions['require-title'].append(arg[14:]) + elif arg.startswith('-excepttext:'): + exceptions['text-contains'].append(arg[12:]) + elif arg.startswith('-exceptinside:'): + exceptions['inside'].append(arg[14:]) + elif arg.startswith('-exceptinsidetag:'): + exceptions['inside-tags'].append(arg[17:]) + elif arg.startswith('-fix:'): + fix = arg[5:] + elif arg.startswith('-sleep:'): + sleep = float(arg[7:]) + elif arg == '-always': + acceptall = True + elif arg == '-recursive': + recursive = True + elif arg == '-nocase': + caseInsensitive = True + elif arg.startswith('-addcat:'): + add_cat = arg[8:] + elif arg.startswith('-namespace:'): + try: + namespaces.append(int(arg[11:])) + except ValueError: + namespaces.append(arg[11:]) + elif arg.startswith('-summary:'): + wikipedia.setAction(arg[9:]) + summary_commandline = True + elif arg.startswith('-allowoverlap'): + allowoverlap = True + else: + generator = genFactory.handleArg(arg) + if generator: + gen = generator + else: + commandline_replacements.append(arg) + + if (len(commandline_replacements) % 2): + raise wikipedia.Error, 'require even number of replacements.' + elif (len(commandline_replacements) == 2 and fix == None): + replacements.append((commandline_replacements[0], + commandline_replacements[1])) + if summary_commandline == None: + wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg ) + % (' (-' + commandline_replacements[0] + ' +' + + commandline_replacements[1] + ')')) + elif (len(commandline_replacements) > 1): + if (fix == None): + for i in xrange (0, len(commandline_replacements), 2): + replacements.append((commandline_replacements[i], + commandline_replacements[i + 1])) + if summary_commandline == None: + pairs = [( commandline_replacements[i], + commandline_replacements[i + 1] ) + for i in range(0, len(commandline_replacements), 2)] + replacementsDescription = '(%s)' % ', '.join( + [('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) + wikipedia.setAction( + wikipedia.translate(wikipedia.getSite(), msg ) + % replacementsDescription) + else: + raise wikipedia.Error( + 'Specifying -fix with replacements is undefined') + elif fix == None: + old = wikipedia.input(u'Please enter the text that should be replaced:') + new = wikipedia.input(u'Please enter the new text:') + change = '(-' + old + ' +' + new + replacements.append((old, new)) + while True: + old = wikipedia.input( +u'Please enter another text that should be replaced, or press Enter to start:') + if old == '': + change = change + ')' + break + new = wikipedia.input(u'Please enter the new text:') + change = change + ' & -' + old + ' +' + new + replacements.append((old, new)) + if not summary_commandline == True: + default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change + wikipedia.output(u'The summary message will default to: %s' + % default_summary_message) + summary_message = wikipedia.input( +u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:') + if summary_message == '': + summary_message = default_summary_message + wikipedia.setAction(summary_message) + + else: + # Perform one of the predefined actions. + try: + fix = fixes.fixes[fix] + except KeyError: + wikipedia.output(u'Available predefined fixes are: %s' + % fixes.fixes.keys()) + return + if fix.has_key('regex'): + regex = fix['regex'] + if fix.has_key('msg'): + wikipedia.setAction( + wikipedia.translate(wikipedia.getSite(), fix['msg'])) + if fix.has_key('exceptions'): + exceptions = fix['exceptions'] + replacements = fix['replacements'] + + # already compile all regular expressions here to save time later + for i in range(len(replacements)): + old, new = replacements[i] + if not regex: + old = re.escape(old) + if caseInsensitive: + oldR = re.compile(old, re.UNICODE | re.IGNORECASE) + else: + oldR = re.compile(old, re.UNICODE) + replacements[i] = oldR, new + + for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']: + if exceptions.has_key(exceptionCategory): + patterns = exceptions[exceptionCategory] + if not regex: + patterns = [re.escape(pattern) for pattern in patterns] + if caseInsensitive: + patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE) + for pattern in patterns] + else: + patterns = [re.compile(pattern, re.UNICODE) + for pattern in patterns] + exceptions[exceptionCategory] = patterns + + if xmlFilename: + try: + xmlStart + except NameError: + xmlStart = None + gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart, + replacements, exceptions) + elif useSql: + whereClause = 'WHERE (%s)' % ' OR '.join( + ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern) + for (old, new) in replacements]) + if exceptions: + exceptClause = 'AND NOT (%s)' % ' OR '.join( + ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern) + for exc in exceptions]) + else: + exceptClause = '' + query = u""" +SELECT page_namespace, page_title +FROM page +JOIN text ON (page_id = old_id) +%s +%s +LIMIT 200""" % (whereClause, exceptClause) + gen = pagegenerators.MySQLPageGenerator(query) + + elif PageTitles: + pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) + for PageTitle in PageTitles] + gen = iter(pages) + + if not gen: + # syntax error, show help text from the top of this file + wikipedia.showHelp('replace') + return + if namespaces != []: + gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) + if xmlFilename: + # XML parsing can be quite slow, so use smaller batches and + # longer lookahead. + preloadingGen = pagegenerators.PreloadingGenerator(gen, + pageNumber=20, lookahead=100) + else: + preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60) + bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep) + bot.run() + +if __name__ == "__main__": + try: + main() + finally: + wikipedia.stopme()