Revision: 8700
Author: xqt
Date: 2010-11-06 02:18:01 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
saves the titles to disk (patch for bug #3093682 submitted by Bináris) Thanks!
-- Introduced two new command line arguments, -save and -savenew, and updated main() to process them
-- Updated ReplaceRobot class with two new functions, an edit counter and a new parameter
-- Extended help (description of the new feature and one more example)
-- Wrapped some lines to fit in 80 characters, because I know xqt likes that :)
Modified Paths: -------------- trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/replace.py =================================================================== --- trunk/pywikipedia/replace.py 2010-11-06 00:48:58 UTC (rev 8699) +++ trunk/pywikipedia/replace.py 2010-11-06 02:18:01 UTC (rev 8700) @@ -32,6 +32,17 @@ before the one specified (may also be given as -xmlstart:Article).
+-save Saves the titles of the articles to a file instead of + modifying the articles. This way you may collect titles to + work on in automatic mode, and process them later with + -file. Opens the file for appending if it already exists. + If you insert the contents of the file into a wikipage, it + will appear as a numbered list, and may be used with -links. + Argument may also be given as "-save:filename". + +-savenew Just like -save, except that it overwrites an existing file. + Argument may also be given as "-savenew:filename". + -addcat:cat_name Adds "cat_name" category to every altered page.
-excepttitle:XYZ Skip pages with titles that contain XYZ. If the -regex @@ -109,21 +120,30 @@
python replace.py -page:John_Doe -fix:isbn
+Let's suppose you want to change "color" to "colour" manually, but gathering +the articles is too slow, so you want to save the list while you are sleeping. +On Windows the leading "python" is not necessary. Use this: + + replace.py -xml -save:color.txt color colour -always +You may use color.txt later with -file, or with -links if you upload it to the wiki. + This command will change 'referer' to 'referrer', but not in pages which talk about HTTP, where the typo has become part of the standard:
python replace.py referer referrer -file:typos.txt -excepttext:HTTP + +Please type "replace.py | more" if you can't read the top of the help. """ from __future__ import generators # -# (C) Daniel Herding & the Pywikipedia team, 2004-2009 +# (C) Daniel Herding & the Pywikipedia team, 2004-2010 # __version__='$Id$' # # Distributed under the terms of the MIT license. #
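A minimal sketch (not part of the patch) of how a file produced by -save or
-savenew could be read back into plain titles: the patch below writes one
"#[[Title]]" line per page plus an HTML comment marker after every 100th
title, so anything that does not match the link pattern can simply be skipped.
The function name and filename are made up for the example.

    # Sketch only: parse a file written by replace.py -save/-savenew.
    import codecs
    import re

    def read_saved_titles(filename):
        """Return the page titles stored as "#[[Title]]" lines."""
        titles = []
        f = codecs.open(filename, encoding='utf-8', mode='r')
        try:
            for line in f:
                match = re.match(r'#\[\[(.+?)\]\]', line)
                if match:
                    titles.append(match.group(1))
        finally:
            f.close()
        return titles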
-import sys, re, time +import sys, re, time, codecs import wikipedia as pywikibot import pagegenerators import editarticle @@ -263,7 +283,7 @@ """ def __init__(self, generator, replacements, exceptions={}, acceptall=False, allowoverlap=False, recursive=False, - addedCat=None, sleep=None, editSummary=''): + addedCat=None, sleep=None, editSummary='', articles=None): """ Arguments: * generator - A generator that yields Page objects. @@ -278,6 +298,8 @@ replaced. * addedCat - If set to a value, add this category to every page touched. + * articles - An open file to save the page titles to. If None, + the bot edits the wiki directly (default).
Structure of the exceptions dictionary: This dictionary can have these keys: @@ -310,6 +332,11 @@ self.sleep = sleep # Some function to set default editSummary should probably be added self.editSummary = editSummary + self.articles = articles + + #An edit counter, used to split the file after every 100 titles if -save + #or -savenew is on, and to display the number of edited articles otherwise. + self.editcounter = 0
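For orientation, a minimal usage sketch of the extended constructor; the
generator, the file names and the import are illustrative assumptions, not
code from the patch (main() builds the real objects from the command line, as
shown further down):

    # Sketch only: collect titles into a file instead of editing the wiki.
    import codecs
    import pagegenerators
    from replace import ReplaceRobot  # assuming replace.py is importable

    gen = pagegenerators.TextfilePageGenerator('worklist.txt')
    titlefile = codecs.open('titles.txt', encoding='utf-8', mode='a')
    bot = ReplaceRobot(gen, [(u'color', u'colour')], acceptall=True,
                       articles=titlefile)  # articles=None would edit the wiki
    bot.run()
    titlefile.close()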
def isTitleExcepted(self, title): """ @@ -354,6 +381,31 @@ allowoverlap=self.allowoverlap) return new_text
+ def writeEditCounter(self): + """ At the end of our work this writes the counter. """ + if self.articles: + pywikibot.output(u'%d title%s saved.' + % (self.editcounter, + (lambda x: bool(x-1) and 's were' or ' was') + (self.editcounter))) + else: + pywikibot.output(u'%d page%s changed.' + % (self.editcounter, + (lambda x: bool(x-1) and 's were' or ' was') + (self.editcounter))) + + def splitLine(self): + """Returns a splitline after every 100th title. Splitline is in HTML + comment format in case we want to insert the list into a wikipage. + We use it to make the file more readable. + + """ + if self.editcounter % 100: + return '' + else: + return (u'<!-- ***** %dth title is above this line. ***** -->\n' % + self.editcounter) + def run(self): """ Starts the robot. @@ -408,7 +460,8 @@ break choice = pywikibot.inputChoice( u'Do you want to accept these changes?', - ['Yes', 'No', 'Edit', 'open in Browser', 'All', "Quit"], + ['Yes', 'No', 'Edit', 'open in Browser', 'All', + 'Quit'], ['y', 'N', 'e', 'b', 'a', 'q'], 'N') if choice == 'e': editor = editarticle.TextEditor() @@ -427,30 +480,57 @@ new_text = original_text continue if choice == 'q': + self.writeEditCounter() return if choice == 'a': self.acceptall = True if choice == 'y': - page.put_async(new_text, self.editSummary) + if not self.articles: + #Primary behaviour: working on wiki + page.put_async(new_text, self.editSummary) + self.editcounter += 1 + #Bug: this increments even if put_async fails + #This is kept in two separate if clauses for + #future purposes, to get feedback from put_async + else: + #Save the title for later processing instead of editing + self.editcounter += 1 + self.articles.write(u'#%s\n%s' + % (page.title(asLink=True), + self.splitLine())) + self.articles.flush() # For the peace of our soul :-) # choice must be 'N' break if self.acceptall and new_text != original_text: - try: - page.put(new_text, self.editSummary) - except pywikibot.EditConflict: - pywikibot.output(u'Skipping %s because of edit conflict' - % (page.title(),)) - except pywikibot.SpamfilterError, e: - pywikibot.output( - u'Cannot change %s because of blacklist entry %s' - % (page.title(), e.url)) - except pywikibot.PageNotSaved, error: - pywikibot.output(u'Error putting page: %s' - % (error.args,)) - except pywikibot.LockedPage: - pywikibot.output(u'Skipping %s (locked page)' - % (page.title(),)) + if not self.articles: + #Primary behaviour: working on wiki + try: + page.put(new_text, self.editSummary) + self.editcounter += 1 #increment only on success + except pywikibot.EditConflict: + pywikibot.output(u'Skipping %s because of edit conflict' + % (page.title(),)) + except pywikibot.SpamfilterError, e: + pywikibot.output( + u'Cannot change %s because of blacklist entry %s' + % (page.title(), e.url)) + except pywikibot.PageNotSaved, error: + pywikibot.output(u'Error putting page: %s' + % (error.args,)) + except pywikibot.LockedPage: + pywikibot.output(u'Skipping %s (locked page)' + % (page.title(),)) + else: + #Save the title for later processing instead of editing + self.editcounter += 1 + self.articles.write(u'#%s\n%s' + % (page.title(asLink=True), + self.splitLine())) + self.articles.flush()
+ #Finally: + self.writeEditCounter() + def prepareRegexForMySQL(pattern): pattern = pattern.replace('\s', '[:space:]') pattern = pattern.replace('\d', '[:digit:]') @@ -517,6 +597,11 @@ # Between a regex and another (using -fix) sleep some time (not to waste # too much CPU sleep = None + # Do not save the page titles; rather, work on the wiki + titlefile = None + filename = None + # If we save, the primary behaviour is append rather than a new file + append = True
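Looking back at the splitLine() helper introduced above: it returns an empty
string except after every 100th saved title, when it emits the HTML comment
marker. A tiny standalone demonstration of that logic (illustrative names,
not part of the patch):

    # Sketch only: the marker logic used by ReplaceRobot.splitLine().
    def split_line(editcounter):
        if editcounter % 100:
            return u''
        return (u'<!-- ***** %dth title is above this line. ***** -->\n'
                % editcounter)

    for counter in (1, 99, 100, 101, 200):
        print '%3d -> %r' % (counter, split_line(counter))
    # Only 100 and 200 produce the comment line; the others return u''.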
# Read commandline parameters. for arg in pywikibot.handleArgs(*args): @@ -539,9 +624,22 @@ elif arg.startswith('-page'): if len(arg) == 5: PageTitles.append(pywikibot.input( - u'Which page do you want to change?')) + u'Which page do you want to change?')) else: PageTitles.append(arg[6:]) + elif arg.startswith('-savenew'): + append = False + if len(arg) == 8: + filename = pywikibot.input( +u'Please enter the filename to save the titles \n(will be overwritten if it exists):') + else: + filename = arg[9:] + elif arg.startswith('-save'): + if len(arg) == 5: + filename = pywikibot.input( + u'Please enter the filename to save the titles:') + else: + filename = arg[6:] elif arg.startswith('-excepttitle:'): exceptions['title'].append(arg[13:]) elif arg.startswith('-requiretitle:'): @@ -585,8 +683,9 @@ replacements.append((commandline_replacements[0], commandline_replacements[1])) if not summary_commandline: - editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % (' (-' + commandline_replacements[0] + ' +' - + commandline_replacements[1] + ')') + editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % ( + ' (-%s +%s)' % (commandline_replacements[0], + commandline_replacements[1])) elif (len(commandline_replacements) > 1): if (fix is None): for i in xrange (0, len(commandline_replacements), 2): @@ -598,7 +697,8 @@ for i in range(0, len(commandline_replacements), 2)] replacementsDescription = '(%s)' % ', '.join( [('-' + pair[0] + ' +' + pair[1]) for pair in pairs]) - editSummary = pywikibot.translate(pywikibot.getSite(), msg ) % replacementsDescription + editSummary = pywikibot.translate(pywikibot.getSite(), msg ) \ + % replacementsDescription else: raise pywikibot.Error( 'Specifying -fix with replacements is undefined') @@ -710,10 +810,30 @@ preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=20, lookahead=100) else: - preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=maxquerysize) + preloadingGen = pagegenerators.PreloadingGenerator(gen, + pageNumber=maxquerysize) + + #Finally we open the file for the page titles, or leave titlefile as None + if filename: + try: + #This opens in strict error mode, which means the bot will stop + #on encoding errors with ValueError. + #See http://docs.python.org/library/codecs.html#codecs.open + titlefile = codecs.open(filename, encoding='utf-8', + mode=(lambda x: x and 'a' or 'w')(append)) + except IOError: + pywikibot.output("%s cannot be opened for writing." % + filename) + return bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, - allowoverlap, recursive, add_cat, sleep, editSummary) - bot.run() + allowoverlap, recursive, add_cat, sleep, editSummary, + titlefile) + try: + bot.run() + finally: + if titlefile: + #Just for the spirit of programming (it was flushed) + titlefile.close()
if __name__ == "__main__":
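The mode-selecting lambda used when opening the title file above is compact
but a little cryptic; this sketch spells out the same choice with an assumed
helper name (behaviour mirrors the patch: -save appends, -savenew starts a
fresh file, and strict UTF-8 errors stop the bot rather than corrupt titles):

    # Sketch only: equivalent of mode=(lambda x: x and 'a' or 'w')(append)
    import codecs

    def open_title_file(filename, append):
        if append:
            mode = 'a'  # -save: keep an existing file, add titles at the end
        else:
            mode = 'w'  # -savenew: overwrite any existing file
        # codecs.open defaults to strict error handling, so encoding problems
        # raise an exception instead of silently mangling titles.
        return codecs.open(filename, encoding='utf-8', mode=mode)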