Revision: 5692 Author: avar Date: 2008-07-08 11:40:25 +0000 (Tue, 08 Jul 2008)
Log Message: ----------- A new bot that allows you to pipe articles through external programs
Modified Paths: -------------- trunk/pywikipedia/CONTENTS
Added Paths: ----------- trunk/pywikipedia/piper.py
Modified: trunk/pywikipedia/CONTENTS =================================================================== --- trunk/pywikipedia/CONTENTS 2008-07-07 19:52:24 UTC (rev 5691) +++ trunk/pywikipedia/CONTENTS 2008-07-08 11:40:25 UTC (rev 5692) @@ -111,6 +111,9 @@ nowcommons.py : This bot can be deleted images with NowCommons template. pagefromfile.py : This bot takes its input from a file that contains a number of pages to be put on the wiki. +piper.py : Pipes article text through external program(s) on STDIN and collects + its STDOUT which is used as the new article text if it differs from the + original. redirect.py : Fix double redirects and broken redirects. Note: solve_disambiguation also has functions which treat redirects.
Added: trunk/pywikipedia/piper.py =================================================================== --- trunk/pywikipedia/piper.py (rev 0) +++ trunk/pywikipedia/piper.py 2008-07-08 11:40:25 UTC (rev 5692) @@ -0,0 +1,215 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +This is a bot that uses external filtering programs to munge the +article text, for example: + + python piper.py -filter:'tr A-Z a-z' Wikipedia:Sandbox + +Would lower case the article with tr(1). + +Multiple -filter commands can be specified: + + python piper.py -filter:cat -filter:'tr A-Z a-z' -filter:'tr a-z A-Z' Wikipedia:Sandbox + + +Would pipe the article text through cat(1) (NOOP) and then lower case +it with tr(1) and upper case it again with tr(1) + +The following parameters are supported: + +&params; + + -debug If given, doesn't do any real changes, but only shows + what would have been changed. + + -always Always commit changes without asking you to accept them + + -filter: Filter the article text through this program, can be + given multiple times to filter through multiple programs in + the order which they are given + +In addition all command-line parameters are passed to +genFactory.handleArg() which means pagegenerators.py arguments are +supported. + +""" +__version__ = '$Id: basic.py 4946 2008-01-29 14:58:25Z wikipedian $' +import wikipedia +import pagegenerators + +import os +import pipes +import tempfile + +# This is required for the text that is shown when you run this script +# with the parameter -help. +docuReplacements = { + '&params;': pagegenerators.parameterHelp +} + +class PiperBot: + # Edit summary message that should be used. + # NOTE: Put a good description here, and add translations, if possible! + msg = { + 'en': u'Robot: Piping the article text through %s', + 'is': u'Vélmenni: Pípa texta síðunnar í gegnum %s' + } + + def __init__(self, generator, debug, filters, always): + """ + Constructor. 
Parameters: + * generator - The page generator that determines on which pages + to work on. + * debug - If True, doesn't do any real changes, but only shows + what would have been changed. + * always - If True, don't prompt for changes + """ + self.generator = generator + self.debug = debug + self.always = always + self.filters = filters + + def run(self): + # Set the edit summary message + pipes = ', '.join(self.filters) + wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg) % pipes) + for page in self.generator: + self.treat(page) + + def pipe(self, program, text): + """ + Pipes a given text through a given program and returns it + """ + + text = text.encode('utf-8') + + pipe = pipes.Template() + pipe.append(program.encode("ascii"), '--') + + # Create a temporary filename to save the piped stuff to + tempFilename = '%s.%s' % (tempfile.mktemp(), 'txt') + file = pipe.open(tempFilename, 'w') + file.write(text) + file.close() + + # Now retrieve the munged text + mungedText = open(tempFilename, 'r').read() + # clean up + os.unlink(tempFilename) + + unicode_text = unicode(mungedText, 'utf-8') + + return unicode_text + + # debug + #def savePage(self, name, text): + # mungedName = name.replace(":", "_").replace("/", "_").replace(" ", "_") + # + # saveName = "/tmp/piper/%s" % mungedName + # file = open(saveName, 'w') + # file.write(text.encode("utf-8")) + # file.close() + # print "Wrote to %s" % saveName + + def treat(self, page): + """ + Loads the given page, does some changes, and saves it. + """ + try: + # Load the page + text = page.get() + except wikipedia.NoPage: + wikipedia.output(u"Page %s does not exist; skipping." % page.aslink()) + return + except wikipedia.IsRedirectPage: + wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink()) + return + + # debug + # self.savePage(page.title(), text) + + # Munge! 
+ for program in self.filters: + text = self.pipe(program, text); + + # only save if something was changed + if text != page.get(): + # Show the title of the page we're working on. + # Highlight the title in purple. + wikipedia.output(u"\n\n>>> %s <<<" % page.title()) + # show what was changed + wikipedia.showDiff(page.get(), text) + if not self.debug: + if not self.always: + choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N') + else: + choice = 'y' + if choice == 'y': + try: + # Save the page + page.put(text) + except wikipedia.LockedPage: + wikipedia.output(u"Page %s is locked; skipping." % page.aslink()) + except wikipedia.EditConflict: + wikipedia.output(u'Skipping %s because of edit conflict' % (page.title())) + except wikipedia.SpamfilterError, error: + wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url)) + + +def main(): + # This factory is responsible for processing command line arguments + # that are also used by other scripts and that determine on which pages + # to work on. + genFactory = pagegenerators.GeneratorFactory() + # The generator gives the pages that should be worked upon. + gen = None + # This temporary array is used to read the page title if one single + # page to work on is specified by the arguments. + pageTitleParts = [] + # If debug is True, doesn't do any real changes, but only show + # what would have been changed. + debug = False + # will become True when the user uses the -always flag. + always = False + # The program to pipe stuff through + filters = [] + + # Parse command line arguments + for arg in wikipedia.handleArgs(): + if arg.startswith("-debug"): + debug = True + elif arg.startswith("-filter:"): + prog = arg[8:] + filters.append(prog) + elif arg.startswith("-always"): + always = True + else: + # check if a standard argument like + # -start:XYZ or -ref:Asdf was given. 
+ generator = genFactory.handleArg(arg) + if generator: + gen = generator + else: + pageTitleParts.append(arg) + + if pageTitleParts != []: + # We will only work on a single page. + pageTitle = ' '.join(pageTitleParts) + page = wikipedia.Page(wikipedia.getSite(), pageTitle) + gen = iter([page]) + + if gen: + # The preloading generator is responsible for downloading multiple + # pages from the wiki simultaneously. + gen = pagegenerators.PreloadingGenerator(gen) + bot = PiperBot(gen, debug, filters, always) + bot.run() + else: + wikipedia.showHelp() + +if __name__ == "__main__": + try: + main() + finally: + wikipedia.stopme()
pywikipedia-l@lists.wikimedia.org