[Pywikipedia-l] SVN: [5692] trunk/pywikipedia
avar at svn.wikimedia.org
avar at svn.wikimedia.org
Tue Jul 8 11:40:25 UTC 2008
Revision: 5692
Author: avar
Date: 2008-07-08 11:40:25 +0000 (Tue, 08 Jul 2008)
Log Message:
-----------
A new bot that allows you to pipe articles through external programs
Modified Paths:
--------------
trunk/pywikipedia/CONTENTS
Added Paths:
-----------
trunk/pywikipedia/piper.py
Modified: trunk/pywikipedia/CONTENTS
===================================================================
--- trunk/pywikipedia/CONTENTS 2008-07-07 19:52:24 UTC (rev 5691)
+++ trunk/pywikipedia/CONTENTS 2008-07-08 11:40:25 UTC (rev 5692)
@@ -111,6 +111,9 @@
nowcommons.py : This bot can be deleted images with NowCommons template.
pagefromfile.py : This bot takes its input from a file that contains a number of
pages to be put on the wiki.
+piper.py : Pipes article text through external program(s) on STDIN and collects
+ its STDOUT which is used as the new article text if it differs from the
+ original.
redirect.py : Fix double redirects and broken redirects. Note:
solve_disambiguation also has functions which treat
redirects.
Added: trunk/pywikipedia/piper.py
===================================================================
--- trunk/pywikipedia/piper.py (rev 0)
+++ trunk/pywikipedia/piper.py 2008-07-08 11:40:25 UTC (rev 5692)
@@ -0,0 +1,215 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This is a bot that uses external filtering programs to munge the
+article text, for example:
+
+ python piper.py -filter:'tr A-Z a-z' Wikipedia:Sandbox
+
+Would lower case the article with tr(1).
+
+Multiple -filter commands can be specified:
+
+ python piper.py -filter:cat -filter:'tr A-Z a-z' -filter:'tr a-z A-Z' Wikipedia:Sandbox
+
+
+Would pipe the article text through cat(1) (NOOP) and then lower case
+it with tr(1) and upper case it again with tr(1)
+
+The following parameters are supported:
+
+&params;
+
+ -debug If given, doesn't do any real changes, but only shows
+ what would have been changed.
+
+ -always Always commit changes without asking you to accept them
+
+ -filter: Filter the article text through this program, can be
+ given multiple times to filter through multiple programs in
+ the order which they are given
+
+In addition all command-line parameters are passed to
+genFactory.handleArg() which means pagegenerators.py arguments are
+supported.
+
+"""
+__version__ = '$Id: basic.py 4946 2008-01-29 14:58:25Z wikipedian $'
+import wikipedia
+import pagegenerators
+
+import os
+import pipes
+import tempfile
+
+# This is required for the text that is shown when you run this script
+# with the parameter -help.
+docuReplacements = {
+ '&params;': pagegenerators.parameterHelp
+}
+
+class PiperBot:
+ # Edit summary message that should be used.
+ # NOTE: Put a good description here, and add translations, if possible!
+ msg = {
+ 'en': u'Robot: Piping the article text through %s',
+ 'is': u'Vélmenni: Pípa texta síðunnar í gegnum %s'
+ }
+
+ def __init__(self, generator, debug, filters, always):
+ """
+ Constructor. Parameters:
+ * generator - The page generator that determines on which pages
+ to work on.
+ * debug - If True, doesn't do any real changes, but only shows
+ what would have been changed.
+ * always - If True, don't prompt for changes
+ """
+ self.generator = generator
+ self.debug = debug
+ self.always = always
+ self.filters = filters
+
+ def run(self):
+ # Set the edit summary message
+ pipes = ', '.join(self.filters)
+ wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg) % pipes)
+ for page in self.generator:
+ self.treat(page)
+
+ def pipe(self, program, text):
+ """
+ Pipes a given text through a given program and returns it
+ """
+
+ text = text.encode('utf-8')
+
+ pipe = pipes.Template()
+ pipe.append(program.encode("ascii"), '--')
+
+ # Create a temporary filename to save the piped stuff to
+ tempFilename = '%s.%s' % (tempfile.mktemp(), 'txt')
+ file = pipe.open(tempFilename, 'w')
+ file.write(text)
+ file.close()
+
+ # Now retrieve the munged text
+ mungedText = open(tempFilename, 'r').read()
+ # clean up
+ os.unlink(tempFilename)
+
+ unicode_text = unicode(mungedText, 'utf-8')
+
+ return unicode_text
+
+ # debug
+ #def savePage(self, name, text):
+ # mungedName = name.replace(":", "_").replace("/", "_").replace(" ", "_")
+ #
+ # saveName = "/tmp/piper/%s" % mungedName
+ # file = open(saveName, 'w')
+ # file.write(text.encode("utf-8"))
+ # file.close()
+ # print "Wrote to %s" % saveName
+
+ def treat(self, page):
+ """
+ Loads the given page, does some changes, and saves it.
+ """
+ try:
+ # Load the page
+ text = page.get()
+ except wikipedia.NoPage:
+ wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
+ return
+ except wikipedia.IsRedirectPage:
+ wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
+ return
+
+ # debug
+ # self.savePage(page.title(), text)
+
+ # Munge!
+ for program in self.filters:
+ text = self.pipe(program, text);
+
+ # only save if something was changed
+ if text != page.get():
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ wikipedia.output(u"\n\n>>> %s <<<" % page.title())
+ # show what was changed
+ wikipedia.showDiff(page.get(), text)
+ if not self.debug:
+ if not self.always:
+ choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
+ else:
+ choice = 'y'
+ if choice == 'y':
+ try:
+ # Save the page
+ page.put(text)
+ except wikipedia.LockedPage:
+ wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
+ except wikipedia.SpamfilterError, error:
+ wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
+
+
+def main():
+ # This factory is responsible for processing command line arguments
+ # that are also used by other scripts and that determine on which pages
+ # to work on.
+ genFactory = pagegenerators.GeneratorFactory()
+ # The generator gives the pages that should be worked upon.
+ gen = None
+ # This temporary array is used to read the page title if one single
+ # page to work on is specified by the arguments.
+ pageTitleParts = []
+ # If debug is True, doesn't do any real changes, but only show
+ # what would have been changed.
+ debug = False
+ # will become True when the user uses the -always flag.
+ always = False
+ # The program to pipe stuff through
+ filters = []
+
+ # Parse command line arguments
+ for arg in wikipedia.handleArgs():
+ if arg.startswith("-debug"):
+ debug = True
+ elif arg.startswith("-filter:"):
+ prog = arg[8:]
+ filters.append(prog)
+ elif arg.startswith("-always"):
+ always = True
+ else:
+ # check if a standard argument like
+ # -start:XYZ or -ref:Asdf was given.
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ else:
+ pageTitleParts.append(arg)
+
+ if pageTitleParts != []:
+ # We will only work on a single page.
+ pageTitle = ' '.join(pageTitleParts)
+ page = wikipedia.Page(wikipedia.getSite(), pageTitle)
+ gen = iter([page])
+
+ if gen:
+ # The preloading generator is responsible for downloading multiple
+ # pages from the wiki simultaneously.
+ gen = pagegenerators.PreloadingGenerator(gen)
+ bot = PiperBot(gen, debug, filters, always)
+ bot.run()
+ else:
+ wikipedia.showHelp()
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ wikipedia.stopme()
More information about the Pywikipedia-l
mailing list