[Pywikipedia-l] SVN: [5365] trunk/pywikipedia/djvutext.py
jayvdb at svn.wikimedia.org
jayvdb at svn.wikimedia.org
Wed May 14 10:56:07 UTC 2008
Revision: 5365
Author: jayvdb
Date: 2008-05-14 10:56:07 +0000 (Wed, 14 May 2008)
Log Message:
-----------
Wikisource DJVU text layer extraction bot
Added Paths:
-----------
trunk/pywikipedia/djvutext.py
Added: trunk/pywikipedia/djvutext.py
===================================================================
--- trunk/pywikipedia/djvutext.py (rev 0)
+++ trunk/pywikipedia/djvutext.py 2008-05-14 10:56:07 UTC (rev 5365)
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This bot uploads text from djvu files onto pages in the "Page"
+namespace. It is intended to be used for Wikisource.
+
+The following parameters are supported:
+
+ -debug If given, doesn't do any real changes, but only shows
+ what would have been changed.
+ -ask Ask for confirmation before creating each page.
+ -index:... Name of the index page
+ -djvu:... Filename of the djvu file
+ -pages:<start>-<end> Page range to upload; <end> is optional
+
+Any other parameter is reported as an unknown argument and ignored.
+Both -index: and -djvu: are required; if either is missing the help text
+is shown instead.
+"""
+__version__ = '$Id: basic.py 5227 2008-04-17 09:37:15Z mfarag $'
+import codecs
+import os
+import subprocess
+
+import config
+import wikipedia
+
+# This is required for the text that is shown when you run this script
+# with the parameter -help.
+# This script defines no replacements, so the mapping is left empty.
+# NOTE(review): presumably consumed by wikipedia.showHelp() -- confirm.
+docuReplacements = {
+}
+
+class DjVuTextBot:
+ # Edit summary message that should be used.
+ # NOTE: Put a good description here, and add translations, if possible!
+ msg = {
+ 'en': u'Robot: creating page with text extracted from DjVu',
+ 'fr': u'Bot: creating page with texte extracted from DjVu',
+ }
+ # On English Wikisource, {{blank page}} is used to track blank pages.
+ # It may be omitted by adding an empty string like has been done for 'fr'.
+ blank = {
+ 'en': u'{{blank page}}',
+ 'fr': u'',
+ }
+
+ def __init__(self, djvu, index, pages):
+ """
+ Constructor. Parameters:
+ djvu : filename
+ index : page name
+ pages : page range
+ """
+ self.djvu = djvu
+ self.index = index
+ self.pages = pages
+ self.debug = False
+ self.ask = False
+
+ def NoOfImages(self):
+ cmd = "djvused -e 'n' %s" % (self.djvu)
+ count = os.popen( cmd ).readline().rstrip()
+ #count = count[:-1]
+ print "page count = %s" % count
+ count = int(count)
+ print "page count = %d" % count
+ return int(count)
+
+ def PagesGenerator(self):
+ start = 1
+ end = self.NoOfImages()
+
+ if self.pages:
+ if self.pages.index('-'):
+ pos = self.pages.index('-')
+ start = self.pages[:pos]
+ if pos < len(self.pages)-1:
+ end = self.pages[pos+1:]
+ end = int(end)
+ else:
+ start = self.pages
+ i = int(start)
+ print "processing pages %d-%d" % (i, end)
+ while i <= end:
+ yield i
+ i=i+1
+
+ def run(self):
+ # Set the edit summary message
+ wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
+
+ linkingPage = wikipedia.Page(wikipedia.getSite(), self.index)
+ self.prefix = linkingPage.titleWithoutNamespace()
+ gen = self.PagesGenerator()
+
+ site = wikipedia.getSite()
+ self.username = config.usernames[site.family.name][site.lang]
+
+ for pageno in gen:
+ print "Processing page %d" % pageno
+ self.treat(pageno)
+
+ def treat(self, pageno):
+ """
+ Loads the given page, does some changes, and saves it.
+ """
+ page = wikipedia.Page(wikipedia.getSite(), 'Page:%s/%d' % (self.prefix, pageno) )
+ exists = page.exists()
+
+ ################################################################
+ # NOTE: Here you can modify the text in whatever way you want. #
+ ################################################################
+
+ print "fetching page %d" % (pageno)
+ cmd = "djvutxt -page=%d %s djvutxt.out" % (pageno, self.djvu)
+ os.system ( cmd )
+
+ f = codecs.open("djvutxt.out", 'r', config.textfile_encoding)
+
+ djvutxt = f.read()
+
+ f.close()
+
+ if not djvutxt:
+ djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
+ text = '<noinclude>{{PageQuality|1|%s}}<div class="pagetext">%s</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,"\n\n\n",djvutxt)
+
+ # only save if something was changed
+ if (not exists and text) or text != page.get():
+ # Show the title of the page we're working on.
+ # Highlight the title in purple.
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
+ # show what was changed
+ if exists:
+ wikipedia.showDiff(page.get(), text)
+ else:
+ wikipedia.output(text)
+ if not self.debug:
+ choice = 'y'
+ if self.ask:
+ choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
+ if choice == 'y':
+ try:
+ # Save the page
+ page.put_async(text)
+ except wikipedia.LockedPage:
+ wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
+ except wikipedia.SpamfilterError, error:
+ wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
+
+
+def main():
+ index = None
+ djvu = None
+ pages = None
+ # what would have been changed.
+ debug = False
+ ask = False
+
+ # Parse command line arguments
+ for arg in wikipedia.handleArgs():
+ if arg.startswith("-debug"):
+ debug = True
+ elif arg.startswith("-ask"):
+ ask = True
+ elif arg.startswith("-djvu:"):
+ djvu = arg[6:]
+ elif arg.startswith("-index:"):
+ index = arg[7:]
+ elif arg.startswith("-pages:"):
+ pages = arg[7:]
+ else:
+ print "Unknown argument %s" % arg
+
+ if djvu and index:
+ index_page = wikipedia.Page(wikipedia.getSite(), index)
+ if not index_page.exists():
+ wikipedia.output("%s does not exist" % index)
+ raise Exception
+
+ wikipedia.output("uploading text from %s to %s" % (djvu, index) )
+
+ bot = DjVuTextBot(djvu, index, pages)
+ bot.ask = ask
+ bot.run()
+ else:
+ wikipedia.showHelp()
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ wikipedia.stopme()
More information about the Pywikipedia-l
mailing list