[Pywikipedia-l] SVN: [5365] trunk/pywikipedia/djvutext.py

Wed May 14 10:56:07 UTC 2008

Revision: 5365
Author:   jayvdb
Date:     2008-05-14 10:56:07 +0000 (Wed, 14 May 2008)

Log Message:
-----------
Wikisource DJVU text layer extraction bot

Added Paths:
-----------
    trunk/pywikipedia/djvutext.py

Added: trunk/pywikipedia/djvutext.py
===================================================================

--- trunk/pywikipedia/djvutext.py	                        (rev 0)
+++ trunk/pywikipedia/djvutext.py	2008-05-14 10:56:07 UTC (rev 5365)
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+"""
+This bot uploads text from djvu files onto pages in the "Page"
+namespace.  It is intended to be used for Wikisource.
+
+The following parameters are supported:
+
+    -debug         If given, doesn't do any real changes, but only shows
+                   what would have been changed.
+    -ask           Ask for confirmation before creating each page.
+    -index:...     Name of the index page
+    -djvu:...      Filename of the djvu file
+    -pages:<start>-<end> Page range to upload; <end> is optional 
+
+All other parameters are reported as unknown arguments and otherwise ignored.
+Both -index and -djvu must be given; if either is missing, this help is shown.
+"""
+__version__ = '$Id: basic.py 5227 2008-04-17 09:37:15Z mfarag $'
+import wikipedia
+import os
+import config, codecs
+
+# This is required for the text that is shown when you run this script
+# with the parameter -help.
+docuReplacements = {
+}
+
+class DjVuTextBot:
+    # Edit summary used when saving a page, keyed by language code.
+    # NOTE: The 'fr' text is still largely English; improve translations if possible.
+    msg = {
+        'en': u'Robot: creating page with text extracted from DjVu',
+	'fr': u'Bot: creating page with texte extracted from DjVu',
+    }
+    # On English Wikisource, {{blank page}} is used to track blank pages.
+    # It may be omitted by using an empty string, as has been done for 'fr'.
+    blank = {
+        'en': u'{{blank page}}',
+	'fr': u'',
+    }
+
+    def __init__(self, djvu, index, pages):
+        """
+        Constructor. Parameters:
+	   djvu : filename of the DjVu file
+	   index : name of the index page
+	   pages : page range '<start>-<end>' (<end> is optional), or None
+        """
+        self.djvu = djvu
+        self.index = index
+	self.pages = pages
+	self.debug = False
+	self.ask = False
+
+    def NoOfImages(self):
+	cmd = "djvused -e 'n' %s" % (self.djvu)
+        count = os.popen( cmd ).readline().rstrip()
+	#count = count[:-1]
+	print "page count = %s" % count
+	count = int(count)
+	print "page count = %d" % count
+	return int(count)
+
+    def PagesGenerator(self):
+        start = 1
+	end = self.NoOfImages()
+
+	if self.pages:
+	    if self.pages.index('-'):
+	        pos = self.pages.index('-')
+	        start = self.pages[:pos]
+		if pos < len(self.pages)-1:
+		    end = self.pages[pos+1:]
+		    end = int(end)
+	    else:
+	        start = self.pages
+	i = int(start)
+	print "processing pages %d-%d" % (i, end)
+	while i <= end:
+	   yield i
+	   i=i+1
+
+    def run(self):
+        # Set the edit summary message
+        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
+
+        linkingPage = wikipedia.Page(wikipedia.getSite(), self.index)
+        self.prefix = linkingPage.titleWithoutNamespace()
+        gen = self.PagesGenerator()
+
+	site = wikipedia.getSite()
+        self.username = config.usernames[site.family.name][site.lang]
+
+        for pageno in gen:
+	    print "Processing page %d" % pageno
+            self.treat(pageno)
+
+    def treat(self, pageno):
+        """
+        Extracts the text of the given DjVu page and saves it as the matching 'Page:' wiki page.
+        """
+	page = wikipedia.Page(wikipedia.getSite(), 'Page:%s/%d' % (self.prefix, pageno) )
+	exists = page.exists()
+
+        ################################################################
+        # Extract the text layer of this page with the djvutxt tool.   #
+        ################################################################
+
+        print "fetching page %d" % (pageno)
+	cmd = "djvutxt -page=%d %s djvutxt.out" % (pageno, self.djvu)
+        os.system ( cmd )
+
+        f = codecs.open("djvutxt.out", 'r', config.textfile_encoding)
+
+	djvutxt = f.read()
+
+	f.close()
+
+        if not djvutxt:
+	    djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
+	text = '<noinclude>{{PageQuality|1|%s}}<div class="pagetext">%s</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,"\n\n\n",djvutxt)
+
+        # only save if something was changed
+        if (not exists and text) or text != page.get():
+            # Show the title of the page we're working on.
+            # Highlight the title in purple.
+            wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
+            # show what was changed
+	    if exists:
+                wikipedia.showDiff(page.get(), text)
+            else:
+	        wikipedia.output(text)
+            if not self.debug:
+	        choice = 'y'
+		if self.ask:
+                    choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
+                if choice == 'y':
+                    try:
+                        # Save the page
+                        page.put_async(text)
+                    except wikipedia.LockedPage:
+                        wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
+                    except wikipedia.EditConflict:
+                        wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
+                    except wikipedia.SpamfilterError, error:
+                        wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
+
+
+def main():
+    index = None
+    djvu = None
+    pages = None
+    # Set by -debug. NOTE(review): never copied to bot.debug, so it has no effect.
+    debug = False
+    ask = False
+
+    # Parse command line arguments
+    for arg in wikipedia.handleArgs():
+        if arg.startswith("-debug"):
+            debug = True
+        elif arg.startswith("-ask"):
+	    ask = True
+        elif arg.startswith("-djvu:"):
+	    djvu = arg[6:]
+	elif arg.startswith("-index:"):
+	    index = arg[7:]
+        elif arg.startswith("-pages:"):
+	    pages = arg[7:]
+        else:
+            print "Unknown argument %s" % arg
+
+    if djvu and index:
+	index_page = wikipedia.Page(wikipedia.getSite(), index)
+	if not index_page.exists():
+	     wikipedia.output("%s does not exist" % index)
+	     raise Exception
+
+        wikipedia.output("uploading text from %s to %s" % (djvu, index) )
+
+        bot = DjVuTextBot(djvu, index, pages)
+	bot.ask = ask
+        bot.run()
+    else:
+        wikipedia.showHelp()
+
+if __name__ == "__main__":
+    try:
+        main()
+    finally:
+        wikipedia.stopme()