[Pywikipedia-l] SVN: [5963] trunk/pywikipedia/djvutext.py

jayvdb at svn.wikimedia.org jayvdb at svn.wikimedia.org
Tue Oct 14 05:22:36 UTC 2008


Revision: 5963
Author:   jayvdb
Date:     2008-10-14 05:22:36 +0000 (Tue, 14 Oct 2008)

Log Message:
-----------
Check that the djvu file has a text layer

Modified Paths:
--------------
    trunk/pywikipedia/djvutext.py

Modified: trunk/pywikipedia/djvutext.py
===================================================================
--- trunk/pywikipedia/djvutext.py	2008-10-14 03:07:44 UTC (rev 5962)
+++ trunk/pywikipedia/djvutext.py	2008-10-14 05:22:36 UTC (rev 5963)
@@ -100,9 +100,37 @@
         self.username = config.usernames[site.family.name][site.lang]
 
         for pageno in gen:
-	    print "Processing page %d" % pageno
+	    wikipedia.output("Processing page %d" % pageno)
             self.treat(pageno)
 
+    def has_text(self):
+        cmd = "djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu)
+        os.system ( cmd )
+
+        f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace')
+
+        s = f.read()
+        f.close()
+
+        import string
+        blah = string.find(s, 'TXTz') # text layers are described with this value
+
+        if string.find(s, 'TXTz') >= 0:
+            return True
+        else:
+            return False
+       
+    def get_page(self, pageno):
+        wikipedia.output("fetching page %d" % (pageno))
+        cmd = "djvutxt -page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu)
+        os.system ( cmd )
+
+        f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace')
+
+        djvu_text = f.read()
+        f.close()
+        return djvu_text
+
     def treat(self, pageno):
         """
         Loads the given page, does some changes, and saves it.
@@ -112,20 +140,8 @@
 	page = wikipedia.Page(site, '%s:%s/%d' % (page_namespace, self.prefix, pageno) )
 	exists = page.exists()
 
-        ################################################################
-        # NOTE: Here you can modify the text in whatever way you want. #
-        ################################################################
+        djvutxt = self.get_page(pageno)
 
-        print "fetching page %d" % (pageno)
-	cmd = "djvutxt -page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu)
-        os.system ( cmd )
-
-        f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace')
-
-	djvutxt = f.read()
-
-	f.close()
-
         if not djvutxt:
 	    djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
 	text = '<noinclude>{{PageQuality|1|%s}}<div class="pagetext">%s</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,"\n\n\n",djvutxt)
@@ -210,6 +226,9 @@
         wikipedia.output("uploading text from %s to %s" % (djvu, index_page) )
 
         bot = DjVuTextBot(djvu, index, pages)
+	if not bot.has_text():
+            raise ValueError("No text layer in djvu file")
+
         bot.ask = ask
         bot.run()
     else:





More information about the Pywikipedia-l mailing list