[Pywikipedia-l] SVN: [5963] trunk/pywikipedia/djvutext.py
jayvdb at svn.wikimedia.org
jayvdb at svn.wikimedia.org
Tue Oct 14 05:22:36 UTC 2008
Revision: 5963
Author: jayvdb
Date: 2008-10-14 05:22:36 +0000 (Tue, 14 Oct 2008)
Log Message:
-----------
Check that the djvu file has a text layer
Modified Paths:
--------------
trunk/pywikipedia/djvutext.py
Modified: trunk/pywikipedia/djvutext.py
===================================================================
--- trunk/pywikipedia/djvutext.py 2008-10-14 03:07:44 UTC (rev 5962)
+++ trunk/pywikipedia/djvutext.py 2008-10-14 05:22:36 UTC (rev 5963)
@@ -100,9 +100,37 @@
self.username = config.usernames[site.family.name][site.lang]
for pageno in gen:
- print "Processing page %d" % pageno
+ wikipedia.output("Processing page %d" % pageno)
self.treat(pageno)
+ def has_text(self):
+ cmd = "djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu)
+ os.system ( cmd )
+
+ f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace')
+
+ s = f.read()
+ f.close()
+
+ import string
+ blah = string.find(s, 'TXTz') # text layers are described with this value
+
+ if string.find(s, 'TXTz') >= 0:
+ return True
+ else:
+ return False
+
+ def get_page(self, pageno):
+ wikipedia.output("fetching page %d" % (pageno))
+ cmd = "djvutxt -page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu)
+ os.system ( cmd )
+
+ f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace')
+
+ djvu_text = f.read()
+ f.close()
+ return djvu_text
+
def treat(self, pageno):
"""
Loads the given page, does some changes, and saves it.
@@ -112,20 +140,8 @@
page = wikipedia.Page(site, '%s:%s/%d' % (page_namespace, self.prefix, pageno) )
exists = page.exists()
- ################################################################
- # NOTE: Here you can modify the text in whatever way you want. #
- ################################################################
+ djvutxt = self.get_page(pageno)
- print "fetching page %d" % (pageno)
- cmd = "djvutxt -page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu)
- os.system ( cmd )
-
- f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace')
-
- djvutxt = f.read()
-
- f.close()
-
if not djvutxt:
djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
text = '<noinclude>{{PageQuality|1|%s}}<div class="pagetext">%s</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,"\n\n\n",djvutxt)
@@ -210,6 +226,9 @@
wikipedia.output("uploading text from %s to %s" % (djvu, index_page) )
bot = DjVuTextBot(djvu, index, pages)
+ if not bot.has_text():
+ raise ValueError("No text layer in djvu file")
+
bot.ask = ask
bot.run()
else:
More information about the Pywikipedia-l mailing list