Revision: 5963 Author: jayvdb Date: 2008-10-14 05:22:36 +0000 (Tue, 14 Oct 2008)
Log Message: ----------- Check that the djvu file has a text layer
Modified Paths: -------------- trunk/pywikipedia/djvutext.py
Modified: trunk/pywikipedia/djvutext.py =================================================================== --- trunk/pywikipedia/djvutext.py 2008-10-14 03:07:44 UTC (rev 5962) +++ trunk/pywikipedia/djvutext.py 2008-10-14 05:22:36 UTC (rev 5963) @@ -100,9 +100,37 @@ self.username = config.usernames[site.family.name][site.lang]
for pageno in gen: - print "Processing page %d" % pageno + wikipedia.output("Processing page %d" % pageno) self.treat(pageno)
+ def has_text(self): + cmd = "djvudump \"%s\" > \"%s\".out" % (self.djvu, self.djvu) + os.system ( cmd ) + + f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace') + + s = f.read() + f.close() + + import string + blah = string.find(s, 'TXTz') # text layers are described with this value + + if string.find(s, 'TXTz') >= 0: + return True + else: + return False + + def get_page(self, pageno): + wikipedia.output("fetching page %d" % (pageno)) + cmd = "djvutxt -page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu) + os.system ( cmd ) + + f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace') + + djvu_text = f.read() + f.close() + return djvu_text + def treat(self, pageno): """ Loads the given page, does some changes, and saves it. @@ -112,20 +140,8 @@ page = wikipedia.Page(site, '%s:%s/%d' % (page_namespace, self.prefix, pageno) ) exists = page.exists()
- ################################################################ - # NOTE: Here you can modify the text in whatever way you want. # - ################################################################ + djvutxt = self.get_page(pageno)
- print "fetching page %d" % (pageno) - cmd = "djvutxt -page=%d \"%s\" \"%s.out\"" % (pageno, self.djvu, self.djvu) - os.system ( cmd ) - - f = codecs.open("%s.out" % self.djvu, 'r', config.textfile_encoding, 'replace') - - djvutxt = f.read() - - f.close() - if not djvutxt: djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank) text = '<noinclude>{{PageQuality|1|%s}}<div class="pagetext">%s</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,"\n\n\n",djvutxt) @@ -210,6 +226,9 @@ wikipedia.output("uploading text from %s to %s" % (djvu, index_page) )
bot = DjVuTextBot(djvu, index, pages) + if not bot.has_text(): + raise ValueError("No text layer in djvu file") + bot.ask = ask bot.run() else: