[Pywikipedia-l] SVN: [6095] trunk/pywikipedia/djvutext.py
nicdumz at svn.wikimedia.org
nicdumz at svn.wikimedia.org
Fri Nov 14 06:16:35 UTC 2008
Revision: 6095
Author: nicdumz
Date: 2008-11-14 06:16:35 +0000 (Fri, 14 Nov 2008)
Log Message:
-----------
What about some big cleanups?
Make it Unicode-safe, remove tabs, fix indentation, ensure that debug mode can be activated as promised, and apply various style changes.
Modified Paths:
--------------
trunk/pywikipedia/djvutext.py
Modified: trunk/pywikipedia/djvutext.py
===================================================================
--- trunk/pywikipedia/djvutext.py 2008-11-13 12:01:50 UTC (rev 6094)
+++ trunk/pywikipedia/djvutext.py 2008-11-14 06:16:35 UTC (rev 6095)
@@ -33,59 +33,53 @@
# NOTE: Put a good description here, and add translations, if possible!
msg = {
'en': u'Robot: creating page with text extracted from DjVu',
- 'ar': u'روبوت: إنشاء صفحة بنص مستخرج من DjVu',
- 'fr': u'Bot: creating page with texte extracted from DjVu',
+ 'ar': u'روبوت: إنشاء صفحة بنص مستخرج من DjVu',
+ 'fr': u'Bot: creating page with texte extracted from DjVu',
'pt': u'Bot: criando página com texto extraído do DjVu',
}
# On English Wikisource, {{blank page}} is used to track blank pages.
# It may be omitted by adding an empty string like has been done for 'fr'.
blank = {
'en': u'{{blank page}}',
- 'fr': u'',
+ 'fr': u'',
'pt': u'',
}
- def __init__(self, djvu, index, pages):
+ def __init__(self, djvu, index, pages, ask=False, debug=False):
"""
Constructor. Parameters:
- djvu : filename
- index : page name
- pages : page range
+ djvu : filename
+ index : page name
+ pages : page range
"""
self.djvu = djvu
self.index = index
- self.pages = pages
- self.debug = False
- self.ask = False
+ self.pages = pages
+ self.debug = debug
+ self.ask = ask
def NoOfImages(self):
- cmd = "djvused -e 'n' \"%s\"" % (self.djvu)
+ cmd = "djvused -e 'n' \"%s\"" % (self.djvu)
count = os.popen( cmd ).readline().rstrip()
- #count = count[:-1]
- print "page count = %s" % count
- count = int(count)
- print "page count = %d" % count
- return int(count)
+ count = int(count)
+ wikipedia.output("page count = %d" % count)
+ return count
def PagesGenerator(self):
start = 1
- end = self.NoOfImages()
+ end = self.NoOfImages()
- if self.pages:
- pos = self.pages.find('-')
- if pos != -1:
- start = self.pages[:pos]
- if pos < len(self.pages)-1:
- end = self.pages[pos+1:]
- end = int(end)
- else:
- start = self.pages
- end = int(start)
- i = int(start)
- print "processing pages %d-%d" % (i, end)
- while i <= end:
- yield i
- i=i+1
+ if self.pages:
+ pos = self.pages.find('-')
+ if pos != -1:
+ start = int(self.pages[:pos])
+ if pos < len(self.pages)-1:
+ end = int(self.pages[pos+1:])
+ else:
+ start = int(self.pages)
+ end = start
+ wikipedia.output(u"Processing pages %d-%d" % (start, end))
+ return range(start, end+1)
def run(self):
# Set the edit summary message
@@ -93,16 +87,16 @@
linkingPage = wikipedia.Page(wikipedia.getSite(), self.index)
self.prefix = linkingPage.titleWithoutNamespace()
- if self.prefix[0:6] == 'Liber:':
- self.prefix = self.prefix[6:]
- wikipedia.output("Using prefix %s" % self.prefix)
- gen = self.PagesGenerator()
-
- site = wikipedia.getSite()
+ if self.prefix[0:6] == 'Liber:':
+ self.prefix = self.prefix[6:]
+ wikipedia.output(u"Using prefix %s" % self.prefix)
+ gen = self.PagesGenerator()
+
+ site = wikipedia.getSite()
self.username = config.usernames[site.family.name][site.lang]
for pageno in gen:
- wikipedia.output("Processing page %d" % pageno)
+ wikipedia.output("Processing page %d" % pageno)
self.treat(pageno)
def has_text(self):
@@ -113,14 +107,7 @@
s = f.read()
f.close()
-
- import string
- blah = string.find(s, 'TXTz') # text layers are described with this value
-
- if string.find(s, 'TXTz') >= 0:
- return True
- else:
- return False
+ return s.find('TXTz') >= 0
def get_page(self, pageno):
wikipedia.output(unicode("fetching page %d" % (pageno)))
@@ -137,54 +124,57 @@
"""
Loads the given page, does some changes, and saves it.
"""
- site = wikipedia.getSite()
- page_namespace = site.family.namespaces[104][site.lang]
- page = wikipedia.Page(site, '%s:%s/%d' % (page_namespace, self.prefix, pageno) )
- exists = page.exists()
+ site = wikipedia.getSite()
+ page_namespace = site.family.namespaces[104][site.lang]
+ page = wikipedia.Page(site, u'%s:%s/%d' % (page_namespace, self.prefix, pageno) )
+ exists = page.exists()
djvutxt = self.get_page(pageno)
if not djvutxt:
- djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
- text = '<noinclude>{{PageQuality|1|%s}}<div class="pagetext">%s</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,"\n\n\n",djvutxt)
+ djvutxt = wikipedia.translate(wikipedia.getSite(), self.blank)
+ text = u'<noinclude>{{PageQuality|1|%s}}<div class="pagetext">\n\n\n</noinclude>%s<noinclude><references/></div></noinclude>' % (self.username,djvutxt)
# convert to wikisyntax
# this adds a second line feed, which makes a new paragraph
- text = text.replace('', "\n")
+ text = text.replace('', "\n")
# only save if something was changed
# automatically ask if overwriting an existing page
- old_text = ''
+
ask = self.ask
if exists:
- ask = 'y'
+ ask = True
old_text = page.get()
+ if old_text == text:
+ wikipedia.output(u"No changes were needed on %s" % page.aslink())
+ return
+ else:
+ old_text = ''
- if not exists or text != old_text:
- # Show the title of the page we're working on.
- # Highlight the title in purple.
- wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
- # show what was changed
- if exists:
- wikipedia.showDiff(old_text, text)
- else:
- wikipedia.output(text)
- if not self.debug:
- choice = 'y'
- if ask:
- choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
- if choice == 'y':
- try:
- # Save the page
- page.put_async(text)
- except wikipedia.LockedPage:
- wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
- except wikipedia.EditConflict:
- wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
- except wikipedia.SpamfilterError, error:
- wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
+ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
+ wikipedia.showDiff(old_text, text)
+ if self.debug:
+ wikipedia.inputChoice(u'Debug mode... Press enter to continue', [], [], 'dummy')
+ return
+ if ask:
+ choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
+ else:
+ choice = 'y'
+ if choice == 'y':
+ try:
+ # Save the page
+ page.put_async(text)
+ except wikipedia.LockedPage:
+ wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
+ except wikipedia.EditConflict:
+ wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
+ except wikipedia.SpamfilterError, error:
+ wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
+
+
def main():
import os
index = None
@@ -199,15 +189,15 @@
if arg.startswith("-debug"):
debug = True
elif arg.startswith("-ask"):
- ask = True
+ ask = True
elif arg.startswith("-djvu:"):
- djvu = arg[6:]
- elif arg.startswith("-index:"):
- index = arg[7:]
+ djvu = arg[6:]
+ elif arg.startswith("-index:"):
+ index = arg[7:]
elif arg.startswith("-pages:"):
- pages = arg[7:]
+ pages = arg[7:]
else:
- print "Unknown argument %s" % arg
+ wikipedia.output(u"Unknown argument %s" % arg)
# Check the djvu file exists
if djvu:
@@ -222,7 +212,7 @@
index_page = wikipedia.Page(site, index)
if site.family.name != 'wikisource':
- raise wikipedia.PageNotFound(u"Found family '%s'; Wikisource required." % site.family.name)
+ raise wikipedia.PageNotFound(u"Found family '%s'; Wikisource required." % site.family.name)
if not index_page.exists() and index_page.namespace() == 0:
index_namespace = wikipedia.Page(site, 'MediaWiki:Proofreadpage index namespace').get()
@@ -231,15 +221,14 @@
u"%s:%s" % (index_namespace, index))
if not index_page.exists():
- raise wikipedia.NoPage("Page '%s' does not exist" % index)
+ raise wikipedia.NoPage(u"Page '%s' does not exist" % index)
- wikipedia.output(u"uploading text from %s to %s" % (djvu, index_page) )
+ wikipedia.output(u"uploading text from %s to %s" % (djvu, index_page.aslink()) )
- bot = DjVuTextBot(djvu, index, pages)
- if not bot.has_text():
+ bot = DjVuTextBot(djvu, index, pages, ask, debug)
+ if not bot.has_text():
raise ValueError("No text layer in djvu file")
- bot.ask = ask
bot.run()
else:
wikipedia.showHelp()
More information about the Pywikipedia-l mailing list