Revision: 8765 Author: purodha Date: 2010-12-07 15:18:55 +0000 (Tue, 07 Dec 2010) Log Message: ----------- Add -hintsonly option to interwiki.py making the 1st existing hinted page the start page if none supplied.
Modified Paths: -------------- trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2010-12-07 12:50:59 UTC (rev 8764) +++ trunk/pywikipedia/interwiki.py 2010-12-07 15:18:55 UTC (rev 8765) @@ -110,6 +110,18 @@ could be used for further explainings of the bot action. This will only be used in non-autonomous mode.
+ -hintsonly The bot does not ask for a page to work on, even if none of + the above page sources was specified. This will make the + first existing page of -hint or -hintfile slip in as the start + page, determining properties like namespace, disambiguation + state, and so on. When no existing page is found in the + hints, the bot does nothing. + Hitting return without input on the "Which page to check:" + prompt has the same effect as using -hintsonly. + Options like -back, -same or -wiktionary are in effect only + after a page has been found to work on. + (note: without ending colon) + These arguments are useful to provide hints to the bot:
-hint: used as -hint:de:Anweisung to give the robot a hint @@ -952,19 +964,21 @@ this Object. """
- def __init__(self, originPage, hints = None): + def __init__(self, originPage = None, hints = None): """Constructor. Takes as arguments the Page on the home wiki plus optionally a list of hints for translation"""
if globalvar.contentsondisk: - originPage = StoredPage(originPage) + if originPage: + originPage = StoredPage(originPage)
# Remember the "origin page" self.originPage = originPage # todo is a list of all pages that still need to be analyzed. # Mark the origin page as todo. self.todo = PageTree() - self.todo.add(originPage) + if originPage: + self.todo.add(originPage)
# done is a list of all pages that have been analyzed and that # are known to belong to this subject. @@ -973,7 +987,10 @@ # pages are values. It stores where we found each page. # As we haven't yet found a page that links to the origin page, we # start with an empty list for it. - self.foundIn = {self.originPage:[]} + if originPage: + self.foundIn = {self.originPage:[]} + else: + self.foundIn = {} # This is a list of all pages that are currently scheduled for # download. self.pending = PageTree() @@ -1024,23 +1041,25 @@ """ for tree in [self.done, self.pending, self.todo]: for page in tree.filter(site): - if page.namespace() == self.originPage.namespace(): + # -hintsonly: before we have an origin page, any namespace will do. + if self.originPage and page.namespace() == self.originPage.namespace(): if page.exists() and not page.isRedirectPage() and not page.isCategoryRedirect(): return page return None
def translate(self, hints = None, keephintedsites = False): """Add the given translation hints to the todo list""" - if globalvar.same: + if globalvar.same and self.originPage: if hints: - pages = titletranslate.translate(self.originPage, hints = hints + ['all:'], auto = globalvar.auto, removebrackets -= globalvar.hintnobracket) + pages = titletranslate.translate(self.originPage, hints = hints + ['all:'], + auto = globalvar.auto, removebrackets = globalvar.hintnobracket) else: - pages = titletranslate.translate(self.originPage, hints = ['all:'], auto = globalvar.auto, removebrackets -= globalvar.hintnobracket) + pages = titletranslate.translate(self.originPage, hints = ['all:'], + auto = globalvar.auto, removebrackets = globalvar.hintnobracket) else: - pages = titletranslate.translate(self.originPage, hints = hints, auto = globalvar.auto, removebrackets -= globalvar.hintnobracket) + pages = titletranslate.translate(self.originPage, hints = hints, + auto = globalvar.auto, removebrackets = globalvar.hintnobracket, + site = pywikibot.getSite() ) for page in pages: if globalvar.contentsondisk: page = StoredPage(page) @@ -1100,7 +1119,7 @@ """ if self.forcedStop: return False - if globalvar.nobackonly: + if globalvar.nobackonly and originPage: # cannot check backlink before we have an origin page if page == self.originPage: try: pywikibot.output(u"%s has a backlink from %s." @@ -1138,7 +1157,7 @@ if linkedPage in self.foundIn: # We have seen this page before, don't ask again. 
return False - elif self.originPage.namespace() != linkedPage.namespace(): + elif self.originPage and self.originPage.namespace() != linkedPage.namespace(): # Allow for a mapping between different namespaces crossFrom = self.originPage.site().family.crossnamespace.get(self.originPage.namespace(), {}) crossTo = crossFrom.get(self.originPage.site().language(), crossFrom.get('_default', {})) @@ -1181,10 +1200,11 @@ return True else: # same namespaces, no problem + # or no origin page yet, also no problem return False
def wiktionaryMismatch(self, page): - if globalvar.same=='wiktionary': + if self.originPage and globalvar.same=='wiktionary': if page.title().lower() != self.originPage.title().lower(): pywikibot.output(u"NOTE: Ignoring %s for %s in wiktionary mode" % (page.title(asLink=True), self.originPage.title(asLink=True))) return True @@ -1207,6 +1227,8 @@ alternativePage is either None, or a page that the user has chosen to use instead of the given page. """ + if not self.originPage: + return (False, None) # any page matches until we have an origin page if globalvar.autonomous: if self.originPage.isDisambig() and not page.isDisambig(): pywikibot.output(u"NOTE: Ignoring link from disambiguation page %s to non-disambiguation %s" @@ -1296,8 +1318,8 @@ elif not newhint: break else: - pages = titletranslate.translate(self.originPage, hints = [newhint], auto = globalvar.auto, removebrackets -= globalvar.hintnobracket) + pages = titletranslate.translate(self.originPage, hints = [newhint], + auto = globalvar.auto, removebrackets = globalvar.hintnobracket) for page in pages: self.addIfNew(page, counter, None) if globalvar.hintsareright: @@ -1323,9 +1345,10 @@ if globalvar.skipauto: dictName, year = page.autoFormat() if dictName is not None: - pywikibot.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)' - % (self.originPage.site().language(), self.originPage.title(), - page.site().language(),page.title(),dictName,year)) + if self.originPage: + pywikibot.output(u'WARNING: %s:%s relates to %s:%s, which is an auto entry %s(%s)' + % (self.originPage.site().language(), self.originPage.title(), + page.site().language(),page.title(),dictName,year))
# Abort processing if the bot is running in autonomous mode. if globalvar.autonomous: @@ -1371,7 +1394,8 @@ if not globalvar.quiet or pywikibot.verbose: pywikibot.output(u"NOTE: %s is %sredirect to %s" % (page.aslink(True), redir, redirectTargetPage.aslink(True))) - if page == self.originPage: + if self.originPage is None or page == self.originPage: + # the 1st existig page becomes the origin page, if none was supplied if globalvar.initialredirect: if globalvar.contentsondisk: redirectTargetPage = StoredPage(redirectTargetPage) @@ -1396,8 +1420,8 @@ if self.addIfNew(redirectTargetPage, counter, page): if config.interwiki_shownew or pywikibot.verbose: pywikibot.output(u"%s: %s gives new %sredirect %s" - % (self.originPage.title(asLink=True), page.aslink(True), - redir, redirectTargetPage.aslink(True))) + % (self.originPage.title(asLink=True), page.aslink(True), + redir, redirectTargetPage.aslink(True))) continue
# must be behind the page.isRedirectPage() part @@ -1410,7 +1434,8 @@ for site, count in self.todo.siteCounts(): counter.minus(site, count) self.todo = PageTree() - self.done = PageTree() + self.done = PageTree() + self.originPage = None continue
elif page.section(): @@ -1419,6 +1444,9 @@ continue
# Page exists, isnt a redirect, and is a plain link (no section) + if self.originPage is None: + # the 1st existig page becomes the origin page, if none was supplied + self.originPage = page try: iw = page.interwiki() except pywikibot.NoSuchSite: @@ -1665,10 +1693,11 @@ if self.forcedStop: # autonomous with problem pywikibot.output(u"======Aborted processing %s======" % self.originPage.aslink(True)) return - if self.originPage.isRedirectPage(): - return - if self.originPage.isCategoryRedirect(): - return + if self.originPage: + if self.originPage.isRedirectPage(): + return + if self.originPage.isCategoryRedirect(): + return if not self.untranslated and globalvar.untranslatedonly: return # The following check is not always correct and thus disabled. @@ -1677,14 +1706,18 @@ # if len(self.done) == 1: # # No interwiki at all # return - pywikibot.output(u"======Post-processing %s======" % self.originPage.aslink(True)) + if self.originPage: + pywikibot.output(u"======Post-processing %s======" % self.originPage.aslink(True)) # Assemble list of accepted interwiki links new = self.assemble() if new is None: # User said give up pywikibot.output(u"======Aborted processing %s======" % self.originPage.aslink(True)) return + if not len(new): # nothing else to do + return
# Make sure new contains every page link, including the page we are processing + # TODO: should be moved to assemble() # replaceLinks will skip the site it's working on. if self.originPage.site() not in new: if not self.originPage.site().family.interwiki_forward: #TODO: make this possible as well. @@ -2155,7 +2188,8 @@ else: mode = 'written' f = codecs.open(dumpfn, mode[0], 'utf-8') for subj in self.subjects: - f.write(subj.originPage.aslink(None)+'\n') + if subj.originPage: + f.write(subj.originPage.aslink(None)+'\n') f.close() pywikibot.output(u'Dump %s (%s) %s.' % (site.lang, site.family.name, mode)) return dumpfn @@ -2430,6 +2464,7 @@
def main(): singlePageTitle = [] + opthintsonly = False start = None # Which namespaces should be processed? # default to [] which means all namespaces will be processed @@ -2485,6 +2520,8 @@ optRestore = not globalvar.restoreAll elif arg == '-continue': optContinue = True + elif arg == '-hintsonly': + opthintsonly = True elif arg.startswith('-namespace:'): try: namespaces.append(int(arg[11:])) @@ -2593,9 +2630,12 @@ readWarnfile(warnfile, bot) else: singlePageTitle = ' '.join(singlePageTitle) - if not singlePageTitle: - singlePageTitle = pywikibot.input(u'Which page to check:') - singlePage = pywikibot.Page(pywikibot.getSite(), singlePageTitle) + if not singlePageTitle and not opthintsonly: + singlePageTitle = pywikibot.input(u'Which page to check:') + if singlePageTitle: + singlePage = pywikibot.Page(pywikibot.getSite(), singlePageTitle) + else: + singlePage = None bot.add(singlePage, hints = globalvar.hints)
try:
pywikipedia-svn@lists.wikimedia.org