Revision: 6355 Author: purodha Date: 2009-02-17 14:36:30 +0000 (Tue, 17 Feb 2009)
Log Message: ----------- 1) Message updated. 2) Parameter set added: -api -start -until -number 3) Action type parameter added: "both" 4) Generator added that yields each type of redirect via API 5) New generator and action only poorly & superficially patched onto existing code. Note: the various generators are highly inconsistent; a small part of API generator code is in bot instead of generator; -namespaces nonstandard and partially broken; summary messages may be missing for "both -api" in some cases; code should be heavily cleaned up and straightened.
Modified Paths: -------------- trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py =================================================================== --- trunk/pywikipedia/redirect.py 2009-02-16 22:16:53 UTC (rev 6354) +++ trunk/pywikipedia/redirect.py 2009-02-17 14:36:30 UTC (rev 6355) @@ -6,35 +6,56 @@
Syntax:
- python redirect.py action [-argument] + python redirect.py action [-arguments ...]
where action can be one of these:
double Fix redirects which point to other redirects broken Delete redirects where targets don't exist. Requires adminship. +both Both of the above. Permitted only with -api. Implies -api.
-and argument can be: +and arguments can be:
-xml Retrieve information from a local XML dump (http://download.wikimedia.org). Argument can also be given as - "-xml:filename.xml". If this argument isn't given, info will be - loaded from a special page of the live wiki. + "-xml:filename.xml". Cannot be used with -api or -moves. + If neither of -xml -api -moves is given, info will be loaded from + a special page of the live wiki.
--namespace:n Namespace to process. Works only with an XML dump. +-api Retrieve information from the wiki via MediaWiki's application + programming interface (API). Cannot be used with -xml or -moves. + If neither of -xml -api -moves is given, info will be loaded from + a special page of the live wiki.
+-moves Use the page move log to find double-redirect candidates. Only + works with action "double", does not work with either -xml, or -api. + If neither of -xml -api -moves is given, info will be loaded from + a special page of the live wiki. + +-namespace:n Namespace to process. Works only with an XML dump, or the API + interface. Can be given multiple times, for several namespaces. + If omitted, with -xml all namespaces are treated, with -api + only the main (article) namespace is treated. + -offset:n With -xml, the number of the redirect to restart with (see progress). With -moves, the number of hours ago to start scanning moved pages. Otherwise, ignored.
--moves Instead of using Special:Doubleredirects, use the page move - log to find double-redirect candidates (only works with - action "double", does not work with -xml) +-start:title With -api, the starting page title in each namespace. + Otherwise ignored. Page need not exist.
+-until:title With -api, the possible last page title in each namespace. + Otherwise ignored. Page need not exist. + +-number:n With -api, the maximum count of redirects to work upon. + Otherwise ignored. Use 0 for unlimited. + -always Don't prompt you for each replacement.
""" # -# (C) Daniel Herding, 2004 +# (C) Daniel Herding, 2004. +# Purodha Blissenbach, 2009. # # Distributed under the terms of the MIT license. # @@ -102,7 +123,7 @@ 'ka': u'რობოტი: გადამისამართებული გვერდი არ არსებობს', 'ko': u'로봇: 끊긴 넘겨주기', 'kk': u'Бот: Айдату нысанасы жоқ болды', - 'ksh':u'Bot: Dė Ömlëijdong jingk ennet Liiere', + 'ksh':u'Bot: Dė [[Special:BrokenRedirects|Ömlëijdong jingk ennet Liiere]]', 'lt': u'robotas: Peradresavimas į niekur', 'nds':u'Bot: Kaputte Wiederleiden rutmakt', 'nl': u'Bot: doelpagina doorverwijzing bestaat niet', @@ -141,11 +162,16 @@
class RedirectGenerator: def __init__(self, xmlFilename=None, namespaces=[], offset=-1, - use_move_log=False): + use_move_log=False, + use_api=False, start=None, until=None, number=None): self.xmlFilename = xmlFilename self.namespaces = namespaces self.offset = offset self.use_move_log = use_move_log + self.use_api = use_api + self.api_start = start + self.api_until = until + self.api_number = number
def get_redirects_from_dump(self, alsoGetPageTitles = False): ''' @@ -217,8 +243,170 @@ else: return redict
+ def get_redirect_pageids_via_api(self, number = u'max', namespaces = [], site = None, + start = None, until = None ): + """ + Generator which will yield page IDs of Pages that are redirects. + Get number of page ids in one go. + Iterates over namespaces, Main if an empty list. + In each namespace, start alphabetically from a pagetitle start, which need not exist. + """ + # wikipedia.output(u'====> get_redirect_pageids_via_api(number=%s, #ns=%d, start=%s, until=%s)' % (number, len(namespaces), start, until)) + import urllib + if site is None: + site = wikipedia.getSite() + if namespaces == []: + namespaces = [ 0 ] + apiQ0 = site.api_address() + apiQ0 += 'action=query' + apiQ0 += '&list=allpages' + apiQ0 += '&apfilterredir=redirects' + apiQ0 += '&aplimit=%s' % number + apiQ0 += '&format=xml' + apPageTitleRe = re.compile(' pageid="(.*?)" .*? title="(.*?)"') + apPageIdRe = re.compile(' pageid="(.*?)"') + apfromRe = re.compile(' apfrom="(.*?)"') + for ns in namespaces: + # print (ns) + apiQns = apiQ0 + '&apnamespace=%s' % ns + # print (apiQns) + while apiQns: + apiQ = apiQns + if start: + apiQ += '&apfrom=%s' % urllib.quote(start.encode(site.encoding())) + # print (apiQ) + result = site.getUrl(apiQ) + # wikipedia.output(u'===RESULT===\n%s\n' % result) + if until: + for (pageid, pagetitle) in apPageTitleRe.findall(result): + # wikipedia.output(u'===PAGEID=%s: %s' % (pageid, pagetitle)) ## TODO: make this a -verbose mode output, independent of -until + if pagetitle > until: + apiQns = None + break + yield pageid + else: + for pageid in apPageIdRe.findall(result): + # wikipedia.output(u'===PAGEID=%s' % pageid) + yield pageid + m = apfromRe.search(result) + if m: + start = m.group(1) + else: + break + + def _next_redirects_via_api_commandline(self, apiQi, number = 'max', namespaces = [], + site = None, start = None, until = None ): + """ + yields commands to the api for checking a set of page ids. 
+ """ + # wikipedia.output(u'====> _next_redirects_via_api_commandline(apiQi=%s, number=%s, #ns=%d, start=%s, until=%s)' % (apiQi, number, len(namespaces), start, until)) + if site is None: + site = wikipedia.getSite() + if namespaces == []: + namespaces = [ 0 ] + maxurllen = 1018 # accommodate "GET " + apiQ + CR + LF in 1024 bytes. + apiQ = '' + for pageid in self.get_redirect_pageids_via_api(number = number, namespaces = namespaces, + site = site, start = start, until = until ): + if apiQ: + tmp = ( '%s|%s' % ( apiQ, pageid ) ) + else: + tmp = ( '%s%s' % ( apiQi, pageid ) ) + if len(tmp) > maxurllen and apiQ: + yield apiQ + tmp = '' + apiQ = tmp + if apiQ: + yield apiQ + + def get_redirects_via_api(self, number = u'max', namespaces = [], site = None, start = None, + until = None, maxlen = 8 ): + """ + Generator which will yield a tuple of data about Pages that are redirects: + 0 - page title of a redirect page + 1 - type of redirect: + 0 - broken redirect, target page title missing + 1 - normal redirect, target page exists and is not a redirect + 2..maxlen - start of a redirect chain of that many redirects + (currently, the API seems not to return sufficient data + to make these return values possible, but that may change) + maxlen+1 - start of an even longer chain, or a loop + (currently, the API seems not to return sufficient data + to allow these return values, but that may change) + None - start of a redirect chain of unknown length, or loop + 2 - target page title of the redirect, or chain (may not exist) + 3 - target page of the redirect, or end of chain, or page title where + chain or loop detection was halted, or None if unknown + Get number of page ids in one go. + Iterates over namespaces, Main if an empty list. + In each namespace, start alphabetically from a pagetitle start, which need not exist. 
+ """ + # wikipedia.output(u'====> get_redirects_via_api(number=%s, #ns=%d, start=%s, until=%s, maxlen=%s)' % (number, len(namespaces), start, until, maxlen)) + import urllib + if site is None: + site = wikipedia.getSite() + if namespaces == []: + namespaces = [ 0 ] + apiQ1 = site.api_address() + apiQ1 += 'action=query' + apiQ1 += '&redirects' + apiQ1 += '&format=xml' + apiQ1 += '&pageids=' + redirectRe = re.compile('<r from="(.*?)" to="(.*?)"') + missingpageRe = re.compile('<page .*? title="(.*?)" missing=""') + existingpageRe = re.compile('<page pageid=".*?" .*? title="(.*?)"') + for apiQ in self._next_redirects_via_api_commandline(apiQ1, number = number, + namespaces = namespaces, site = site, start = start, until = until ): + # wikipedia.output (u'===apiQ=%s' % apiQ) + result = site.getUrl(apiQ) + # wikipedia.output(u'===RESULT===\n%s\n' % result) + redirects = {} + pages = {} + for redirect in redirectRe.findall(result): + # wikipedia.output (u'R: %s => %s' % redirect) + redirects[redirect[0]] = redirect[1] + for pagetitle in missingpageRe.findall(result): + # wikipedia.output (u'M: %s' % pagetitle) + pages[pagetitle] = False + for pagetitle in existingpageRe.findall(result): + # wikipedia.output (u'P: %s' % pagetitle) + pages[pagetitle] = True + for redirect in redirects: + target = redirects[redirect] + result = 0 + final = None + try: + if pages[target]: + final = target + try: + while result <= maxlen: + result += 1 + final = redirects[final] + # result = None + except KeyError: + pass + except KeyError: + result = None + pass + yield (redirect, result, target, final) + # wikipedia.output (u'X%d: %s => %s ----> %s' % (result, redirect, target, final)) + def retrieve_broken_redirects(self): - if self.xmlFilename == None: + if self.use_api: + mysite = wikipedia.getSite() + count = 0 + for (pagetitle, type, target, final) in self.get_redirects_via_api( + namespaces = self.namespaces, + site = mysite, start = self.api_start, + until = self.api_until, maxlen = 
2): + if type == 0: + yield pagetitle + if self.api_number: + count += 1 + if count >= self.api_number: + break + + elif self.xmlFilename == None: # retrieve information from the live wiki's maintenance page mysite = wikipedia.getSite() # broken redirect maintenance page's URL @@ -246,7 +434,21 @@ yield key
def retrieve_double_redirects(self): - if self.xmlFilename == None: + if self.use_api: + mysite = wikipedia.getSite() + count = 0 + for (pagetitle, type, target, final) in self.get_redirects_via_api( + namespaces = self.namespaces, + site = mysite, start = self.api_start, + until = self.api_until, maxlen = 2): + if type != 0 and type != 1: + yield pagetitle + if self.api_number: + count += 1 + if count >= self.api_number: + break + + elif self.xmlFilename == None: if self.use_move_log: for redir_page in self.get_moved_pages_redirects(): yield redir_page.title() @@ -334,29 +536,38 @@ break offset_time = m.group(1)
- class RedirectRobot: - def __init__(self, action, generator, always=False): + def __init__(self, action, generator, always=False, number=None): self.action = action self.generator = generator self.always = always + self.number = number + self.exiting = False
def prompt(self, question): if not self.always: - choice = wikipedia.inputChoice(question, ['Yes', 'No', 'All'], - ['y', 'N', 'a'], 'N') + choice = wikipedia.inputChoice(question, ['Yes', 'No', 'All', 'Quit'], + ['y', 'N', 'a', 'q'], 'N') if choice == 'n': return False + elif choice == 'q': + self.exiting = True + return False elif choice == 'a': self.always = True return True
def delete_broken_redirects(self): + mysite = wikipedia.getSite() # get reason for deletion text - reason = wikipedia.translate(wikipedia.getSite(), reason_broken) + reason = wikipedia.translate(mysite, reason_broken) + for redir_name in self.generator.retrieve_broken_redirects(): + self.delete_1_broken_redirect(mysite, redir_name, reason) + if self.exiting: + break
- for redir_name in self.generator.retrieve_broken_redirects(): - redir_page = wikipedia.Page(wikipedia.getSite(), redir_name) + def delete_1_broken_redirect(self, mysite, redir_name, reason): + redir_page = wikipedia.Page(mysite, redir_name) # Show the title of the page we're working on. # Highlight the title in purple. wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" @@ -395,7 +606,13 @@
def fix_double_redirects(self): mysite = wikipedia.getSite() + summary = wikipedia.translate(mysite, msg_double) for redir_name in self.generator.retrieve_double_redirects(): + self.fix_1_double_redirect(mysite, redir_name, summary) + if self.exiting: + break + + def fix_1_double_redirect(self, mysite, redir_name, summary): redir = wikipedia.Page(mysite, redir_name) # Show the title of the page we're working on. # Highlight the title in purple. @@ -507,7 +724,32 @@ % (redir.title(), error)) break
+ def fix_double_or_delete_broken_redirects(self): + # TODO: part of this should be moved to generator, the rest merged into self.run() + mysite = wikipedia.getSite() + # get reason for deletion text + delete_reason = wikipedia.translate(mysite, reason_broken) + double_summary = wikipedia.translate(mysite, msg_double) + count = 0 + for (redir_name, code, target, final) in self.generator.get_redirects_via_api( + namespaces = self.generator.namespaces, + site = mysite, start = self.generator.api_start, + until = self.generator.api_until, maxlen = 2): + if code == 1: + continue + elif code == 0: + self.delete_1_broken_redirect(mysite, redir_name, delete_reason) + count += 1 + else: + self.fix_1_double_redirect(mysite, redir_name, double_summary) + count += 1 + # print ('%s .. %s' % (count, self.number)) + if self.exiting or ( self.number and count >= self.number ): + break + def run(self): + # TODO: make all generators return a redirect type indicator, + thus make them usable with 'both' if self.action == 'double': # get summary text wikipedia.setAction( @@ -515,6 +757,8 @@ self.fix_double_redirects() elif self.action == 'broken': self.delete_broken_redirects() + elif self.action == 'both': + self.fix_double_or_delete_broken_redirects()
def main(*args): # read command line parameters @@ -532,12 +776,20 @@ # (only with dump); default to -1 which means all redirects are checked offset = -1 moved_pages = False + api = False + start = '' + until = '' + number = None always = False for arg in wikipedia.handleArgs(*args): if arg == 'double': action = 'double' elif arg == 'broken': action = 'broken' + elif arg == 'both': + action = 'both' + elif arg == '-api': + api = True elif arg.startswith('-xml'): if len(arg) == 4: xmlFilename = wikipedia.input( @@ -547,22 +799,40 @@ elif arg.startswith('-moves'): moved_pages = True elif arg.startswith('-namespace:'): + ns = arg[11:] + if ns == '': + ## "-namespace:" does NOT yield -namespace:0 further down the road! + ns = wikipedia.input( + u'Please enter a namespace by its number: ') +# u'Please enter a namespace by its name or number: ') TODO! at least for some generators. + if ns == '': + ns = '0' try: - namespaces.append(int(arg[11:])) + ns = int(ns) except ValueError: - namespaces.append(arg[11:]) +#-namespace:all Process all namespaces. Works only with the API read interface. +#-namespace:all Process all namespaces. Works only with the API read interface. + pass + if not ns in namespaces: + namespaces.append(ns) elif arg.startswith('-offset:'): offset = int(arg[8:]) + elif arg.startswith('-start:'): + start = arg[7:] + elif arg.startswith('-until:'): + until = arg[7:] + elif arg.startswith('-number:'): + number = int(arg[8:]) elif arg == '-always': always = True else: wikipedia.output(u'Unknown argument: %s' % arg)
- if not action: + if not action or (api and moved_pages) or (xmlFilename and moved_pages) or (api and xmlFilename): wikipedia.showHelp('redirect') else: - gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages) - bot = RedirectRobot(action, gen, always) + gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages, api, start, until, number) + bot = RedirectRobot(action, gen, always, number) bot.run()
if __name__ == '__main__':