Revision: 6355 Author: purodha Date: 2009-02-17 14:36:30 +0000 (Tue, 17 Feb 2009)
Log Message: ----------- 1) Message updated. 2) Parameter set added: -api -start -until -number 3) Action type parameter added: "both" 4) Generator added that yields each type of redirect via API 5) New generator and action only poorly & superficially patched onto existing code. Note: the various generators are highly inconsistent; a small part of API generator code is in bot instead of generator; -namespaces nonstandard and partially broken; summary messages may be missing for "both -api" in some cases; code should be heavily cleaned up and straightened.
Modified Paths: -------------- trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py =================================================================== --- trunk/pywikipedia/redirect.py 2009-02-16 22:16:53 UTC (rev 6354) +++ trunk/pywikipedia/redirect.py 2009-02-17 14:36:30 UTC (rev 6355) @@ -6,35 +6,56 @@
Syntax:
- python redirect.py action [-argument] + python redirect.py action [-arguments ...]
where action can be one of these:
double Fix redirects which point to other redirects broken Delete redirects where targets don't exist. Requires adminship. +both Both of the above. Permitted only with -api. Implies -api.
-and argument can be: +and arguments can be:
-xml Retrieve information from a local XML dump (http://download.wikimedia.org). Argument can also be given as - "-xml:filename.xml". If this argument isn't given, info will be - loaded from a special page of the live wiki. + "-xml:filename.xml". Cannot be used with -api or -moves. + If neither of -xml -api -moves is given, info will be loaded from + a special page of the live wiki.
--namespace:n Namespace to process. Works only with an XML dump. +-api Retrieve information from the wiki via MediaWiki's application + programming interface (API). Cannot be used with -xml or -moves. + If neither of -xml -api -moves is given, info will be loaded from + a special page of the live wiki.
+-moves Use the page move log to find double-redirect candidates. Only + works with action "double", does not work with either -xml, or -api. + If neither of -xml -api -moves is given, info will be loaded from + a special page of the live wiki. + +-namespace:n Namespace to process. Works only with an XML dump, or the API + interface. Can be given multiple times, for several namespaces. + If omitted, with -xml all namespaces are treated, with -api + only the main (article) namespace is treated. + -offset:n With -xml, the number of the redirect to restart with (see progress). With -moves, the number of hours ago to start scanning moved pages. Otherwise, ignored.
--moves Instead of using Special:Doubleredirects, use the page move - log to find double-redirect candidates (only works with - action "double", does not work with -xml) +-start:title With -api, the starting page title in each namespace. + Otherwise ignored. Page need not exist.
+-until:title With -api, the possible last page title in each namespace. + Otherwise ignored. Page need not exist. + +-number:n With -api, the maximum count of redirects to work upon. + Otherwise ignored. Use 0 for unlimited. + -always Don't prompt you for each replacement.
""" # -# (C) Daniel Herding, 2004 +# (C) Daniel Herding, 2004. +# Purodha Blissenbach, 2009. # # Distributed under the terms of the MIT license. # @@ -102,7 +123,7 @@ 'ka': u'რობოტი: გადამისამართებული გვერდი არ არსებობს', 'ko': u'로봇: 끊긴 넘겨주기', 'kk': u'Бот: Айдату нысанасы жоқ болды', - 'ksh':u'Bot: Dė Ömlëijdong jingk ennet Liiere', + 'ksh':u'Bot: Dė [[Special:BrokenRedirects|Ömlëijdong jingk ennet Liiere]]', 'lt': u'robotas: Peradresavimas į niekur', 'nds':u'Bot: Kaputte Wiederleiden rutmakt', 'nl': u'Bot: doelpagina doorverwijzing bestaat niet', @@ -141,11 +162,16 @@
class RedirectGenerator: def __init__(self, xmlFilename=None, namespaces=[], offset=-1, - use_move_log=False): + use_move_log=False, + use_api=False, start=None, until=None, number=None): self.xmlFilename = xmlFilename self.namespaces = namespaces self.offset = offset self.use_move_log = use_move_log + self.use_api = use_api + self.api_start = start + self.api_until = until + self.api_number = number
def get_redirects_from_dump(self, alsoGetPageTitles = False): ''' @@ -217,8 +243,170 @@ else: return redict
+ def get_redirect_pageids_via_api(self, number = u'max', namespaces = [], site = None, + start = None, until = None ): + """ + Generator which will yield page IDs of Pages that are redirects. + Get number of page ids in one go. + Iterates over namespaces, Main if an empty list. + In each namespace, start alphabetically from a pagetitle start, which need not exist. + """ + # wikipedia.output(u'====> get_redirect_pageids_via_api(number=%s, #ns=%d, start=%s, until=%s)' % (number, len(namespaces), start, until)) + import urllib + if site is None: + site = wikipedia.getSite() + if namespaces == []: + namespaces = [ 0 ] + apiQ0 = site.api_address() + apiQ0 += 'action=query' + apiQ0 += '&list=allpages' + apiQ0 += '&apfilterredir=redirects' + apiQ0 += '&aplimit=%s' % number + apiQ0 += '&format=xml' + apPageTitleRe = re.compile(' pageid="(.*?)" .*? title="(.*?)"') + apPageIdRe = re.compile(' pageid="(.*?)"') + apfromRe = re.compile(' apfrom="(.*?)"') + for ns in namespaces: + # print (ns) + apiQns = apiQ0 + '&apnamespace=%s' % ns + # print (apiQns) + while apiQns: + apiQ = apiQns + if start: + apiQ += '&apfrom=%s' % urllib.quote(start.encode(site.encoding())) + # print (apiQ) + result = site.getUrl(apiQ) + # wikipedia.output(u'===RESULT===\n%s\n' % result) + if until: + for (pageid, pagetitle) in apPageTitleRe.findall(result): + # wikipedia.output(u'===PAGEID=%s: %s' % (pageid, pagetitle)) ## TODO: make this a -verbose mode output, independent of -until + if pagetitle > until: + apiQns = None + break + yield pageid + else: + for pageid in apPageIdRe.findall(result): + # wikipedia.output(u'===PAGEID=%s' % pageid) + yield pageid + m = apfromRe.search(result) + if m: + start = m.group(1) + else: + break + + def _next_redirects_via_api_commandline(self, apiQi, number = 'max', namespaces = [], + site = None, start = None, until = None ): + """ + yields commands to the api for checking a set of page ids. 
+ """ + # wikipedia.output(u'====> _next_redirects_via_api_commandline(apiQi=%s, number=%s, #ns=%d, start=%s, until=%s)' % (apiQi, number, len(namespaces), start, until)) + if site is None: + site = wikipedia.getSite() + if namespaces == []: + namespaces = [ 0 ] + maxurllen = 1018 # accommodate "GET " + apiQ + CR + LF in 1024 bytes. + apiQ = '' + for pageid in self.get_redirect_pageids_via_api(number = number, namespaces = namespaces, + site = site, start = start, until = until ): + if apiQ: + tmp = ( '%s|%s' % ( apiQ, pageid ) ) + else: + tmp = ( '%s%s' % ( apiQi, pageid ) ) + if len(tmp) > maxurllen and apiQ: + yield apiQ + tmp = '' + apiQ = tmp + if apiQ: + yield apiQ + + def get_redirects_via_api(self, number = u'max', namespaces = [], site = None, start = None, + until = None, maxlen = 8 ): + """ + Generator which will yield a tuple of data about Pages that are redirects: + 0 - page title of a redirect page + 1 - type of redirect: + 0 - broken redirect, target page title missing + 1 - normal redirect, target page exists and is not a redirect + 2..maxlen - start of a redirect chain of that many redirects + (currently, the API seems not to return sufficient data + to make these return values possible, but that may change) + maxlen+1 - start of an even longer chain, or a loop + (currently, the API seems not to return sufficient data + to allow these return values, but that may change) + None - start of a redirect chain of unknown length, or loop + 2 - target page title of the redirect, or chain (may not exist) + 3 - target page of the redirect, or end of chain, or page title where + chain or loop detection was halted, or None if unknown + Get number of page ids in one go. + Iterates over namespaces, Main if an empty list. + In each namespace, start alphabetically from a pagetitle start, which need not exist. 
+ """ + # wikipedia.output(u'====> get_redirects_via_api(number=%s, #ns=%d, start=%s, until=%s, maxlen=%s)' % (number, len(namespaces), start, until, maxlen)) + import urllib + if site is None: + site = wikipedia.getSite() + if namespaces == []: + namespaces = [ 0 ] + apiQ1 = site.api_address() + apiQ1 += 'action=query' + apiQ1 += '&redirects' + apiQ1 += '&format=xml' + apiQ1 += '&pageids=' + redirectRe = re.compile('<r from="(.*?)" to="(.*?)"') + missingpageRe = re.compile('<page .*? title="(.*?)" missing=""') + existingpageRe = re.compile('<page pageid=".*?" .*? title="(.*?)"') + for apiQ in self._next_redirects_via_api_commandline(apiQ1, number = number, + namespaces = namespaces, site = site, start = start, until = until ): + # wikipedia.output (u'===apiQ=%s' % apiQ) + result = site.getUrl(apiQ) + # wikipedia.output(u'===RESULT===\n%s\n' % result) + redirects = {} + pages = {} + for redirect in redirectRe.findall(result): + # wikipedia.output (u'R: %s => %s' % redirect) + redirects[redirect[0]] = redirect[1] + for pagetitle in missingpageRe.findall(result): + # wikipedia.output (u'M: %s' % pagetitle) + pages[pagetitle] = False + for pagetitle in existingpageRe.findall(result): + # wikipedia.output (u'P: %s' % pagetitle) + pages[pagetitle] = True + for redirect in redirects: + target = redirects[redirect] + result = 0 + final = None + try: + if pages[target]: + final = target + try: + while result <= maxlen: + result += 1 + final = redirects[final] + # result = None + except KeyError: + pass + except KeyError: + result = None + pass + yield (redirect, result, target, final) + # wikipedia.output (u'X%d: %s => %s ----> %s' % (result, redirect, target, final)) + def retrieve_broken_redirects(self): - if self.xmlFilename == None: + if self.use_api: + mysite = wikipedia.getSite() + count = 0 + for (pagetitle, type, target, final) in self.get_redirects_via_api( + namespaces = self.namespaces, + site = mysite, start = self.api_start, + until = self.api_until, maxlen = 
2): + if type == 0: + yield pagetitle + if self.api_number: + count += 1 + if count >= self.api_number: + break + + elif self.xmlFilename == None: # retrieve information from the live wiki's maintenance page mysite = wikipedia.getSite() # broken redirect maintenance page's URL @@ -246,7 +434,21 @@ yield key
def retrieve_double_redirects(self): - if self.xmlFilename == None: + if self.use_api: + mysite = wikipedia.getSite() + count = 0 + for (pagetitle, type, target, final) in self.get_redirects_via_api( + namespaces = self.namespaces, + site = mysite, start = self.api_start, + until = self.api_until, maxlen = 2): + if type != 0 and type != 1: + yield pagetitle + if self.api_number: + count += 1 + if count >= self.api_number: + break + + elif self.xmlFilename == None: if self.use_move_log: for redir_page in self.get_moved_pages_redirects(): yield redir_page.title() @@ -334,29 +536,38 @@ break offset_time = m.group(1)
- class RedirectRobot: - def __init__(self, action, generator, always=False): + def __init__(self, action, generator, always=False, number=None): self.action = action self.generator = generator self.always = always + self.number = number + self.exiting = False
def prompt(self, question): if not self.always: - choice = wikipedia.inputChoice(question, ['Yes', 'No', 'All'], - ['y', 'N', 'a'], 'N') + choice = wikipedia.inputChoice(question, ['Yes', 'No', 'All', 'Quit'], + ['y', 'N', 'a', 'q'], 'N') if choice == 'n': return False + elif choice == 'q': + self.exiting = True + return False elif choice == 'a': self.always = True return True
def delete_broken_redirects(self): + mysite = wikipedia.getSite() # get reason for deletion text - reason = wikipedia.translate(wikipedia.getSite(), reason_broken) + reason = wikipedia.translate(mysite, reason_broken) + for redir_name in self.generator.retrieve_broken_redirects(): + self.delete_1_broken_redirect(mysite, redir_name, reason) + if self.exiting: + break
- for redir_name in self.generator.retrieve_broken_redirects(): - redir_page = wikipedia.Page(wikipedia.getSite(), redir_name) + def delete_1_broken_redirect(self, mysite, redir_name, reason): + redir_page = wikipedia.Page(mysite, redir_name) # Show the title of the page we're working on. # Highlight the title in purple. wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" @@ -395,7 +606,13 @@
def fix_double_redirects(self): mysite = wikipedia.getSite() + summary = wikipedia.translate(mysite, msg_double) for redir_name in self.generator.retrieve_double_redirects(): + self.fix_1_double_redirect(mysite, redir_name, summary) + if self.exiting: + break + + def fix_1_double_redirect(self, mysite, redir_name, summary): redir = wikipedia.Page(mysite, redir_name) # Show the title of the page we're working on. # Highlight the title in purple. @@ -507,7 +724,32 @@ % (redir.title(), error)) break
+ def fix_double_or_delete_broken_redirects(self): + # TODO: part of this should be moved to generator, the rest merged into self.run() + mysite = wikipedia.getSite() + # get reason for deletion text + delete_reason = wikipedia.translate(mysite, reason_broken) + double_summary = wikipedia.translate(mysite, msg_double) + count = 0 + for (redir_name, code, target, final) in self.generator.get_redirects_via_api( + namespaces = self.generator.namespaces, + site = mysite, start = self.generator.api_start, + until = self.generator.api_until, maxlen = 2): + if code == 1: + continue + elif code == 0: + self.delete_1_broken_redirect(mysite, redir_name, delete_reason) + count += 1 + else: + self.fix_1_double_redirect(mysite, redir_name, double_summary) + count += 1 + # print ('%s .. %s' % (count, self.number)) + if self.exiting or ( self.number and count >= self.number ): + break + def run(self): + # TODO: make all generators return a redirect type indicator, + thus make them usable with 'both' if self.action == 'double': # get summary text wikipedia.setAction( @@ -515,6 +757,8 @@ self.fix_double_redirects() elif self.action == 'broken': self.delete_broken_redirects() + elif self.action == 'both': + self.fix_double_or_delete_broken_redirects()
def main(*args): # read command line parameters @@ -532,12 +776,20 @@ # (only with dump); default to -1 which means all redirects are checked offset = -1 moved_pages = False + api = False + start = '' + until = '' + number = None always = False for arg in wikipedia.handleArgs(*args): if arg == 'double': action = 'double' elif arg == 'broken': action = 'broken' + elif arg == 'both': + action = 'both' + elif arg == '-api': + api = True elif arg.startswith('-xml'): if len(arg) == 4: xmlFilename = wikipedia.input( @@ -547,22 +799,40 @@ elif arg.startswith('-moves'): moved_pages = True elif arg.startswith('-namespace:'): + ns = arg[11:] + if ns == '': + ## "-namespace:" does NOT yield -namespace:0 further down the road! + ns = wikipedia.input( + u'Please enter a namespace by its number: ') +# u'Please enter a namespace by its name or number: ') TODO! at least for some generators. + if ns == '': + ns = '0' try: - namespaces.append(int(arg[11:])) + ns = int(ns) except ValueError: - namespaces.append(arg[11:]) +#-namespace:all Process all namespaces. Works only with the API read interface. +#-namespace:all Process all namespaces. Works only with the API read interface. + pass + if not ns in namespaces: + namespaces.append(ns) elif arg.startswith('-offset:'): offset = int(arg[8:]) + elif arg.startswith('-start:'): + start = arg[7:] + elif arg.startswith('-until:'): + until = arg[7:] + elif arg.startswith('-number:'): + number = int(arg[8:]) elif arg == '-always': always = True else: wikipedia.output(u'Unknown argument: %s' % arg)
- if not action: + if not action or (api and moved_pages) or (xmlFilename and moved_pages) or (api and xmlFilename): wikipedia.showHelp('redirect') else: - gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages) - bot = RedirectRobot(action, gen, always) + gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages, api, start, until, number) + bot = RedirectRobot(action, gen, always, number) bot.run()
if __name__ == '__main__':