Revision: 5174 Author: russblau Date: 2008-04-01 20:06:31 +0000 (Tue, 01 Apr 2008)
Log Message: ----------- Add option to use moved-pages log as a source of double-redirects (this will mainly be useful on larger wikis *cough*English Wikipedia*cough* where the double-redirects accumulate faster than the Special: page can keep up with them). Also fix one message.
Modified Paths: -------------- trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py =================================================================== --- trunk/pywikipedia/redirect.py 2008-04-01 17:49:40 UTC (rev 5173) +++ trunk/pywikipedia/redirect.py 2008-04-01 20:06:31 UTC (rev 5174) @@ -25,6 +25,10 @@ -offset:n Number of redirect to restart with (see progress). Works only with an XML dump.
+-moves Instead of using Special:Doubleredirects, use the page move + log to find double-redirect candidates (only works with + action "double", does not work with -xml) + -always Don't prompt you for each replacement.
""" @@ -103,10 +107,12 @@ }
class RedirectGenerator: - def __init__(self, xmlFilename = None, namespaces = [], offset = -1): + def __init__(self, xmlFilename=None, namespaces=[], offset=-1, + use_move_log=False): self.xmlFilename = xmlFilename self.namespaces = namespaces self.offset = offset + self.use_move_log = use_move_log
def get_redirects_from_dump(self, alsoGetPageTitles = False): ''' @@ -208,6 +214,10 @@
def retrieve_double_redirects(self): if self.xmlFilename == None: + if self.use_move_log: + for redir_page in self.get_moved_pages_redirects(): + yield redir_page.title() + return mysite = wikipedia.getSite() # retrieve information from the live wiki's maintenance page # double redirect maintenance page's URL @@ -236,8 +246,44 @@ wikipedia.output(u'\nChecking redirect %i of %i...' % (num + 1, len(redict)))
+ move_regex = re.compile( + r'<li>.*?<a href="/wiki/User:.*?>.*?</a> ' + r'(<a href="/wiki/User_talk:.*?>Talk</a> | ' + r'<a href="/wiki/Special:Contributions/.*?>contribs</a>) ' + r'moved <a href="/w/index.php?title=.*?>(.*?)</a> to ' + r'<a href="/wiki/.*?>.*?</a>.*?</li>' ) + + def get_moved_pages_redirects(self): + '''generate redirects to recently-moved pages''' + offset=0 + site = wikipedia.getSite() + while offset <= 10000: # Can't access more than 10000 log entries + move_url = \ + "/w/index.php?title=Special:Log&limit=500&offset=%i&type=move"\ + % offset + try: + move_list = site.getUrl(move_url) +# wikipedia.output(u"[%i]" % offset) + except: + import traceback + traceback.print_exc() + return + for moved_page in self.move_regex.findall(move_list): + # moved_page is now a redirect, so any redirects pointing + # to it need to be changed + try: + for page in wikipedia.Page(site, moved_page + ).getReferences(follow_redirects=True, + redirectsOnly=True): + yield page + except wikipedia.NoPage: + # original title must have been deleted after move + continue + offset += 500 + + class RedirectRobot: - def __init__(self, action, generator, always = False): + def __init__(self, action, generator, always=False): self.action = action self.generator = generator self.always = always @@ -308,7 +354,7 @@ elif len(redirList) == 2: wikipedia.output( u'Skipping: Redirect target %s is not a redirect.' 
- % redir.aslink()) + % newRedir.aslink()) break # do nothing except wikipedia.SectionError: wikipedia.output( @@ -413,6 +459,7 @@ # at which redirect shall we start searching double redirects again # (only with dump); default to -1 which means all redirects are checked offset = -1 + moved_pages = False always = False for arg in wikipedia.handleArgs(): if arg == 'double': @@ -425,6 +472,8 @@ u'Please enter the XML dump's filename: ') else: xmlFilename = arg[5:] + elif arg.startswith('-moves'): + moved_pages = True elif arg.startswith('-namespace:'): try: namespaces.append(int(arg[11:])) @@ -440,7 +489,7 @@ if not action: wikipedia.showHelp('redirect') else: - gen = RedirectGenerator(xmlFilename, namespaces, offset) + gen = RedirectGenerator(xmlFilename, namespaces, offset, moved_pages) bot = RedirectRobot(action, gen, always) bot.run()