Revision: 7815
Author: xqt
Date: 2009-12-22 14:39:37 +0000 (Tue, 22 Dec 2009)

Log Message:
-----------
retrieving movelog pages via API

Modified Paths:
--------------
trunk/pywikipedia/redirect.py
Modified: trunk/pywikipedia/redirect.py
===================================================================
--- trunk/pywikipedia/redirect.py 2009-12-22 10:07:33 UTC (rev 7814)
+++ trunk/pywikipedia/redirect.py 2009-12-22 14:39:37 UTC (rev 7815)
@@ -19,17 +19,15 @@
 -xml           Retrieve information from a local XML dump
                (http://download.wikimedia.org). Argument can also be given as
                "-xml:filename.xml". Cannot be used with -api or -moves.
-               If neither of -xml -api -moves is given, info will be loaded
-               from a special page of the live wiki.
+-moves         Use the page move log to find double-redirect candidates. Only
+               works with action "double", does not work with -xml. You may
+               use -api option for retrieving pages via API
+
 -api           Retrieve information from the wiki via MediaWikis application
-               program interface (API). Cannot be used with -xml or -moves.
-               If neither of -xml -api -moves is given, info will be loaded
-               from a special page of the live wiki.
+               program interface (API). Cannot be used with -xml.
--moves         Use the page move log to find double-redirect candidates. Only
-               works with action "double", does not work with either -xml, or
-               -api. If neither of -xml -api -moves is given, info will be
+               NOTE: If neither of -xml -api -moves is given, info will be
                loaded from a special page of the live wiki.
 -namespace:n   Namespace to process. Works only with an XML dump, or the API
@@ -63,7 +61,7 @@
 from __future__ import generators
 import wikipedia, config, query
 import xmlreader
-import re, sys
+import re, sys, datetime
__version__='$Id$'
@@ -411,7 +409,7 @@ yield key
     def retrieve_double_redirects(self):
-        if self.use_api:
+        if self.use_api and not self.use_move_log:
             count = 0
             for (pagetitle, type, target, final) \
                     in self.get_redirects_via_api(maxlen=2):
@@ -424,7 +422,11 @@
elif self.xmlFilename == None: if self.use_move_log: - for redir_page in self.get_moved_pages_redirects(): + if config.use_api: + gen = self.get_moved_pages_redirects_via_api() + else: + gen = self.get_moved_pages_redirects() + for redir_page in gen: yield redir_page.title() return # retrieve information from the live wiki's maintenance page @@ -454,10 +456,46 @@ wikipedia.output(u'\nChecking redirect %i of %i...' % (num + 1, len(redict)))
+    def get_moved_pages_redirects_via_api(self):
+        if self.offset <= 0:
+            self.offset = 1
+        start = datetime.datetime.utcnow() \
+                - datetime.timedelta(0, self.offset*3600)
+        offset_time = start.strftime("%Y%m%d%H%M%S")
+        params = {
+            'action' :'query',
+            'list' :'logevents',
+            'letype' :'move',
+            'leprop' :'title|details',
+            'lelimit' : '500',
+            'lestart' : offset_time,
+        }
+        data = query.GetData(params, encodeTitle = False)#['query']['logevents']
+        if 'warnings' in data:
+            raise
+        allmoves = data['query']['logevents']
+        wikipedia.output(u'Retrieving %d moved pages via API...' % len(allmoves))
+        if wikipedia.verbose:
+            wikipedia.output(u"[%s]" % offset_time)
+        for moved in allmoves:
+            moved_page = wikipedia.Page(self.site, moved['title'])
+            try:
+                if not moved_page.isRedirectPage():
+                    continue
+            except wikipedia.BadTitle:
+                continue
+            except wikipedia.ServerError:
+                continue
+            try:
+                for page in moved_page.getReferences(follow_redirects=True, redirectsOnly=True):
+                    yield page
+            except wikipedia.NoPage:
+                # original title must have been deleted after move
+                continue
+
     def get_moved_pages_redirects(self):
         '''generate redirects to recently-moved pages'''
         # this will run forever, until user interrupts it
-        import datetime
move_regex = re.compile( r'moved <a href.*?>(.*?)</a> to <a href=.*?>.*?</a>.*?</li>') @@ -823,7 +861,7 @@ else: wikipedia.output(u'Unknown argument: %s' % arg)
-    if not action or (api and moved_pages) or (xmlFilename and moved_pages)\
+    if not action or (xmlFilename and moved_pages)\
                      or (api and xmlFilename):
         wikipedia.showHelp('redirect')
     else:
pywikipedia-svn@lists.wikimedia.org