Revision: 4619
Author:   yurik
Date:     2007-11-30 05:31:33 +0000 (Fri, 30 Nov 2007)
Log Message:
-----------
Updated query library & casechecker script to use the new API
Modified Paths:
--------------
    trunk/pywikipedia/casechecker.py
    trunk/pywikipedia/query.py
Modified: trunk/pywikipedia/casechecker.py
===================================================================
--- trunk/pywikipedia/casechecker.py	2007-11-29 16:16:45 UTC (rev 4618)
+++ trunk/pywikipedia/casechecker.py	2007-11-30 05:31:33 UTC (rev 4619)
@@ -74,9 +74,9 @@
     langs = {
         'ru': {
-            'alphabet' : u'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя',
-            'localsuspects': u'АаВЕеКкМНОоРрСсТуХх',
-            'latinsuspects': u'AaBEeKkMHOoPpCcTyXx',
+            'alphabet' : u'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯяІі',
+            'localsuspects': u'АаВЕеКкМНОоРрСсТуХхІі',
+            'latinsuspects': u'AaBEeKkMHOoPpCcTyXxIi',
             },
         'uk': {
             'alphabet' : u'АаБбВвГ㥴ДдЕеЄєЖжЗзИиІіЇїЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЮюЯяЬь',
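The two suspect strings in this hunk are positionally paired: character i of
'localsuspects' has its Latin twin at position i of 'latinsuspects', which is
why the new Іі/Ii pair is appended to both strings in the same order. As a
rough sketch of how such a pairing turns into lookup tables (variable names
here are illustrative, not casechecker.py's own):

    # -*- coding: utf-8  -*-
    localsuspects = u'АаВЕеКкМНОоРрСсТуХхІі'   # Cyrillic letters with Latin twins
    latinsuspects = u'AaBEeKkMHOoPpCcTyXxIi'   # the Latin lookalikes, same order

    assert len(localsuspects) == len(latinsuspects)   # pairing must stay aligned
    toLatin    = dict(zip(localsuspects, latinsuspects))
    toCyrillic = dict(zip(latinsuspects, localsuspects))

    print toLatin[u'І']    # -> the Latin letter 'I'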
@@ -111,7 +111,6 @@
     title = None
     replace = False
     stopAfter = 0
-    verbose = False
     wikilog = None
     wikilogfile = 'wikilog.txt'
     autonomous = False
@@ -136,9 +135,7 @@
                 self.replace = True
             elif arg.startswith('-limit:'):
                 self.stopAfter = int(arg[7:])
-            elif arg == '-verbose':
-                self.verbose = True
-            elif arg == '-autonomous':
+            elif arg == '-autonomous' or arg == '-a':
                 self.autonomous = True
             elif arg.startswith('-ns:'):
                 self.namespaces.append( int(arg[4:]) )
@@ -155,14 +152,15 @@
         else:
             self.namespaces = [0]

-        self.params = {'what'           : 'allpages',
-                       'aplimit'        : self.aplimit,
-                       'apfilterredir'  : 'nonredirects',
-                       'noprofile'      : '' }
-
+        self.params = { 'action'        : 'query',
+                        'generator'     : 'allpages',
+                        'gaplimit'      : self.aplimit,
+                        'gapfilterredir': 'nonredirects'}
+
         if self.links:
-            self.params['what'] += '|links|categories';
+            self.params['prop'] = 'links|categories'
+        self.site = wikipedia.getSite()
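With these parameters the script now talks to api.php instead of query.php.
For a rough illustration only (the URL is hand-assembled, not produced by the
commit; the hostname and the gaplimit value of 500 are assumptions, gaplimit
being whatever self.aplimit holds), a generated request would look like:

    http://ru.wikipedia.org/w/api.php?action=query&generator=allpages
        &gapnamespace=0&gapfilterredir=nonredirects&gaplimit=500
        &prop=links|categories&format=json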
         if self.site.lang in self.langs:
@@ -200,46 +198,53 @@
         count = 0
         lastLetter = ''
         for namespace in self.namespaces:
-            self.params['apnamespace'] = namespace
+            self.params['gapnamespace'] = namespace
             title = None
             while True:
                 # Get data
-                self.params['apfrom'] = self.apfrom
-                data = query.GetData(self.site.lang, self.params, self.verbose)
+                self.params['gapfrom'] = self.apfrom
+                data = query.GetData(self.site.lang, self.params, wikipedia.verbose, True)
                 try:
-                    self.apfrom = data['query']['allpages']['next']
+                    self.apfrom = data['query-continue']['allpages']['gapfrom']
                 except:
                     self.apfrom = None
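The continuation bookkeeping changes shape here: query.php reported the next
start title under data['query']['allpages']['next'], while api.php puts it in
a separate query-continue block. Stripped of the casechecker specifics, the
paging loop amounts to this sketch (site and params are placeholders the
caller is assumed to have set up):

    apfrom = ''
    while apfrom is not None:
        params['gapfrom'] = apfrom
        data = query.GetData(site.lang, params, wikipedia.verbose, True)  # useAPI = True
        try:
            # query-continue is present only while more pages remain
            apfrom = data['query-continue']['allpages']['gapfrom']
        except KeyError:
            apfrom = None
        # ... process data['query']['pages'] here ...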
                 # Process received data
-                if 'pages' in data:
+                if 'query' in data and 'pages' in data['query']:
                     firstItem = True
-                    for pageID, page in data['pages'].iteritems():
+                    for pageID, page in data['query']['pages'].iteritems():
                         printed = False
                         title = page['title']
                         if firstItem:
                             if lastLetter != title[0]:
-                                print 'Processing ' + title
+                                try:
+                                    print 'Processing ' + title
+                                except:
+                                    print 'Processing unprintable title'
                                 lastLetter = title[0]
                             firstItem = False
                         if self.titles:
                             err = self.ProcessTitle(title)
                             if err:
-                                if page['ns'] == 14:
-                                    self.WikiLog(u"* Move category content: " + err[0])
-                                else:
-                                    changed = False
-                                    if self.replace:
-                                        newTitle = self.PickTarget(False, title, title, err[1])
-                                        if newTitle:
-                                            src = wikipedia.Page(self.site, title)
-                                            src.move( newTitle, wikipedia.translate(self.site, self.msgRename))
-                                            changed = True
+                                changed = False
+                                if self.replace:
+                                    newTitle = self.PickTarget(False, title, title, err[1])
+                                    if newTitle:
+                                        editSummary = wikipedia.translate(self.site, self.msgRename)
+                                        src = wikipedia.Page(self.site, title)
+                                        if page['ns'] == 14:
+                                            import category
+                                            dst = wikipedia.Page(self.site, newTitle)
+                                            bot = category.CategoryMoveRobot(src.titleWithoutNamespace(), dst.titleWithoutNamespace(), self.autonomous, editSummary, True)
+                                            bot.run()
+                                        else:
+                                            src.move(newTitle, editSummary)
+                                        changed = True
-                                    if not changed:
-                                        self.WikiLog(u"* " + err[0])
-                                        printed = True
+                                if not changed:
+                                    self.WikiLog(u"* " + err[0])
+                                    printed = True
                         if self.links:
                             allLinks = None
@@ -257,7 +262,7 @@
                             msg = []
                             for l in allLinks:
-                                ltxt = l['*']
+                                ltxt = l['title']
                                 err = self.ProcessTitle(ltxt)
                                 if err:
                                     newTitle = None
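The one-character change from l['*'] to l['title'] mirrors how the two
backends serialize link entries. Schematically (field sets paraphrased, not
captured output; the exact fields depend on the MediaWiki version):

    # query.php link entry (old):   {u'*': u'Some title'}
    # api.php link entry (new):     {u'ns': 0, u'title': u'Some title'}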
Modified: trunk/pywikipedia/query.py
===================================================================
--- trunk/pywikipedia/query.py	2007-11-29 16:16:45 UTC (rev 4618)
+++ trunk/pywikipedia/query.py	2007-11-30 05:31:33 UTC (rev 4619)
@@ -4,8 +4,9 @@
 import wikipedia
 import simplejson
 import urllib
+import time
-def GetData( lang, params, verbose = False ):
+def GetData( lang, params, verbose = False, useAPI = False, retryCount = 5 ):
     """Get data from the query api, and convert it into a data object
     """
     site = wikipedia.getSite( lang )
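Existing callers are unaffected, since useAPI defaults to False and the old
query.php behaviour stays the default; opting into api.php is a matter of
passing True. A hypothetical call under the new signature (the params shown
are illustrative, not taken from the commit):

    import query
    data = query.GetData( 'ru',
                          { 'action'    : 'query',
                            'generator' : 'allpages',
                            'gaplimit'  : 10 },
                          verbose = False, useAPI = True )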
@@ -15,7 +16,9 @@
             params[k] = unicode(v)

     params['format'] = 'json'
-    params['noprofile'] = ''
+
+    if not useAPI:
+        params['noprofile'] = ''
     for k,v in params.iteritems():
         if type(v) == type(u''):
@@ -30,29 +33,45 @@
     data = None
     titlecount = 0
-    path = u"/w/query.php?" + urllib.urlencode( params.items() )
+    if useAPI:
+        path = site.api_address() + urllib.urlencode( params.items() )
+    else:
+        path = site.query_address() + urllib.urlencode( params.items() )
     if verbose:
-        wikipedia.output( u"Requesting %d titles from %s:%s" % (titlecount, lang, path) )
+        if titlecount > 0:
+            wikipedia.output( u"Requesting %d titles from %s:%s" % (titlecount, lang, path) )
+        else:
+            wikipedia.output( u"Request %s:%s" % (lang, path) )
-    url = site.family.querypath(lang)
-
-    retryCount = 1
-
+    lastError = None
+    retry_idle_time = 5
     while retryCount >= 0:
         try:
+            jsontext = "Nothing received"
             jsontext = site.getUrl( path, retry=True, data=data )
             # This will also work, but all unicode strings will need to be converted from \u notation
             # decodedObj = eval( jsontext )
-            decodedObj = simplejson.loads( jsontext )
+            return simplejson.loads( jsontext )
             break
         except ValueError, error:
             retryCount -= 1
             wikipedia.output( u"Error downloading data: %s" % error )
+            wikipedia.output( u"Request %s:%s" % (lang, path) )
+            wikipedia.debugDump('ApiGetDataParse', site, str(error) + '\n' + path, jsontext)
+            lastError = error
+            if retryCount >= 0:
+                wikipedia.output( u"Retrying in %i seconds..." % retry_idle_time )
+                time.sleep(retry_idle_time)
+                # Next time wait longer, but no longer than five minutes
+                retry_idle_time *= 2
+                if retry_idle_time > 300:
+                    retry_idle_time = 300
+
-    return decodedObj
+    raise lastError
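Failed parses now retry with an exponentially growing pause: 5 seconds,
doubling after each failure, capped at 300 seconds (five minutes), with the
last ValueError re-raised once retryCount is used up. The same pattern in
isolation (a generic sketch, not the library's code; 'fetch' stands for any
callable that may raise ValueError):

    import time

    def withRetries( fetch, retryCount = 5, idle = 5, cap = 300 ):
        lastError = None
        while retryCount >= 0:
            try:
                return fetch()
            except ValueError, error:     # e.g. truncated or malformed JSON
                lastError = error
                retryCount -= 1
                if retryCount >= 0:
                    time.sleep( idle )
                    idle = min( idle * 2, cap )   # double the wait, capped
        raise lastError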

 def GetInterwikies( lang, titles, extraParams = None ):
     """ Usage example: data = GetInterwikies('ru','user:yurik')