Revision: 7068
Author:   alexsh
Date:     2009-07-15 20:23:26 +0000 (Wed, 15 Jul 2009)
Log Message:
-----------
site().linksearch: add API method (took 2 hours to debug... Orz)
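The change teaches Site.linksearch() to query the MediaWiki API when config.use_api is set, falling back to scraping Special:Linksearch otherwise. A minimal usage sketch of the generator, assuming a configured pywikipedia (compat) checkout with a user-config.py; the domain is only an example:

    import wikipedia   # pywikipedia core module, as modified in this revision

    site = wikipedia.getSite()
    # Yields Page objects for pages containing links to the given site;
    # with config.use_api enabled this now goes through list=exturlusage.
    for page in site.linksearch('example.com', limit=100):
        wikipedia.output(page.title())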
Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2009-07-15 19:25:32 UTC (rev 7067)
+++ trunk/pywikipedia/wikipedia.py	2009-07-15 20:23:26 UTC (rev 7068)
@@ -5808,44 +5808,74 @@
     def linksearch(self, siteurl, limit=500):
         """Yield Pages from results of Special:Linksearch for 'siteurl'."""
-        output(u'Querying [[Special:Linksearch]]...')
         cache = []
         R = re.compile('title ?="([^<>]*?)">[^<>]*</a></li>')
         urlsToRetrieve = [siteurl]
         if not siteurl.startswith('*.'):
             urlsToRetrieve.append('*.' + siteurl)
-        for url in urlsToRetrieve:
-            offset = 0
-            while True:
-                path = self.linksearch_address(url, limit=limit, offset=offset)
-                get_throttle()
-                html = self.getUrl(path)
-                #restricting the HTML source :
-                #when in the source, this div marks the beginning of the input
-                loc = html.find('<div class="mw-spcontent">')
-                if loc > -1:
-                    html = html[loc:]
-                #when in the source, marks the end of the linklist
-                loc = html.find('<div class="printfooter">')
-                if loc > -1:
-                    html = html[:loc]
+        if config.use_api:
+            output(u'Querying API...')
+            for url in urlsToRetrieve:
+                params = {
+                    'action': 'query',
+                    'list' : 'exturlusage',
+                    'eulimit': limit,
+                    'euquery': url,
+                }
+                keepGo = True
+                while keepGo:
+                    data = query.GetData(params, useAPI = True)
+                    if data['query']['exturlusage'] == []:
+                        break
+
+                    if data.has_key(u'query-continue'):
+                        params['euoffset'] = data[u'query-continue'][u'exturlusage'][u'euoffset']
+                    else:
+                        keepGo = False
-                #our regex fetches internal page links and the link they contain
-                links = R.findall(html)
-                if not links:
-                    #no more page to be fetched for that link
-                    break
-                for title in links:
-                    if not siteurl in title:
-                        # the links themselves have similar form
-                        if title in cache:
-                            continue
-                        else:
-                            cache.append(title)
-                            yield Page(self, title)
-                offset += limit
+                    data = data['query']['exturlusage']
+                    for pages in data:
+                        if not siteurl in pages['title']:
+                            # the links themselves have similar form
+                            if pages['title'] in cache:
+                                continue
+                            else:
+                                cache.append(pages['title'])
+                                yield Page(self, pages['title'])
+        else:
+            output(u'Querying [[Special:Linksearch]]...')
+            for url in urlsToRetrieve:
+                offset = 0
+                while True:
+                    path = self.linksearch_address(url, limit=limit, offset=offset)
+                    get_throttle()
+                    html = self.getUrl(path)
+                    #restricting the HTML source :
+                    #when in the source, this div marks the beginning of the input
+                    loc = html.find('<div class="mw-spcontent">')
+                    if loc > -1:
+                        html = html[loc:]
+                    #when in the source, marks the end of the linklist
+                    loc = html.find('<div class="printfooter">')
+                    if loc > -1:
+                        html = html[:loc]
+                    #our regex fetches internal page links and the link they contain
+                    links = R.findall(html)
+                    if not links:
+                        #no more page to be fetched for that link
+                        break
+                    for title in links:
+                        if not siteurl in title:
+                            # the links themselves have similar form
+                            if title in cache:
+                                continue
+                            else:
+                                cache.append(title)
+                                yield Page(self, title)
+                    offset += limit
+
     def __repr__(self):
         return self.family.name+":"+self.lang
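The core of the new code path is a continuation loop over list=exturlusage. Below is a standalone sketch of that loop, not pywikipedia code: it assumes the requests library and an example API endpoint, and it accepts both the old query-continue/euoffset continuation format handled by this revision and the newer top-level continue block:

    import requests

    API = "https://en.wikipedia.org/w/api.php"   # example endpoint (an assumption)

    def exturlusage(url, limit=50):
        """Yield titles of pages linking to `url`, following API continuation."""
        params = {
            'action': 'query',
            'list': 'exturlusage',
            'euquery': url,
            'eulimit': limit,
            'format': 'json',
        }
        while True:
            data = requests.get(API, params=params).json()
            for entry in data.get('query', {}).get('exturlusage', []):
                yield entry['title']
            # The patch reads data['query-continue']['exturlusage']['euoffset'];
            # newer MediaWiki releases return a top-level 'continue' block instead,
            # so accept either here.
            cont = data.get('query-continue', {}).get('exturlusage') or data.get('continue')
            if not cont:
                break
            params.update(cont)

    # De-duplicate titles while printing, much as the patch does with its `cache` list.
    seen = set()
    for title in exturlusage('example.com', limit=20):
        if title not in seen:
            seen.add(title)
            print(title)

Following continuation on the client side, as the patch does, is what lets the generator stream past a single eulimit-sized batch instead of stopping at the first page of results.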