Revision: 4745 Author: rotem Date: 2007-12-22 10:14:39 +0000 (Sat, 22 Dec 2007)
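Log Message: ----------- Avoiding duplicate code in the query of Special:Linksearch: LinksearchPageGenerator in pagegenerators.py now delegates to site.linksearch() in wikipedia.py instead of running its own query loop, and site.linksearch() now prints the progress message and skips titles it has already yielded.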
Modified Paths: -------------- trunk/pywikipedia/family.py trunk/pywikipedia/pagegenerators.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py =================================================================== --- trunk/pywikipedia/family.py 2007-12-21 21:28:46 UTC (rev 4744) +++ trunk/pywikipedia/family.py 2007-12-22 10:14:39 UTC (rev 4745) @@ -2721,7 +2721,7 @@ return '%s?title=%s:Ipblocklist&action=search&ip=%s' % (self.path(code), self.special_namespace_url(code), name)
def linksearch_address(self, code, link, limit=500, offset=0): - return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (self.path(code), self.special_namespace_url(code), limit, offset, link) + return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (self.path(code), self.special_namespace_url(code), limit, offset, link)
def version_history_address(self, code, name, limit = config.special_page_limit): return '%s?title=%s&action=history&limit=%d' % (self.path(code), name, limit)
Modified: trunk/pywikipedia/pagegenerators.py =================================================================== --- trunk/pywikipedia/pagegenerators.py 2007-12-21 21:28:46 UTC (rev 4744) +++ trunk/pywikipedia/pagegenerators.py 2007-12-22 10:14:39 UTC (rev 4745) @@ -299,29 +299,11 @@ def LinksearchPageGenerator(link, step=500, site=None): """Yields all pages that include a specified link, according to [[Special:Linksearch]]. - Retrieves in chunks of size "step" (default 500). - Does not guarantee that resulting pages are unique. """ if site is None: site = wikipedia.getSite() - elRX = re.compile('<a .* class="external ?" .*</a>.*<a .*>(.*)</a>') #TODO: de-uglify? - offset = 0 - pageyeldlist = list() - found = step - while found == step: - found = 0 - url = site.linksearch_address(link,limit=step,offset=offset) - wikipedia.output(u'Querying [[Special:Linksearch]]...') - data = site.getUrl(url) - for elM in elRX.finditer(data): - found += 1 - pagenameofthelink = elM.group(1) - if pagenameofthelink in pageyeldlist: - continue - else: - pageyeldlist.append(pagenameofthelink) - yield wikipedia.Page(site, pagenameofthelink) - offset += step + for page in site.linksearch(link): + yield page
def SearchPageGenerator(query, number = 100, namespaces = None, site = None): """
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2007-12-21 21:28:46 UTC (rev 4744) +++ trunk/pywikipedia/wikipedia.py 2007-12-22 10:14:39 UTC (rev 4745) @@ -4484,8 +4484,10 @@ """Yield Pages from results of Special:Linksearch for 'siteurl'.""" if siteurl.startswith('*.'): siteurl = siteurl[2:] - for url in [siteurl, "*."+siteurl]: - path = self.family.linksearch_address(self.lang, url) + output(u'Querying [[Special:Linksearch]]...') + cache = [] + for url in [siteurl, '*.' + siteurl]: + path = self.linksearch_address(url) get_throttle() html = self.getUrl(path) loc = html.find('<div class="mw-spcontent">') @@ -4498,7 +4500,11 @@ for title in R.findall(html): if not siteurl in title: # the links themselves have similar form - yield Page(self,title) + if title in cache: + continue + else: + cache.append(title) + yield Page(self, title)
def __repr__(self): return self.family.name+":"+self.lang