Revision: 5205 Author: nicdumz Date: 2008-04-12 10:15:11 +0000 (Sat, 12 Apr 2008)
Log Message: -----------
Yeehee !! I can commit :)
Repairing the weblink (Special:Linksearch) page generator, which has been broken for ages:
BEFORE :
~/projets/pywikipedia\ > python pagegenerators.py -weblink:myspace.com -lang:fr | wc -l Checked for running processes. 1 processes currently running, including the current process. Querying [[Special:Linksearch]]... 453
AFTER :
~/projets/devpywiki\ > python pagegenerators.py -weblink:myspace.com -lang:fr | wc -l Checked for running processes. 1 processes currently running, including the current process. Querying [[Special:Linksearch]]... 2199
Modified Paths: -------------- trunk/pywikipedia/pagegenerators.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/pagegenerators.py =================================================================== --- trunk/pywikipedia/pagegenerators.py 2008-04-11 20:29:11 UTC (rev 5204) +++ trunk/pywikipedia/pagegenerators.py 2008-04-12 10:15:11 UTC (rev 5205) @@ -411,7 +411,7 @@ """ if site is None: site = wikipedia.getSite() - for page in site.linksearch(link): + for page in site.linksearch(link, limit=step): yield page
def SearchPageGenerator(query, number = 100, namespaces = None, site = None): @@ -872,6 +872,9 @@ transclusionPage = wikipedia.Page(wikipedia.getSite(), 'Template:%s' % transclusionPageTitle) gen = ReferringPageGenerator(transclusionPage, onlyTemplateInclusion = True) elif arg.startswith('-start'): + if arg.startswith('-startxml'): + wikipedia.output(u'-startxml : wrong parameter') + sys.exit() firstPageTitle = arg[7:] if not firstPageTitle: firstPageTitle = wikipedia.input(u'At which page do you want to start?')
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-04-11 20:29:11 UTC (rev 5204) +++ trunk/pywikipedia/wikipedia.py 2008-04-12 10:15:11 UTC (rev 5205) @@ -4876,32 +4876,45 @@ else: break
- def linksearch(self, siteurl): + def linksearch(self, siteurl, limit=500): """Yield Pages from results of Special:Linksearch for 'siteurl'.""" if siteurl.startswith('*.'): siteurl = siteurl[2:] output(u'Querying [[Special:Linksearch]]...') cache = [] + R = re.compile('title ?="(.*?)"') for url in [siteurl, '*.' + siteurl]: - path = self.linksearch_address(url) - get_throttle() - html = self.getUrl(path) - loc = html.find('<div class="mw-spcontent">') - if loc > -1: - html = html[loc:] - loc = html.find('<div class="printfooter">') - if loc > -1: - html = html[:loc] - R = re.compile('title ?="(.*?)"') - for title in R.findall(html): - if not siteurl in title: - # the links themselves have similar form - if title in cache: - continue - else: - cache.append(title) - yield Page(self, title) + offset = 0 + while True: + path = self.linksearch_address(url, limit=limit, offset=offset) + get_throttle() + html = self.getUrl(path) + #restricting the HTML source : + #when in the source, this div marks the beginning of the input + loc = html.find('<div class="mw-spcontent">') + if loc > -1: + html = html[loc:] + #when in the source, marks the end of the linklist + loc = html.find('<div class="printfooter">') + if loc > -1: + html = html[:loc]
+ #our regex fetches internal page links and the link they contain + links = R.findall(html) + if not links: + #no more page to be fetched for that link + break + for title in links: + if not siteurl in title: + # the links themselves have similar form + if title in cache: + continue + else: + cache.append(title) + yield Page(self, title) + offset += limit + + def __repr__(self): return self.family.name+":"+self.lang
pywikipedia-l@lists.wikimedia.org