Revision: 5205
Author: nicdumz
Date: 2008-04-12 10:15:11 +0000 (Sat, 12 Apr 2008)
Log Message:
-----------
> > Yeehee !! I can commit :) < < <
Repairing the weblink (Special:Linksearch) pagegenerators, which has been broken for ages:
BEFORE:
~/projets/pywikipedia\ > python pagegenerators.py -weblink:myspace.com -lang:fr | wc -l
Checked for running processes. 1 processes currently running, including the current
process.
Querying [[Special:Linksearch]]...
453
AFTER :
~/projets/devpywiki\ > python pagegenerators.py -weblink:myspace.com -lang:fr | wc -l
Checked for running processes. 1 processes currently running, including the current
process.
Querying [[Special:Linksearch]]...
2199
Modified Paths:
--------------
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2008-04-11 20:29:11 UTC (rev 5204)
+++ trunk/pywikipedia/pagegenerators.py 2008-04-12 10:15:11 UTC (rev 5205)
@@ -411,7 +411,7 @@
"""
if site is None:
site = wikipedia.getSite()
- for page in site.linksearch(link):
+ for page in site.linksearch(link, limit=step):
yield page
def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
@@ -872,6 +872,9 @@
transclusionPage = wikipedia.Page(wikipedia.getSite(), 'Template:%s'
% transclusionPageTitle)
gen = ReferringPageGenerator(transclusionPage, onlyTemplateInclusion = True)
elif arg.startswith('-start'):
+ if arg.startswith('-startxml'):
+ wikipedia.output(u'-startxml : wrong parameter')
+ sys.exit()
firstPageTitle = arg[7:]
if not firstPageTitle:
firstPageTitle = wikipedia.input(u'At which page do you want to start?')
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-04-11 20:29:11 UTC (rev 5204)
+++ trunk/pywikipedia/wikipedia.py 2008-04-12 10:15:11 UTC (rev 5205)
@@ -4876,32 +4876,45 @@
else:
break
- def linksearch(self, siteurl):
+ def linksearch(self, siteurl, limit=500):
"""Yield Pages from results of Special:Linksearch for
'siteurl'."""
if siteurl.startswith('*.'):
siteurl = siteurl[2:]
output(u'Querying [[Special:Linksearch]]...')
cache = []
+ R = re.compile('title ?=\"(.*?)\"')
for url in [siteurl, '*.' + siteurl]:
- path = self.linksearch_address(url)
- get_throttle()
- html = self.getUrl(path)
- loc = html.find('<div class="mw-spcontent">')
- if loc > -1:
- html = html[loc:]
- loc = html.find('<div class="printfooter">')
- if loc > -1:
- html = html[:loc]
- R = re.compile('title ?=\"(.*?)\"')
- for title in R.findall(html):
- if not siteurl in title:
- # the links themselves have similar form
- if title in cache:
- continue
- else:
- cache.append(title)
- yield Page(self, title)
+ offset = 0
+ while True:
+ path = self.linksearch_address(url, limit=limit, offset=offset)
+ get_throttle()
+ html = self.getUrl(path)
+ #restricting the HTML source :
+ #when in the source, this div marks the beginning of the input
+ loc = html.find('<div class="mw-spcontent">')
+ if loc > -1:
+ html = html[loc:]
+ #when in the source, marks the end of the linklist
+ loc = html.find('<div class="printfooter">')
+ if loc > -1:
+ html = html[:loc]
+ #our regex fetches internal page links and the link they contain
+ links = R.findall(html)
+ if not links:
+ #no more page to be fetched for that link
+ break
+ for title in links:
+ if not siteurl in title:
+ # the links themselves have similar form
+ if title in cache:
+ continue
+ else:
+ cache.append(title)
+ yield Page(self, title)
+ offset += limit
+
+
def __repr__(self):
return self.family.name+":"+self.lang