[Pywikipedia-l] SVN: [4745] trunk/pywikipedia
rotem at svn.wikimedia.org
Sat Dec 22 10:14:40 UTC 2007
Revision: 4745
Author: rotem
Date: 2007-12-22 10:14:39 +0000 (Sat, 22 Dec 2007)
Log Message:
-----------
Avoiding duplicate code in the query of Special:Linksearch.
Modified Paths:
--------------
trunk/pywikipedia/family.py
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2007-12-21 21:28:46 UTC (rev 4744)
+++ trunk/pywikipedia/family.py 2007-12-22 10:14:39 UTC (rev 4745)
@@ -2721,7 +2721,7 @@
return '%s?title=%s:Ipblocklist&action=search&ip=%s' % (self.path(code), self.special_namespace_url(code), name)
def linksearch_address(self, code, link, limit=500, offset=0):
- return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (self.path(code), self.special_namespace_url(code), limit, offset, link)
+ return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (self.path(code), self.special_namespace_url(code), limit, offset, link)
def version_history_address(self, code, name, limit = config.special_page_limit):
return '%s?title=%s&action=history&limit=%d' % (self.path(code), name, limit)
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2007-12-21 21:28:46 UTC (rev 4744)
+++ trunk/pywikipedia/pagegenerators.py 2007-12-22 10:14:39 UTC (rev 4745)
@@ -299,29 +299,11 @@
def LinksearchPageGenerator(link, step=500, site=None):
"""Yields all pages that include a specified link, according to
[[Special:Linksearch]].
- Retrieves in chunks of size "step" (default 500).
- Does not guarantee that resulting pages are unique.
"""
if site is None:
site = wikipedia.getSite()
- elRX = re.compile('<a .* class="external ?" .*</a>.*<a .*>(.*)</a>') #TODO: de-uglify?
- offset = 0
- pageyeldlist = list()
- found = step
- while found == step:
- found = 0
- url = site.linksearch_address(link,limit=step,offset=offset)
- wikipedia.output(u'Querying [[Special:Linksearch]]...')
- data = site.getUrl(url)
- for elM in elRX.finditer(data):
- found += 1
- pagenameofthelink = elM.group(1)
- if pagenameofthelink in pageyeldlist:
- continue
- else:
- pageyeldlist.append(pagenameofthelink)
- yield wikipedia.Page(site, pagenameofthelink)
- offset += step
+ for page in site.linksearch(link):
+ yield page
def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
"""
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-12-21 21:28:46 UTC (rev 4744)
+++ trunk/pywikipedia/wikipedia.py 2007-12-22 10:14:39 UTC (rev 4745)
@@ -4484,8 +4484,10 @@
"""Yield Pages from results of Special:Linksearch for 'siteurl'."""
if siteurl.startswith('*.'):
siteurl = siteurl[2:]
- for url in [siteurl, "*."+siteurl]:
- path = self.family.linksearch_address(self.lang, url)
+ output(u'Querying [[Special:Linksearch]]...')
+ cache = []
+ for url in [siteurl, '*.' + siteurl]:
+ path = self.linksearch_address(url)
get_throttle()
html = self.getUrl(path)
loc = html.find('<div class="mw-spcontent">')
@@ -4498,7 +4500,11 @@
for title in R.findall(html):
if not siteurl in title:
# the links themselves have similar form
- yield Page(self,title)
+ if title in cache:
+ continue
+ else:
+ cache.append(title)
+ yield Page(self, title)
def __repr__(self):
return self.family.name+":"+self.lang
More information about the Pywikipedia-l
mailing list