[Pywikipedia-l] SVN: [4745] trunk/pywikipedia

rotem at svn.wikimedia.org
Sat Dec 22 10:14:40 UTC 2007


Revision: 4745
Author:   rotem
Date:     2007-12-22 10:14:39 +0000 (Sat, 22 Dec 2007)

Log Message:
-----------
Avoiding duplicate code in the query of Special:Linksearch.

Modified Paths:
--------------
    trunk/pywikipedia/family.py
    trunk/pywikipedia/pagegenerators.py
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py	2007-12-21 21:28:46 UTC (rev 4744)
+++ trunk/pywikipedia/family.py	2007-12-22 10:14:39 UTC (rev 4745)
@@ -2721,7 +2721,7 @@
       return '%s?title=%s:Ipblocklist&action=search&ip=%s' % (self.path(code), self.special_namespace_url(code), name)
 
     def linksearch_address(self, code, link, limit=500, offset=0):
-      return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (self.path(code), self.special_namespace_url(code), limit, offset, link)
+        return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (self.path(code), self.special_namespace_url(code), limit, offset, link)
 
     def version_history_address(self, code, name, limit = config.special_page_limit):
         return '%s?title=%s&action=history&limit=%d' % (self.path(code), name, limit)
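
For context, a minimal sketch of the URL linksearch_address() builds, assuming
path() returns '/w/index.php' and special_namespace_url() returns 'Special'
(illustrative stand-in values, not taken from this commit):

    # Hypothetical stand-ins for Family.path() / special_namespace_url().
    def linksearch_address(path, special_ns, link, limit=500, offset=0):
        return '%s?title=%s:Linksearch&limit=%d&offset=%d&target=%s' % (
            path, special_ns, limit, offset, link)

    # linksearch_address('/w/index.php', 'Special', 'example.com')
    # -> '/w/index.php?title=Special:Linksearch&limit=500&offset=0&target=example.com'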

Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py	2007-12-21 21:28:46 UTC (rev 4744)
+++ trunk/pywikipedia/pagegenerators.py	2007-12-22 10:14:39 UTC (rev 4745)
@@ -299,29 +299,11 @@
 def LinksearchPageGenerator(link, step=500, site=None):
     """Yields all pages that include a specified link, according to
     [[Special:Linksearch]].
-    Retrieves in chunks of size "step" (default 500).
-    Does not guarantee that resulting pages are unique.
     """
     if site is None:
         site = wikipedia.getSite()
-    elRX = re.compile('<a .* class="external ?" .*</a>.*<a .*>(.*)</a>') #TODO: de-uglify?
-    offset = 0
-    pageyeldlist = list()
-    found = step
-    while found == step:
-        found = 0
-        url = site.linksearch_address(link,limit=step,offset=offset)
-        wikipedia.output(u'Querying [[Special:Linksearch]]...')
-        data = site.getUrl(url)
-        for elM in elRX.finditer(data):
-            found += 1
-            pagenameofthelink = elM.group(1)
-            if pagenameofthelink in pageyeldlist:
-                continue
-            else:
-                pageyeldlist.append(pagenameofthelink)
-                yield wikipedia.Page(site, pagenameofthelink)
-        offset += step
+    for page in site.linksearch(link):
+        yield page
 
 def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
     """

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2007-12-21 21:28:46 UTC (rev 4744)
+++ trunk/pywikipedia/wikipedia.py	2007-12-22 10:14:39 UTC (rev 4745)
@@ -4484,8 +4484,10 @@
         """Yield Pages from results of Special:Linksearch for 'siteurl'."""
         if siteurl.startswith('*.'):
             siteurl = siteurl[2:]
-        for url in [siteurl, "*."+siteurl]:
-            path = self.family.linksearch_address(self.lang, url)
+        output(u'Querying [[Special:Linksearch]]...')
+        cache = []
+        for url in [siteurl, '*.' + siteurl]:
+            path = self.linksearch_address(url)
             get_throttle()
             html = self.getUrl(path)
             loc = html.find('<div class="mw-spcontent">')
@@ -4498,7 +4500,11 @@
             for title in R.findall(html):
                 if not siteurl in title:
                     # the links themselves have similar form
-                    yield Page(self,title)
+                    if title in cache:
+                        continue
+                    else:
+                        cache.append(title)
+                        yield Page(self, title)
 
     def __repr__(self):
         return self.family.name+":"+self.lang
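
The cache added above is a plain seen-list deduplication, needed because
the two Linksearch queries (for siteurl and '*.' + siteurl) can return
overlapping results. A standalone sketch of the same pattern with generic
names (not part of the commit):

    def unique(items):
        # Yield each item at most once, preserving order; 'seen' plays
        # the role of the 'cache' list in Site.linksearch above.
        seen = []
        for item in items:
            if item not in seen:
                seen.append(item)
                yield item

    # list(unique(['Foo', 'Bar', 'Foo'])) -> ['Foo', 'Bar']

A set would make the membership test constant-time, but a list keeps the
code close to the style used elsewhere in the file.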
