Revision: 5205 Author: nicdumz Date: 2008-04-12 10:15:11 +0000 (Sat, 12 Apr 2008)
Log Message: -----------
Yeehee !! I can commit :)
Repairing the weblink (Special:Linksearch) page generator, which has been broken for ages:
BEFORE :
~/projets/pywikipedia\ > python pagegenerators.py -weblink:myspace.com -lang:fr | wc -l Checked for running processes. 1 processes currently running, including the current process. Querying [[Special:Linksearch]]... 453
AFTER :
~/projets/devpywiki\ > python pagegenerators.py -weblink:myspace.com -lang:fr | wc -l Checked for running processes. 1 processes currently running, including the current process. Querying [[Special:Linksearch]]... 2199
Modified Paths: -------------- trunk/pywikipedia/pagegenerators.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/pagegenerators.py =================================================================== --- trunk/pywikipedia/pagegenerators.py 2008-04-11 20:29:11 UTC (rev 5204) +++ trunk/pywikipedia/pagegenerators.py 2008-04-12 10:15:11 UTC (rev 5205) @@ -411,7 +411,7 @@ """ if site is None: site = wikipedia.getSite() - for page in site.linksearch(link): + for page in site.linksearch(link, limit=step): yield page
def SearchPageGenerator(query, number = 100, namespaces = None, site = None): @@ -872,6 +872,9 @@ transclusionPage = wikipedia.Page(wikipedia.getSite(), 'Template:%s' % transclusionPageTitle) gen = ReferringPageGenerator(transclusionPage, onlyTemplateInclusion = True) elif arg.startswith('-start'): + if arg.startswith('-startxml'): + wikipedia.output(u'-startxml : wrong parameter') + sys.exit() firstPageTitle = arg[7:] if not firstPageTitle: firstPageTitle = wikipedia.input(u'At which page do you want to start?')
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-04-11 20:29:11 UTC (rev 5204) +++ trunk/pywikipedia/wikipedia.py 2008-04-12 10:15:11 UTC (rev 5205) @@ -4876,32 +4876,45 @@ else: break
- def linksearch(self, siteurl): + def linksearch(self, siteurl, limit=500): """Yield Pages from results of Special:Linksearch for 'siteurl'.""" if siteurl.startswith('*.'): siteurl = siteurl[2:] output(u'Querying [[Special:Linksearch]]...') cache = [] + R = re.compile('title ?="(.*?)"') for url in [siteurl, '*.' + siteurl]: - path = self.linksearch_address(url) - get_throttle() - html = self.getUrl(path) - loc = html.find('<div class="mw-spcontent">') - if loc > -1: - html = html[loc:] - loc = html.find('<div class="printfooter">') - if loc > -1: - html = html[:loc] - R = re.compile('title ?="(.*?)"') - for title in R.findall(html): - if not siteurl in title: - # the links themselves have similar form - if title in cache: - continue - else: - cache.append(title) - yield Page(self, title) + offset = 0 + while True: + path = self.linksearch_address(url, limit=limit, offset=offset) + get_throttle() + html = self.getUrl(path) + #restricting the HTML source : + #when in the source, this div marks the beginning of the input + loc = html.find('<div class="mw-spcontent">') + if loc > -1: + html = html[loc:] + #when in the source, marks the end of the linklist + loc = html.find('<div class="printfooter">') + if loc > -1: + html = html[:loc]
+ #our regex fetches internal page links and the link they contain + links = R.findall(html) + if not links: + #no more page to be fetched for that link + break + for title in links: + if not siteurl in title: + # the links themselves have similar form + if title in cache: + continue + else: + cache.append(title) + yield Page(self, title) + offset += limit + + def __repr__(self): return self.family.name+":"+self.lang
pywikipedia-l@lists.wikimedia.org