[Pywikipedia-l] SVN: [6312] trunk/pywikipedia

purodha at svn.wikimedia.org
Thu Jan 29 22:47:54 UTC 2009


Revision: 6312
Author:   purodha
Date:     2009-01-29 22:47:54 +0000 (Thu, 29 Jan 2009)

Log Message:
-----------
Add honoring of -namespace parameters to -new processing.
(Multiple -namespace parameters still have a glitch with page count)
This solves the request at:
https://sourceforge.net/tracker2/index.php?func=detail&aid=2531112&group_id=93107&atid=603141
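
As an illustration of the requested behaviour, invocations along these lines
should now be possible (a sketch; the numeric namespace values are MediaWiki
defaults and may differ per wiki):

    python interwiki.py -new:100                  # no -namespace: article pages only
    python interwiki.py -new:200 -namespace:6     # only new pages in the File namespace
    python interwiki.py -new:200 -namespace:all   # new pages in every namespace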

Modified Paths:
--------------
    trunk/pywikipedia/family.py
    trunk/pywikipedia/interwiki.py
    trunk/pywikipedia/pagegenerators.py
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py	2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/family.py	2009-01-29 22:47:54 UTC (rev 6312)
@@ -3465,8 +3465,8 @@
     def log_address(self, code, limit=50, mode = ''):
         return "%s?useskin=monobook&title=Special:Log&type=%s&user=&page=&limit=%d" % (self.path(code), mode, limit)
 
-    def newpages_address(self, code, limit=50):
-        return "%s?useskin=monobook&title=%s:Newpages&limit=%d" % (self.path(code), self.special_namespace_url(code), limit)
+    def newpages_address(self, code, limit=50, namespace=0):
+        return "%s?useskin=monobook&title=%s:Newpages&limit=%d&namespace=%s" % (self.path(code), self.special_namespace_url(code), limit, namespace)
 
     def longpages_address(self, code, limit=500):
         return "%s?useskin=monobook&title=%s:Longpages&limit=%d" % (self.path(code), self.special_namespace_url(code), limit)

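The effect of the extra parameter is easiest to see in the generated URL; a
rough sketch, assuming a family whose path() resolves to /w/index.php and
whose special namespace name is "Special" (hypothetical call, not part of the
commit):

    family.newpages_address('en', limit=50, namespace=6)
    # -> '/w/index.php?useskin=monobook&title=Special:Newpages&limit=50&namespace=6'
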
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py	2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/interwiki.py	2009-01-29 22:47:54 UTC (rev 6312)
@@ -31,6 +31,10 @@
 
     -new:          Work on the 100 newest pages. If given as -new:x, will work
                    on the x newest pages.
+                   When multiple -namespace parameters are given, x pages are
+                   inspected, and only the ones in the selected namespaces are
+                   processed. Use -namespace:all for all namespaces. Without
+                   -namespace, only article pages are processed.
 
                    This implies -noredirect.
 
@@ -1600,6 +1604,7 @@
         hintlessPageGen = None
         optContinue = False
         optRestore = False
+        newPages = None
         # This factory is responsible for processing command line arguments
         # that are also used by other scripts and that determine on which pages
         # to work on.
@@ -1694,7 +1699,6 @@
                     newPages = int(arg[5:])
                 else:
                     newPages = 100
-                hintlessPageGen = pagegenerators.NewpagesPageGenerator(newPages)
             elif arg.startswith('-skipfile:'):
                 skipfile = arg[10:]
                 skipPageGen = pagegenerators.TextfilePageGenerator(skipfile)
@@ -1753,6 +1757,22 @@
         except:
             wikipedia.output(u'Missing main page name')
 
+        if newPages != None:
+            if len(namespaces) == 0:
+                ns = 0
+            elif len(namespaces) == 1:
+                ns = namespaces[0]
+                if ns != 'all':
+                    if isinstance(ns, unicode) or isinstance(ns, str):
+                        index = site.getNamespaceIndex(ns)
+                        if index is None:
+                            raise ValueError(u'Unknown namespace: %s' % ns)
+                        ns = index
+                namespaces = []
+            else:
+                ns = 'all'
+            hintlessPageGen = pagegenerators.NewpagesPageGenerator(newPages, namespace=ns)
+
         if optRestore or optContinue:
             site = wikipedia.getSite()
             dumpFileName = wikipedia.config.datafilepath(

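Condensed into a standalone form, the block above maps the collected
-namespace arguments to the single value handed to NewpagesPageGenerator
roughly as follows (a sketch with a hypothetical helper name; site is assumed
to provide getNamespaceIndex() as used above):

    # Hypothetical helper illustrating the resolution above; not part of the commit.
    def resolve_newpages_namespace(namespaces, site):
        if len(namespaces) == 0:
            return 0                 # no -namespace given: article namespace only
        if len(namespaces) == 1:
            ns = namespaces[0]
            if ns == 'all':
                return 'all'         # explicit -namespace:all
            if isinstance(ns, (unicode, str)):
                index = site.getNamespaceIndex(ns)   # namespace given by name
                if index is None:
                    raise ValueError(u'Unknown namespace: %s' % ns)
                return index
            return ns                # namespace already given as a number
        return 'all'                 # several -namespace arguments: fetch everything,
                                     # filtering happens further down the pipeline
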
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py	2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/pagegenerators.py	2009-01-29 22:47:54 UTC (rev 6312)
@@ -249,10 +249,10 @@
     for page in site.prefixindex(prefix = title, namespace = namespace, includeredirects = includeredirects):
         yield page
 
-def NewpagesPageGenerator(number = 100, get_redirect = False, repeat = False, site = None):
+def NewpagesPageGenerator(number = 100, get_redirect = False, repeat = False, site = None, namespace = 0):
     if site is None:
         site = wikipedia.getSite()
-    for page in site.newpages(number=number, get_redirect=get_redirect, repeat=repeat):
+    for page in site.newpages(number=number, get_redirect=get_redirect, repeat=repeat, namespace=namespace):
         yield page[0]
 
 def FileLinksGenerator(referredImagePage):

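With the new keyword argument, callers other than interwiki.py can request a
namespace-restricted stream of new pages as well; a minimal usage sketch (the
namespace number is only an example):

    import wikipedia, pagegenerators

    site = wikipedia.getSite()
    # Page objects for the 50 newest pages in namespace 10 (Template on a default setup)
    for page in pagegenerators.NewpagesPageGenerator(number=50, site=site, namespace=10):
        wikipedia.output(page.title())
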
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/wikipedia.py	2009-01-29 22:47:54 UTC (rev 6312)
@@ -5010,7 +5010,7 @@
             yield page, match, relevance, '', '', ''
 
     # TODO: avoid code duplication for the following methods
-    def newpages(self, number = 10, get_redirect = False, repeat = False):
+    def newpages(self, number = 10, get_redirect = False, repeat = False, namespace = 0):
         """Yield new articles (as Page objects) from Special:Newpages.
 
         Starts with the newest article and fetches the number of articles
@@ -5029,9 +5029,10 @@
         # TODO: Repeat mechanism doesn't make much sense as implemented;
         #       should use both offset and limit parameters, and have an
         #       option to fetch older rather than newer pages
+        # TODO: extract and return edit comment.
         seen = set()
         while True:
-            path = self.newpages_address(n=number)
+            path = self.newpages_address(n=number, namespace=namespace)
             # The throttling is important here, so always enabled.
             get_throttle()
             html = self.getUrl(path)
@@ -5856,9 +5857,9 @@
         """Return path to Special:Log."""
         return self.family.log_address(self.lang, n, mode)
 
-    def newpages_address(self, n=50):
+    def newpages_address(self, n=50, namespace=0):
         """Return path to Special:Newpages."""
-        return self.family.newpages_address(self.lang, n)
+        return self.family.newpages_address(self.lang, n, namespace)
 
     def longpages_address(self, n=500):
         """Return path to Special:Longpages."""

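At the Site level the new parameter is simply threaded through to the family
method shown above. Called directly, newpages() yields tuples whose first
element is the Page object (which is what pagegenerators relies on); a minimal
sketch with an arbitrary namespace number:

    import wikipedia

    site = wikipedia.getSite()
    for entry in site.newpages(number=20, namespace=4):
        wikipedia.output(entry[0].title())   # entry[0] is the new Page
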

More information about the Pywikipedia-l mailing list