[Pywikipedia-l] SVN: [6312] trunk/pywikipedia
purodha at svn.wikimedia.org
purodha at svn.wikimedia.org
Thu Jan 29 22:47:54 UTC 2009
Revision: 6312
Author: purodha
Date: 2009-01-29 22:47:54 +0000 (Thu, 29 Jan 2009)
Log Message:
-----------
Add honoring -namespace parameters to -new processing.
(Multiple -namespace parameters still have a glitch with page count)
This solves the request at:
https://sourceforge.net/tracker2/index.php?func=detail&aid=2531112&group_id=93107&atid=603141
Modified Paths:
--------------
trunk/pywikipedia/family.py
trunk/pywikipedia/interwiki.py
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/family.py 2009-01-29 22:47:54 UTC (rev 6312)
@@ -3465,8 +3465,8 @@
def log_address(self, code, limit=50, mode = ''):
return "%s?useskin=monobook&title=Special:Log&type=%s&user=&page=&limit=%d" % (self.path(code), mode, limit)
- def newpages_address(self, code, limit=50):
- return "%s?useskin=monobook&title=%s:Newpages&limit=%d" % (self.path(code), self.special_namespace_url(code), limit)
+ def newpages_address(self, code, limit=50, namespace=0):
+ return "%s?useskin=monobook&title=%s:Newpages&limit=%d&namespace=%s" % (self.path(code), self.special_namespace_url(code), limit, namespace)
def longpages_address(self, code, limit=500):
return "%s?useskin=monobook&title=%s:Longpages&limit=%d" % (self.path(code), self.special_namespace_url(code), limit)
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/interwiki.py 2009-01-29 22:47:54 UTC (rev 6312)
@@ -31,6 +31,10 @@
-new: Work on the 100 newest pages. If given as -new:x, will work
on the x newest pages.
+ When multiple -namespace parameters are given, x pages are
+ inspected, and only the ones in the selected name spaces are
+ processed. Use -namespace:all for all namespaces. Without
+ -namespace, only article pages are processed.
This implies -noredirect.
@@ -1600,6 +1604,7 @@
hintlessPageGen = None
optContinue = False
optRestore = False
+ newPages = None
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
@@ -1694,7 +1699,6 @@
newPages = int(arg[5:])
else:
newPages = 100
- hintlessPageGen = pagegenerators.NewpagesPageGenerator(newPages)
elif arg.startswith('-skipfile:'):
skipfile = arg[10:]
skipPageGen = pagegenerators.TextfilePageGenerator(skipfile)
@@ -1753,6 +1757,22 @@
except:
wikipedia.output(u'Missing main page name')
+ if newPages != None:
+ if len(namespaces) == 0:
+ ns = 0
+ if len(namespaces) == 1:
+ ns = namespaces[0]
+ if ns != 'all':
+ if isinstance(ns, unicode) or isinstance(ns, str):
+ index = site.getNamespaceIndex(ns)
+ if index is None:
+ raise ValueError(u'Unknown namespace: %s' % ns)
+ ns = index
+ namespaces = []
+ else:
+ ns = 'all'
+ hintlessPageGen = pagegenerators.NewpagesPageGenerator(newPages, namespace=ns)
+
if optRestore or optContinue:
site = wikipedia.getSite()
dumpFileName = wikipedia.config.datafilepath(
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/pagegenerators.py 2009-01-29 22:47:54 UTC (rev 6312)
@@ -249,10 +249,10 @@
for page in site.prefixindex(prefix = title, namespace = namespace, includeredirects = includeredirects):
yield page
-def NewpagesPageGenerator(number = 100, get_redirect = False, repeat = False, site = None):
+def NewpagesPageGenerator(number = 100, get_redirect = False, repeat = False, site = None, namespace = 0):
if site is None:
site = wikipedia.getSite()
- for page in site.newpages(number=number, get_redirect=get_redirect, repeat=repeat):
+ for page in site.newpages(number=number, get_redirect=get_redirect, repeat=repeat, namespace=namespace):
yield page[0]
def FileLinksGenerator(referredImagePage):
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-01-29 20:08:52 UTC (rev 6311)
+++ trunk/pywikipedia/wikipedia.py 2009-01-29 22:47:54 UTC (rev 6312)
@@ -5010,7 +5010,7 @@
yield page, match, relevance, '', '', ''
# TODO: avoid code duplication for the following methods
- def newpages(self, number = 10, get_redirect = False, repeat = False):
+ def newpages(self, number = 10, get_redirect = False, repeat = False, namespace = 0):
"""Yield new articles (as Page objects) from Special:Newpages.
Starts with the newest article and fetches the number of articles
@@ -5029,9 +5029,10 @@
# TODO: Repeat mechanism doesn't make much sense as implemented;
# should use both offset and limit parameters, and have an
# option to fetch older rather than newer pages
+ # TODO: extract and return edit comment.
seen = set()
while True:
- path = self.newpages_address(n=number)
+ path = self.newpages_address(n=number, namespace=namespace)
# The throttling is important here, so always enabled.
get_throttle()
html = self.getUrl(path)
@@ -5856,9 +5857,9 @@
"""Return path to Special:Log."""
return self.family.log_address(self.lang, n, mode)
- def newpages_address(self, n=50):
+ def newpages_address(self, n=50, namespace=0):
"""Return path to Special:Newpages."""
- return self.family.newpages_address(self.lang, n)
+ return self.family.newpages_address(self.lang, n, namespace)
def longpages_address(self, n=500):
"""Return path to Special:Longpages."""
More information about the Pywikipedia-l
mailing list