[Pywikipedia-l] SVN: [4684] trunk/pywikipedia
cosoleto at svn.wikimedia.org
cosoleto at svn.wikimedia.org
Mon Dec 10 07:27:21 UTC 2007
Revision: 4684
Author: cosoleto
Date: 2007-12-10 07:27:15 +0000 (Mon, 10 Dec 2007)
Log Message:
-----------
Fixed Site.newimages() by using my old code. It now also yields the date, author and summary data (like the newpages function) and reports .jpeg and .djvu files.
Modified Paths:
--------------
trunk/pywikipedia/family.py
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py 2007-12-10 03:07:37 UTC (rev 4683)
+++ trunk/pywikipedia/family.py 2007-12-10 07:27:15 UTC (rev 4684)
@@ -2767,6 +2767,9 @@
return '%s?title=%s:Allpages&from=%s&namespace=%s' % (
self.path(code), self.special_namespace_url(code), start, namespace)
+ def log_address(self, code, limit=50, mode = ''):
+ return "%s?title=Special:Log&type=%s&user=&page=&limit=%d" % (self.path(code), mode, limit)
+
def newpages_address(self, code, limit=50):
return "%s?title=%s:Newpages&limit=%d" % (self.path(code), self.special_namespace_url(code), limit)
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2007-12-10 03:07:37 UTC (rev 4683)
+++ trunk/pywikipedia/pagegenerators.py 2007-12-10 07:27:15 UTC (rev 4684)
@@ -199,7 +199,7 @@
if site is None:
site = wikipedia.getSite()
for page in site.newimages(number, repeat=repeat):
- yield page
+ yield page[0]
def UnCategorizedPageGenerator(number = 100, repeat = False, site = None):
if site is None:
@@ -772,14 +772,14 @@
if namespace:
prefix = prefix[colon+1:]
gen = PrefixingPageGenerator(prefix = prefix, namespace = namespace)
+ elif arg.startswith('-newimages'):
+ limit = arg[11:] or wikipedia.input(u'How many images do you want to load?')
+ gen = NewimagesPageGenerator(number = int(limit))
elif arg.startswith('-new'):
if len(arg) >=5:
gen = NewpagesPageGenerator(number = int(arg[5:]))
else:
gen = NewpagesPageGenerator(number = 60)
- elif arg.startswith('-newimages'):
- limit = arg[11:] or wikipedia.input(u'How many images do you want to check?')
- gen = NewimagesPageGenerator(number = limit)
elif arg.startswith('-search'):
mediawikiQuery = arg[8:]
if not mediawikiQuery:
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-12-10 03:07:37 UTC (rev 4683)
+++ trunk/pywikipedia/wikipedia.py 2007-12-10 07:27:15 UTC (rev 4684)
@@ -4019,7 +4019,7 @@
The objects yielded are tuples composed of the Page object,
timestamp (unicode), length (int), an empty unicode string, username
or IP address (str), comment (unicode).
-
+
"""
# TODO: in recent MW versions Special:Newpages takes a namespace parameter,
# and defaults to 0 if not specified.
@@ -4216,30 +4216,30 @@
def newimages(self, number = 10, repeat = False):
"""Yield ImagePages from Special:Log&type=upload"""
- # Url of the new images
- url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % number
- # Get the HTML text
- html = self.getUrl(url)
- image_namespace = self.image_namespace()
- regexp = re.compile(
- r'(?P<new>class=\"new\" |)title=\"%s:(?P<image>.*?)\.(?P<ext>\w\w\w|jpeg)\">.*?</a>\".*?(?:<span class=\"comment\">.*?|)</li>' % image_namespace,
- re.UNICODE)
+
seen = set()
+ regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+\(.+?</a>\).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?')
while True:
+ path = self.log_address(number, mode = 'upload')
+ get_throttle()
+ html = self.getUrl(path)
+
for m in regexp.finditer(html):
- new = m.group('new')
- im = m.group('image')
- ext = m.group('ext')
- # This prevent pages with strange characters. They will be loaded without problem.
- image = "%s.%s" % (im, ext)
+ image = m.group('image')
+
if image not in seen:
seen.add(image)
- if new != '':
+
+ if m.group('new'):
output(u"Image \'%s\' has been deleted." % image)
continue
- page = ImagePage(self, image)
- yield page
+
+ date = m.group('date')
+ user = m.group('user')
+ comment = m.group('comment') or ''
+
+ yield ImagePage(self, image), date, user, comment
if not repeat:
break
@@ -4657,6 +4657,10 @@
"""Return path to Special:Allpages."""
return self.family.allpages_address(self.lang, start=s, namespace = ns)
+ def log_address(self, n=50, mode = ''):
+ """Return path to Special:Log."""
+ return self.family.log_address(self.lang, n, mode)
+
def newpages_address(self, n=50):
"""Return path to Special:Newpages."""
return self.family.newpages_address(self.lang, n)
More information about the Pywikipedia-l
mailing list