[Pywikipedia-l] SVN: [4650] trunk/pywikipedia/wikipedia.py
filnik at svn.wikimedia.org
filnik at svn.wikimedia.org
Sun Dec 9 12:09:40 UTC 2007
Revision: 4650
Author: filnik
Date: 2007-12-09 12:09:34 +0000 (Sun, 09 Dec 2007)
Log Message:
-----------
Adding a new generator, newImages()
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-12-09 11:49:32 UTC (rev 4649)
+++ trunk/pywikipedia/wikipedia.py 2007-12-09 12:09:34 UTC (rev 4650)
@@ -3468,6 +3468,7 @@
search(query): query results from Special:Search
allpages(): Special:Allpages
newpages(): Special:Newpages
+ newImages(): Special:Log&type=upload
longpages(): Special:Longpages
shortpages(): Special:Shortpages
categories(): Special:Categories (yields Category objects)
@@ -4214,6 +4215,37 @@
if not repeat:
break
+ def newImages(self, limit = 50, repeat = False):
+ """Yield ImagePages from Special:Log&type=upload"""
+ # Url of the new images
+ url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % int(limit)
+ # Get the HTML text
+ html = self.getUrl(url)
+ image_namespace = self.image_namespace()
+ regexp = re.compile(
+ r'(?P<new>class=\"new\" |)title=\"%s:(?P<image>.*?)\.(?P<ext>\w\w\w|jpeg)\">.*?</a>\".*?<span class=\"comment\">' % image_namespace,
+ re.UNICODE)
+ pos = 0
+ seen = list()
+ ext_list = list()
+ for m in regexp.finditer(html):
+ new = m.group('new')
+ im = m.group('image')
+ ext = m.group('ext')
+ # This prevent pages with strange characters. They will be loaded without problem.
+ image = "%s.%s" % (im, ext)
+ if new != '':
+ wikipedia.output(u"Skipping %s because it has been deleted." % image)
+ if image not in seen:
+ seen.append(image)
+ if image not in seen:
+ seen.append(image)
+ page = Page(self, 'Image:%s' % image)
+ yield page
+ if not repeat:
+ wikipedia.output(u"\t\t>> All images checked. <<")
+ break
+
def uncategorizedimages(self, number = 10, repeat = False):
"""Yield ImagePages from Special:Uncategorizedimages."""
seen = set()
More information about the Pywikipedia-l
mailing list