[Pywikipedia-l] SVN: [4650] trunk/pywikipedia/wikipedia.py

filnik at svn.wikimedia.org filnik at svn.wikimedia.org
Sun Dec 9 12:09:40 UTC 2007


Revision: 4650
Author:   filnik
Date:     2007-12-09 12:09:34 +0000 (Sun, 09 Dec 2007)

Log Message:
-----------
Adding a new generator, newImages()

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2007-12-09 11:49:32 UTC (rev 4649)
+++ trunk/pywikipedia/wikipedia.py	2007-12-09 12:09:34 UTC (rev 4650)
@@ -3468,6 +3468,7 @@
         search(query): query results from Special:Search
         allpages(): Special:Allpages
         newpages(): Special:Newpages
+        newImages(): Special:Log&type=upload
         longpages(): Special:Longpages
         shortpages(): Special:Shortpages
         categories(): Special:Categories (yields Category objects)
@@ -4214,6 +4215,37 @@
             if not repeat:
                 break
 
+    def newImages(self, limit = 50, repeat = False):
+        """Yield a Page for each image listed in Special:Log&type=upload.
+
+        limit is the number of log entries requested per fetch. Deleted
+        images (links marked class="new") are reported and skipped. If
+        repeat is true the log is fetched again after each pass; each
+        image is yielded at most once either way.
+        """
+        # Url of the new images
+        url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % int(limit)
+        # Escape the namespace name so it is matched literally.
+        regexp = re.compile(
+            r'(?P<new>class=\"new\" |)title=\"%s:(?P<image>.*?)\.(?P<ext>\w\w\w|jpeg)\">.*?</a>\".*?<span class=\"comment\">' % re.escape(self.image_namespace()),
+            re.UNICODE)
+        seen = set()
+        while True:
+            # Get the HTML text
+            html = self.getUrl(url)
+            for m in regexp.finditer(html):
+                image = "%s.%s" % (m.group('image'), m.group('ext'))
+                if image in seen:
+                    continue
+                seen.add(image)
+                if m.group('new'):
+                    output(u"Skipping %s because it has been deleted." % image)
+                    continue
+                yield Page(self, 'Image:%s' % image)
+            if not repeat:
+                output(u"\t\t>> All images checked. <<")
+                break
+
     def uncategorizedimages(self, number = 10, repeat = False):
         """Yield ImagePages from Special:Uncategorizedimages."""
         seen = set()





More information about the Pywikipedia-l mailing list