[Pywikipedia-l] SVN: [4684] trunk/pywikipedia

cosoleto at svn.wikimedia.org cosoleto at svn.wikimedia.org
Mon Dec 10 07:27:21 UTC 2007


Revision: 4684
Author:   cosoleto
Date:     2007-12-10 07:27:15 +0000 (Mon, 10 Dec 2007)

Log Message:
-----------
Fixed Site.newimages() by using my old code. Now it yields date, author and summary data too (like the newpages function) and reports .jpeg and .djvu files.

Modified Paths:
--------------
    trunk/pywikipedia/family.py
    trunk/pywikipedia/pagegenerators.py
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/family.py
===================================================================
--- trunk/pywikipedia/family.py	2007-12-10 03:07:37 UTC (rev 4683)
+++ trunk/pywikipedia/family.py	2007-12-10 07:27:15 UTC (rev 4684)
@@ -2767,6 +2767,9 @@
             return '%s?title=%s:Allpages&from=%s&namespace=%s' % (
                 self.path(code), self.special_namespace_url(code), start, namespace)
 
+    def log_address(self, code, limit=50, mode = ''):
+        return "%s?title=Special:Log&type=%s&user=&page=&limit=%d" % (self.path(code), mode, limit)
+
     def newpages_address(self, code, limit=50):
         return "%s?title=%s:Newpages&limit=%d" % (self.path(code), self.special_namespace_url(code), limit)
 

Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py	2007-12-10 03:07:37 UTC (rev 4683)
+++ trunk/pywikipedia/pagegenerators.py	2007-12-10 07:27:15 UTC (rev 4684)
@@ -199,7 +199,7 @@
     if site is None:
         site = wikipedia.getSite()
     for page in site.newimages(number, repeat=repeat):
-        yield page			
+        yield page[0]
 
 def UnCategorizedPageGenerator(number = 100, repeat = False, site = None):
     if site is None:
@@ -772,14 +772,14 @@
                 if namespace:
                     prefix = prefix[colon+1:]
             gen = PrefixingPageGenerator(prefix = prefix, namespace = namespace)
+        elif arg.startswith('-newimages'):
+            limit = arg[11:] or wikipedia.input(u'How many images do you want to load?')
+            gen = NewimagesPageGenerator(number = int(limit))
         elif arg.startswith('-new'):
             if len(arg) >=5:
               gen = NewpagesPageGenerator(number = int(arg[5:]))
             else:
               gen = NewpagesPageGenerator(number = 60)
-        elif arg.startswith('-newimages'):
-            limit = arg[11:] or wikipedia.input(u'How many images do you want to check?')
-            gen = NewimagesPageGenerator(number = limit)
         elif arg.startswith('-search'):
             mediawikiQuery = arg[8:]
             if not mediawikiQuery:

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2007-12-10 03:07:37 UTC (rev 4683)
+++ trunk/pywikipedia/wikipedia.py	2007-12-10 07:27:15 UTC (rev 4684)
@@ -4019,7 +4019,7 @@
         The objects yielded are tuples composed of the Page object,
         timestamp (unicode), length (int), an empty unicode string, username
         or IP address (str), comment (unicode).
-           
+
         """
         # TODO: in recent MW versions Special:Newpages takes a namespace parameter,
         #       and defaults to 0 if not specified.
@@ -4216,30 +4216,30 @@
 
     def newimages(self, number = 10, repeat = False):
         """Yield ImagePages from Special:Log&type=upload"""
-        # Url of the new images
-        url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % number
-        # Get the HTML text
-        html = self.getUrl(url)
-        image_namespace = self.image_namespace()
-        regexp = re.compile(
-            r'(?P<new>class=\"new\" |)title=\"%s:(?P<image>.*?)\.(?P<ext>\w\w\w|jpeg)\">.*?</a>\".*?(?:<span class=\"comment\">.*?|)</li>' % image_namespace,
-            re.UNICODE)
+
         seen = set()
+        regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+\(.+?</a>\).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?')
 
         while True:
+            path = self.log_address(number, mode = 'upload')
+            get_throttle()
+            html = self.getUrl(path)
+
             for m in regexp.finditer(html):
-                new = m.group('new')
-                im = m.group('image')
-                ext = m.group('ext')
-                # This prevent pages with strange characters. They will be loaded without problem.
-                image =  "%s.%s" % (im, ext)
+                image = m.group('image')
+
                 if image not in seen:
                     seen.add(image)
-                    if new != '':
+
+                    if m.group('new'):
                         output(u"Image \'%s\' has been deleted." % image)
                         continue
-                    page = ImagePage(self, image)
-                    yield page
+
+                    date = m.group('date')
+                    user = m.group('user')
+                    comment = m.group('comment') or ''
+
+                    yield ImagePage(self, image), date, user, comment
             if not repeat:
                 break
 
@@ -4657,6 +4657,10 @@
         """Return path to Special:Allpages."""
         return self.family.allpages_address(self.lang, start=s, namespace = ns)
 
+    def log_address(self, n=50, mode = ''):
+        """Return path to Special:Log."""
+        return self.family.log_address(self.lang, n, mode)
+
     def newpages_address(self, n=50):
         """Return path to Special:Newpages."""
         return self.family.newpages_address(self.lang, n)





More information about the Pywikipedia-l mailing list