Revision: 5634
Author:   filnik
Date:     2008-06-26 14:26:46 +0000 (Thu, 26 Jun 2008)
Log Message:
-----------
Fixing the duplicate detector and some minor fixes
Modified Paths:
--------------
    trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py	2008-06-26 14:26:14 UTC (rev 5633)
+++ trunk/pywikipedia/checkimages.py	2008-06-26 14:26:46 UTC (rev 5634)
@@ -80,7 +80,7 @@
 #
 import re, time, urllib, urllib2, os, locale, sys
-import wikipedia, config, pagegenerators, catlib
+import wikipedia, config, pagegenerators, catlib, query
 locale.setlocale(locale.LC_ALL, '')
@@ -384,10 +384,6 @@
 class NothingFound(wikipedia.Error):
     """ An exception indicating that a regex has return [] instead of results."""
-class NoHash(wikipedia.Error):
-    """ The APIs don't return any Hash for the image searched.
-    Really Strange, better to raise an error. """
-
 # Other common useful functions
 def printWithTimeZone(message):
     """ Function to print the messages followed by the TimeZone encoded correctly. """
@@ -400,23 +396,6 @@
         time_zone = unicode(time.strftime(u"%d %b %Y %H:%M:%S (UTC)", time.gmtime()))
     wikipedia.output(u"%s%s" % (message, time_zone))
-def returnOlderTime(listGiven, timeListGiven):
-    """ Get some time and return the oldest of them """
-    #print listGiven; print timeListGiven
-    #Output:
-    #[[1210596312.0, u'Autoritratto.png'], [1210590240.0, u'Duplicato.png'], [1210592052.0, u'Duplicato_2.png']]
-    #[1210596312.0, 1210590240.0, 1210592052.0]
-    for element in listGiven:
-        time = element[0]
-        imageName = element[1]
-        not_the_oldest = False
-        for time_selected in timeListGiven:
-            if time > time_selected:
-                not_the_oldest = True
-                break
-        if not_the_oldest == False:
-            return imageName
-
 class EmailSender(wikipedia.Page):
     """ Class to send emails through the Wikipedia's dedicated page. """
     def __init__(self, site, user):
@@ -424,6 +403,7 @@
         self.user = user
         page_special_name = u'Special:EmailUser'
         self.page_special_name = page_special_name
+        # Special:EmailUser/Filnik
         page = '%s/%s' % (self.page_special_name, self.user)
         self.page = page
         wikipedia.Page.__init__(self, site, page, None, 0)
@@ -563,7 +543,7 @@
         # paginetta it's the image page object.
         paginetta = wikipedia.ImagePage(self.site, self.image_namespace + self.image)
         try:
-            nick = paginetta.getLatestUploader()
+            nick = paginetta.getLatestUploader()[0]
         except wikipedia.NoPage:
             wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image)
             repme = "\n*[[:Image:%s]] problems '''with the APIs'''"
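A note on the [0] indexing above: judging from how this revision uses the call (element 0 as the uploader's nick here, element 1 as an ISO 8601 timestamp in checkImageDuplicated further down), getLatestUploader() now appears to return a (username, timestamp) pair instead of a bare username. A minimal sketch of that assumed shape, with placeholder names and illustrative values taken from this diff:

    # Assumed post-revision shape of getLatestUploader(); the function name
    # and the values below are placeholders, not the real pywikipedia method.
    def getLatestUploader_assumed():
        return [u'Filnik', u'2008-06-18T08:04:29Z']  # [uploader, upload timestamp]

    nick = getLatestUploader_assumed()[0]       # username, as used above
    imagedata = getLatestUploader_assumed()[1]  # timestamp, as used in checkImageDuplicated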
@@ -679,6 +659,44 @@
                     yield image
                     #continue
+    def returnOlderTime(self, listGiven, timeListGiven):
+        """ Get some time and return the oldest of them """
+        #print listGiven; print timeListGiven
+        #Output:
+        #[[1210596312.0, u'Autoritratto.png'], [1210590240.0, u'Duplicato.png'], [1210592052.0, u'Duplicato_2.png']]
+        #[1210596312.0, 1210590240.0, 1210592052.0]
+        usage = False
+        num = 0
+        num_older = None
+        max_usage = 0
+        for element in listGiven:
+            imageName = element[1]
+            imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % imageName)
+            imageUsage = [page for page in imagePage.usingPages()]
+            if len(imageUsage) != 0 and imageUsage > max_usage:
+                max_usage = imageUsage
+                num_older = num
+            num += 1
+        if num_older != None:
+            return listGiven[num_older][1]
+        for element in listGiven:
+            time = element[0]
+            imageName = element[1]
+            not_the_oldest = False
+            for time_selected in timeListGiven:
+                if time > time_selected:
+                    not_the_oldest = True
+                    break
+            if not_the_oldest == False:
+                return imageName
+
+    def convert_to_url(self, page):
+        # Function stolen from wikipedia.py
+        """The name of the page this Page refers to, in a form suitable for the URL of the page."""
+        title = page.replace(" ", "_")
+        encodedTitle = title.encode(self.site.encoding())
+        return urllib.quote(encodedTitle)
+
     def checkImageOnCommons(self, image):
         """ Checking if the image is on commons """
         self.image = image
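For reference, a runnable sketch (names hypothetical) of the selection logic the reworked returnOlderTime() implements: keep the most-used duplicate if any of them is in use, otherwise fall back to the oldest upload. One assumption is flagged in the comments: the committed code compares the usingPages() list itself against max_usage, where a count comparison looks like the intent.

    def pick_image_to_keep(time_image_list, usage_by_name):
        """time_image_list: [[upload_time_seconds, name], ...];
        usage_by_name: {name: number of pages using the image}."""
        best_name = None
        max_usage = 0
        for upload_time, name in time_image_list:
            usage = usage_by_name.get(name, 0)  # i.e. len(imageUsage) in the method above
            if usage > max_usage:
                max_usage = usage
                best_name = name
        if best_name is not None:
            return best_name
        # No duplicate is used anywhere: keep the oldest upload instead.
        return min(time_image_list)[1]

    # Sample data from the #Output comment above:
    data = [[1210596312.0, u'Autoritratto.png'],
            [1210590240.0, u'Duplicato.png'],
            [1210592052.0, u'Duplicato_2.png']]
    print pick_image_to_keep(data, {})  # -> Duplicato.png (the oldest)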
@@ -706,13 +724,6 @@
         # Problems? No, return True
         return True
-    def convert_to_url(self, page):
-        # Function stolen from wikipedia.py
-        """The name of the page this Page refers to, in a form suitable for the URL of the page."""
-        title = page.replace(" ", "_")
-        encodedTitle = title.encode(self.site.encoding())
-        return urllib.quote(encodedTitle)
-
     def checkImageDuplicated(self, image):
         """ Function to check the duplicated images. """
         # {{Dupe|Image:Blanche_Montel.jpg}}
@@ -722,23 +733,12 @@
         dupTalkText = wikipedia.translate(self.site, duplicates_user_talk_text)
         dupComment_talk = wikipedia.translate(self.site, duplicates_comment_talk)
         dupComment_image = wikipedia.translate(self.site, duplicates_comment_image)
-        self.image = image
         duplicateRegex = r'\n\*(?:\[\[:Image:%s\]\] has the following duplicates:|\*\[\[:Image:%s\]\])$' % (self.convert_to_url(self.image), self.convert_to_url(self.image))
-        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
-        wikipedia.output(u'Checking if %s has duplicates...' % image)
-        get_hash = self.site.getUrl(self.site.apipath() + '?action=query&format=xml&titles=Image:%s&prop=imageinfo&iiprop=sha1' % self.convert_to_url(self.image))
-        hash_found_list = re.findall(r'<ii sha1="(.*?)" />', get_hash)
-        if hash_found_list != []:
-            hash_found = hash_found_list[0]
-        else:
-            if imagePage.exists():
-                raise NoHash('No Hash found in the APIs! Maybe the regex to catch it is wrong or someone has changed the APIs structure.')
-            else:
-                wikipedia.output(u'Image deleted before getting the Hash. Skipping...')
-                return False # Error, we need to skip the page.
-        get_duplicates = self.site.getUrl(self.site.apipath() + '?action=query&format=xml&list=allimages&aisha1=%s' % hash_found)
-        duplicates = re.findall(r'<img name="(.*?)".*?/>', get_duplicates)
+        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
+        duplicates = imagePage.getDuplicates()
+        if duplicates == None:
+            return False # Error, we need to skip the page.
         if len(duplicates) > 1:
             if len(duplicates) == 2:
                 wikipedia.output(u'%s has a duplicate! Reporting it...' % self.image)
@@ -758,17 +758,13 @@
         time_list = list()
         for duplicate in duplicates:
             DupePage = wikipedia.ImagePage(self.site, u'Image:%s' % duplicate)
-            imagedata = DupePage.getFileVersionHistory()[-1][0]
-            try:
-                # Example: 21:15, 5 ott 2005
-                data = time.strptime(imagedata, "%H:%M, %d %b %Y")
-            except ValueError:
-                # Example: 21:15, 5 Ottobre 2005
-                data = time.strptime(imagedata, "%H:%M, %d %B %Y")
+            imagedata = DupePage.getLatestUploader()[1]
+            # '2008-06-18T08:04:29Z'
+            data = time.strptime(imagedata, "%Y-%m-%dT%H:%M:%SZ")
             data_seconds = time.mktime(data)
             time_image_list.append([data_seconds, duplicate])
             time_list.append(data_seconds)
-        older_image = returnOlderTime(time_image_list, time_list)
+        older_image = self.returnOlderTime(time_image_list, time_list)
         # And if the images are more than two?
         Page_oder_image = wikipedia.ImagePage(self.site, u'Image:%s' % older_image)
         string = ''
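The single hard-coded strptime format above works because the API returns locale-independent ISO 8601 timestamps, unlike the old upload-history dates that needed two locale-sensitive parse attempts. A self-contained check of that parse, using the sample value from the comment in the diff:

    import time

    imagedata = '2008-06-18T08:04:29Z'  # sample API timestamp from the diff
    data = time.strptime(imagedata, "%Y-%m-%dT%H:%M:%SZ")
    data_seconds = time.mktime(data)    # epoch seconds; mktime assumes local time,
                                        # but the offset is the same for every image
                                        # being compared, so the ordering still holds
    assert time.strftime("%Y-%m-%dT%H:%M:%SZ", data) == imagedata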
@@ -895,6 +891,7 @@
             list_loaded.append(word)

 def checkbot():
+    """ Main function """
     # Command line configurable parameters
     repeat = True # Restart after having check all the images?
     limit = 80 # How many images check?