SVN: [5289] trunk/pywikipedia/checkimages.py - Pywikipedia-l

1 May 2008

Revision: 5289
Author:   filnik
Date:     2008-05-01 15:15:46 +0000 (Thu, 01 May 2008)
Log Message:
-----------
Adding a new functionality: checking if the image has duplicates through the APIs! Adding also some documentation, an error class for the new functionality and some rewrite for the report_image function
Modified Paths:
--------------
    trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================

--- trunk/pywikipedia/checkimages.py	2008-04-30 17:31:44 UTC (rev 5288)
+++ trunk/pywikipedia/checkimages.py	2008-05-01 15:15:46 UTC (rev 5289)
@@ -18,6 +18,8 @@
     -commons        - The Bot will check if an image on Commons has the same name
                     and if true it report the image.
+    -duplicates     - Checking if the image has duplicates.
+
     -break	        - To break the bot after the first check (default: recursive)
-time[:#]	- Time in seconds between repeat runs (default: 30)
@@ -55,9 +57,9 @@
 ---- Known issues/FIXMEs: ----
 * Fix the "real-time" regex and function
 * Add the "catch the language" function for commons.
-* Add new documentation
+* Fix and reorganise the new documentation
 * Add a report for the image tagged.
-* Fix the settings part when the bot save the data (make it better)
+* Implement: Special:FileDuplicateSearch/Image.jpg
 """
#
@@ -70,12 +72,16 @@
 __version__ = '$Id$'
 #
-import re, time, urllib2
-import wikipedia, config, os, locale, sys
-import cPickle, pagegenerators, catlib
+import re, time, urllib, urllib2, os, locale, sys
+import wikipedia, config, pagegenerators, catlib
locale.setlocale(locale.LC_ALL, '')
+class NoHash(wikipedia.Error):
+    """ The APIs don't return any Hash for the image searched.
+        Really Strange, better to raise an error.
+    """
+
 #########################################################################################################################
 # <------------------------------------------- Change only below! ----------------------------------------------------->#
 #########################################################################################################################
@@ -214,12 +220,12 @@
 # This is a list of what bots used this script in your project.
 # NOTE: YOUR Botnick is automatically added. It's not required to add it twice.
 bot_list = {
-            'commons':['Siebot', 'CommonsDelinker'],
-            'en'     :['OrphanBot'],
-            'it'     :['Filbot', 'Nikbot', '.snoopyBot.'],
-            'ja'     :['alexbot'],
-            'ta'     :['TrengarasuBOT'],
-            'zh'     :['alexbot'],
+            'commons':[u'Siebot', u'CommonsDelinker', u'Filbot', u'John Bot', u'Sz-iwbot', u'ABFbot'],
+            'en'     :[u'OrphanBot'],
+            'it'     :[u'Filbot', u'Nikbot', u'.snoopyBot.'],
+            'ja'     :[u'alexbot'],
+            'ta'     :[u'TrengarasuBOT'],
+            'zh'     :[u'alexbot'],
             }
# The message that the bot will add the second time that find another license problem.
@@ -248,7 +254,7 @@
 report_page = {
                 'commons':u'User:Filbot/Report',
                 'en'     :u'User:Filnik/Report',
-                'it'     :u'Progetto:Coordinamento/Immagini/Bot/NowCommons',
+                'it'     :u'Progetto:Coordinamento/Immagini/Bot/Report',
                 'ja'     :u'User:Alexbot/report',
                 'hu'     :u'User:Bdamokos/Report',
                 'ta'     :u'Trengarasu/commonsimages',
@@ -415,8 +421,8 @@
         talk_page = wikipedia.Page(self.site, pagina_discussione)
         self.talk_page = talk_page
         return True
-    # There is the function to put the advise in talk page.
     def put_talk(self, notification, head, notification2 = None, commx = None):
+        """ Function to put the warning in talk page of the uploader."""
         commento2 = wikipedia.translate(self.site, comm2)
         talk_page = self.talk_page
         notification = self.notification
@@ -476,6 +482,7 @@
             talk_page.put(testoattuale + head + notification, comment = commentox, minorEdit = False)
    		
     def untaggedGenerator(self, untaggedProject, limit):
+        """ Generator that yield the images without license. It's based on a tool of the toolserver. """
         lang = untaggedProject.split('.', 1)[0]
         project = '.%s' % untaggedProject.split('.', 1)[1]
         if lang == 'commons':
@@ -494,6 +501,7 @@
                 yield wikiPage
    
     def regexGenerator(self, regexp, textrun):
+        """ Generator used when an user use a regex parsing a page to yield the results """
         pos = 0
         done = list()
         ext_list = list()
@@ -510,15 +518,15 @@
                 yield image
                 #continue
-    def checkImage(self, image):
+    def checkImageOnCommons(self, image):
+        """ Checking if the image is on commons """
         self.image = image
-        # Search regular expression to find links like this (and the class attribute is optional too)
-        # title="Immagine:Nvidia.jpg"
-        wikipedia.output(u'Checking if %s is on commons...' % image)
-        commons = wikipedia.getSite('commons', 'commons') 
-        if wikipedia.Page(commons, u'Image:%s' % image).exists():
-            wikipedia.output(u'%s is on commons!' % image)
-            imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % image)
+        wikipedia.output(u'Checking if %s is on commons...' % self.image)
+        commons = wikipedia.getSite('commons', 'commons')
+        regexOnCommons = r"\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]$" % (self.image, self.image)
+        if wikipedia.Page(commons, u'Image:%s' % self.image).exists():
+            wikipedia.output(u'%s is on commons!' % self.image)
+            imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
             on_commons_text = imagePage.getImagePageHtml()
             if "<div class='sharedUploadNotice'>" in on_commons_text:
                 wikipedia.output(u"But, the image doesn't exist on your project! Skip...")
@@ -529,39 +537,68 @@
                 wikipedia.output(u'%s has "stemma" inside, means that it's ok.' % image)
                 return True # Problems? No, it's only not on commons but the image needs a check
             else:            
-                repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]"
-                self.report_image(self.image, self.rep_page, self.com, repme)
+                repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]" % (self.image, self.image)
+                self.report_image(self.image, self.rep_page, self.com, repme, addings = False, regex = regexOnCommons)
                 # Problems? No, return True
                 return True
         else:
             # Problems? No, return True
             return True
-    def report_image(self, image, rep_page = None, com = None, rep_text = None):
-        if rep_page == None:
-            rep_page = self.rep_page
-        if com == None:
-            com = self.com
-        if rep_text == None:
-            rep_text = self.rep_text
+
+    def convert_to_url(self, page):
+        # Function stolen from wikipedia.py
+        """The name of the page this Page refers to, in a form suitable for the URL of the page."""
+        title = page.replace(" ", "_")
+        encodedTitle = title.encode(self.site.encoding())
+        return urllib.quote(encodedTitle)
+
+    def checkImageDuplicated(self, image):
+        """ Function to check the duplicated images. """
+        self.image = image
+        duplicateRegex = r'\n*(?:[[:Image:%s]] has the following duplicates:|*[[:Image:%s]])$' % (self.image, self.image)
+        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
+        wikipedia.output(u'Checking if %s has duplicates...' % image)
+        get_hash = self.site.getUrl('/w/api.php?action=query&format=xml&titles=Image:%s&prop=imageinfo&iiprop=sha1' % self.convert_to_url(self.image))
+        hash_found_list = re.findall(r'<ii sha1="(.*?)" />', get_hash)
+        if hash_found_list != []:
+            hash_found = hash_found_list[0]
+        else:
+            raise NoHash('No Hash found in the APIs! Maybe the regex to catch it is wrong or someone has changed the APIs structure.')
+        get_duplicates = self.site.getUrl('/w/api.php?action=query&format=xml&list=allimages&aisha1=%s' % hash_found)
+        duplicates = re.findall(r'<img name="(.*?)".*?/>', get_duplicates)
+        if len(duplicates) > 1:
+            if len(duplicates) == 2:
+                wikipedia.output(u'%s has a duplicate! Reporting it...' % self.image)
+            else:
+                wikipedia.output(u'%s has %s duplicates! Reporting them...' % (self.image, len(duplicates) - 1))
+            repme = "\n*[[:Image:%s]] has the following duplicates:" % self.image
+            for duplicate in duplicates:
+                if duplicate == self.image:
+                    continue # the image itself, not report also this as duplicate
+                repme += "\n**[[:Image:%s]]" % duplicate
+            self.report_image(self.image, self.rep_page, self.com, repme + '\n', addings = False, regex = duplicateRegex)            
+        
+    def report_image(self, image, rep_page = None, com = None, rep_text = None, addings = True, regex = None):
+        """ Function to report the images in the report page when needed. """
+        if rep_page == None: rep_page = self.rep_page
+        if com == None: com = self.com
+        if rep_text == None: rep_text = self.rep_text
         another_page = wikipedia.Page(self.site, rep_page)
-		
-        if another_page.exists():      
+	if regex == None: regex = image
+        if another_page.exists():
             text_get = another_page.get()
         else:
             text_get = str()
         if len(text_get) >= self.logFulNumber:
             raise LogIsFull("The log page (%s) is full! Please delete the old images reported." % another_page.title())  
         pos = 0
-        # The talk page includes "_" between the two names, in this way i replace them to " "
-        regex = image
-        n = re.compile(regex, re.UNICODE)
+        # The talk page includes "_" between the two names, in this way i replace them to " "        
+        n = re.compile(regex, re.UNICODE|re.M)
         y = n.search(text_get, pos)
         if y == None:
-            # Adding the log :)
-            if "'''Commons'''" in rep_text:
-                rep_text = rep_text % (image, image)
-            else:
-                rep_text = rep_text % image
+            # Adding the log
+            if addings:
+                rep_text = rep_text % image # Adding the name of the image in the report if not done already              
             another_page.put(text_get + rep_text, comment = com, minorEdit = False)
             wikipedia.output(u"...Reported...")
             reported = True
@@ -572,6 +609,7 @@
         return reported
    
     def takesettings(self):
+        """ Function to take the settings from the wiki. """
         pos = 0
         if self.settings == None: lista = None
         else:
@@ -609,6 +647,7 @@
         return lista
    
     def load(self, raw):
+        """ Load a list of object from a string using regex. """
         list_loaded = list()
         pos = 0
         load_2 = True
@@ -693,6 +732,7 @@
     regexGen = False # Use the regex generator
     untagged = False # Use the untagged generator
     skip_list = list() # Inizialize the skip list used below
+    duplicatesActive = False
# Here below there are the parameters.
     for arg in wikipedia.handleArgs():
@@ -710,6 +750,8 @@
             repeat = False
         elif arg == '-commons':
             commonsActive = True
+        elif arg == '-duplicates':
+            duplicatesActive = True
         elif arg.startswith('-skip'):
             if len(arg) == 5:
                 skip = True
@@ -874,8 +916,7 @@
         else: wikipedia.output(u'\t   >> No additional settings found! <<')
         # Not the main, but the most important loop.
         #parsed = False
-        for image in generator:
-            
+        for image in generator:            
             # When you've a lot of image to skip before working use this workaround, otherwise
             # let this commented, thanks. [ decoment also parsed = False if you want to use it
             #
@@ -930,9 +971,11 @@
                 skip_list.append('skip = Off') # Only to print it once
             # Check on commons if there's already an image with the same name
             if commonsActive == True:
-                response = mainClass.checkImage(imageName)
+                response = mainClass.checkImageOnCommons(imageName)
                 if response == False:
                     continue
+            if duplicatesActive == True:
+                mainClass.checkImageDuplicated(imageName)
             parentesi = False # parentesi are these in italian: { ( ) } []
             delete = False
             tagged = False