[Pywikipedia-l] SVN: [5676] trunk/pywikipedia/checkimages.py - pywikibot

5 Jul 2008

Revision: 5676
Author:   filnik
Date:     2008-07-05 20:26:02 +0000 (Sat, 05 Jul 2008)

Log Message:
-----------
Beta version of the smart detection feature; it already works pretty well.

Modified Paths:
--------------
    trunk/pywikipedia/checkimages.py

Modified: trunk/pywikipedia/checkimages.py
===================================================================

--- trunk/pywikipedia/checkimages.py	2008-07-05 18:21:03 UTC (rev 5675)
+++ trunk/pywikipedia/checkimages.py	2008-07-05 20:26:02 UTC (rev 5676)
@@ -22,6 +22,8 @@
 
     -duplicatesreport   - Report the duplicates in a log *AND* put the template in the
images.
 
+    -smartdetection     - Check in a category if the license found exists in reality or
not.
+
     -sendemail          - Send an email after tagging.
 
     -break	        - To break the bot after the first check (default: recursive)
@@ -308,7 +310,7 @@
                    u'dupe', u'duplicate', u'uncat',
u'uncategorized', u'watermark', u'nocat',
u'imageupload'],
         'de':[u'information'],
         'en':[u'information'],
-        'it':[u'edp', u'informazioni[ _]file',
u'information', u'trademark'],
+        'it':[u'edp', u'informazioni[ _]file',
u'information', u'trademark', u'permissionotrs'],
         'ja':[u'Information'],
         'hu':[u'információ', u'enwiki', u'azonnali'],
         'ta':[u'information'],
@@ -356,6 +358,11 @@
         'it':r'\{\{(?:[Tt]emplate:|)[Cc]ancella[ _]subito[|}]',
         }
 
+category_with_licenses = {
+        'commons':'Category:License tags',
+        'it':'Categoria:Template Licenze copyright',
+        }
+
 ## Put None if you don't use this option or simply add nothing if en
 ## is still None.
 # Page where is stored the message to send as email to the users
@@ -447,14 +454,14 @@
         """ Constructor, define some global variable """
         self.site = site
         self.logFulNumber = logFulNumber
-        self.settings = wikipedia.translate(site, page_with_settings)
-        self.rep_page = wikipedia.translate(site, report_page)
-        self.rep_text = wikipedia.translate(site, report_text)
-        self.com = wikipedia.translate(site, comm10)
+        self.settings = wikipedia.translate(self.site, page_with_settings)
+        self.rep_page = wikipedia.translate(self.site, report_page)
+        self.rep_text = wikipedia.translate(self.site, report_text)
+        self.com = wikipedia.translate(self.site, comm10)
         # Commento = Summary in italian
         self.commento = wikipedia.translate(self.site, comm)
         # Adding the bot's nickname at the notification text if needed.
-        botolist = wikipedia.translate(wikipedia.getSite(), bot_list)     
+        botolist = wikipedia.translate(self.site, bot_list)     
         project = wikipedia.getSite().family.name
         bot = config.usernames[project]
         botnick = bot[self.site.lang]
@@ -807,13 +814,13 @@
                 return False # The image is a duplicate, it will be deleted.
         return True # Ok - No problem. Let's continue the checking phase
         
-    def report_image(self, image, rep_page = None, com = None, rep_text = None, addings =
True, regex = None):
+    def report_image(self, image_to_report, rep_page = None, com = None, rep_text = None,
addings = True, regex = None):
         """ Function to report the images in the report page when needed.
"""
         if rep_page == None: rep_page = self.rep_page
         if com == None: com = self.com
         if rep_text == None: rep_text = self.rep_text
         another_page = wikipedia.Page(self.site, rep_page)
-	if regex == None: regex = image
+	if regex == None: regex = image_to_report
         if another_page.exists():
             text_get = another_page.get()
         else:
@@ -821,25 +828,24 @@
         if len(text_get) >= self.logFulNumber:
             raise LogIsFull("The log page (%s) is full! Please delete the old images
reported." % another_page.title())  
         pos = 0
-        # The talk page includes "_" between the two names, in this way i
replace them to " "        
+        # The talk page includes "_" between the two names, in this way i
replace them to " "
         n = re.compile(regex, re.UNICODE|re.M)
         y = n.search(text_get, pos)
         if y == None:
             # Adding the log
             if addings:
-                rep_text = rep_text % image # Adding the name of the image in the report
if not done already              
+                rep_text = rep_text % image_to_report # Adding the name of the image in
the report if not done already              
             another_page.put(text_get + rep_text, comment = com, minorEdit = False)
             wikipedia.output(u"...Reported...")
             reported = True
         else:
             pos = y.end()
-            wikipedia.output(u"%s is already in the report page." % image)
+            wikipedia.output(u"%s is already in the report page." %
image_to_report)
             reported = False
         return reported
 	
     def takesettings(self):
         """ Function to take the settings from the wiki.
"""
-        pos = 0
         if self.settings == None: lista = None
         else:
             x = wikipedia.Page(self.site, self.settings)
@@ -849,32 +855,41 @@
                 rxp = r"<------- ------->\n\*[Nn]ame ?=
?['\"](.*?)['\"]\n\*([Ff]ind|[Ff]indonly)=(.*?)\n\*[Ii]magechanges=(.*?)\n\*[Ss]ummary=['\"](.*?)['\"]\n\*[Hh]ead=['\"](.*?)['\"]\n\*[Tt]ext
?= ?['\"](.*?)['\"]\n\*[Mm]ex ?=
?['\"]?(.*?)['\"]?$"
                 r = re.compile(rxp, re.UNICODE|re.M)
                 number = 1
-                while 1:
-                    m = r.search(testo, pos)
-                    if m == None:
-                        if lista == list():
-                            wikipedia.output(u"You've set wrongly your settings,
please take a look to the relative page. (run without them)")
-                            lista = None
-                        else:
-                            break
-                    else:
-                        pos = m.end()
-                        name = str(m.group(1))
-                        find_tipe = str(m.group(2))
-                        find = str(m.group(3))
-                        imagechanges = str(m.group(4))
-                        summary = str(m.group(5))
-                        head = str(m.group(6))
-                        text = str(m.group(7))
-                        mexcatched = str(m.group(8))
-                        tupla = [number, name, find_tipe, find, imagechanges, summary,
head, text, mexcatched]
-                        lista += [tupla]
-                        number += 1
+                for m in r.finditer(testo):
+                    name = str(m.group(1))
+                    find_tipe = str(m.group(2))
+                    find = str(m.group(3))
+                    imagechanges = str(m.group(4))
+                    summary = str(m.group(5))
+                    head = str(m.group(6))
+                    text = str(m.group(7))
+                    mexcatched = str(m.group(8))
+                    tupla = [number, name, find_tipe, find, imagechanges, summary, head,
text, mexcatched]
+                    lista += [tupla]
+                    number += 1
+                if lista == list():
+                    wikipedia.output(u"You've set wrongly your settings, please
take a look to the relative page. (run without them)")
+                    lista = None                    
             except wikipedia.NoPage:
                 wikipedia.output(u"The settings' page doesn't exist!")
                 lista = None
         return lista
-	
+
+    def load_licenses(self):
+        """ Load the list of the licenses """
+	catName = wikipedia.translate(self.site, category_with_licenses)
+        cat = catlib.Category(wikipedia.getSite(), catName)
+        categories = [page.title() for page in
pagegenerators.SubCategoriesPageGenerator(cat)]
+        categories.append(catName)
+        list_licenses = list()
+        wikipedia.output(u'\n\t...Loading the names of the licenses
allowed...\n')
+        for catName in categories:
+            cat = catlib.Category(wikipedia.getSite(), catName)
+            gen = pagegenerators.CategorizedPageGenerator(cat)
+            pages = [page for page in gen]
+            list_licenses.extend(pages)
+        return list_licenses
+    
     def load(self, raw):
         """ Load a list of object from a string using regex.
"""
         list_loaded = list()
@@ -885,11 +900,6 @@
         regl = r"(?:\"|\')(.*?)(?:\"|\')(?:, |\])"
         pl = re.compile(regl, re.UNICODE)
         for xl in pl.finditer(raw):
-            if xl == None:
-                if len(list_loaded) >= 1:
-                    return list_loaded
-                    break
-            pos = xl.end()
             word = xl.group(1)
             if word not in list_loaded:
                 list_loaded.append(word)
@@ -911,8 +921,9 @@
     skip_list = list() # Inizialize the skip list used below
     duplicatesActive = False # Use the duplicate option
     duplicatesReport = False # Use the duplicate-report option
-    sendemailActive = False # Use the send-email option
-    
+    sendemailActive = False # Use the send-email
+    smartdetection = False # Enable the smart detection
+        
     # Here below there are the parameters.
     for arg in wikipedia.handleArgs():
         if arg.startswith('-limit'):
@@ -935,6 +946,8 @@
             duplicatesReport = True
         elif arg == '-sendemail':
             sendemailActive = True                   
+        elif arg == '-smartdetection':
+            smartdetection = True   
         elif arg.startswith('-skip'):
             if len(arg) == 5:
                 skip = True
@@ -999,7 +1012,7 @@
                 projectUntagged = str(wikipedia.input(u'In which project should I
work?'))
             elif len(arg) > 9:
                 projectUntagged = str(arg[10:])          
-
+                
     # Understand if the generator it's the default or not.
     try:
         generator
@@ -1090,6 +1103,8 @@
             wikipedia.output(u'Problems with loading the settigs, run without
them.')
             tupla_written = None
             some_problem = False
+        # Load the list of licenses allowed for our project
+        list_licenses = mainClass.load_licenses()
         # Ensure that if the list given is empty it will be converted to
"None"
         # (but it should be already done in the takesettings() function)
         if tupla_written == []: tupla_written = None
@@ -1200,7 +1215,8 @@
                         white_template_found += 1
                         if l != '' and l != ' ': # Check that l is not
nothing or a space
                             # Deleting! (replace the template with nothing)
-                            g = re.sub(r'\{\{(?:template:|)%s' % l.lower(),
r'', g.lower())
+                            regex_white_template =
re.compile(r'\{\{(?:template:|)%s' % l, re.IGNORECASE)
+                            g = regex_white_template.sub(r'', g)
                             hiddenTemplateFound = True
             if white_template_found == 1:
                 wikipedia.output(u'A white template found, skipping the
template...')
@@ -1289,9 +1305,36 @@
                     some_problem = False
                     continue
                 elif parentesi == True:
-                    printWithTimeZone(u"%s seems ok," % imageName)
+                    seems_ok = False
+                    license_found = None
+                    if smartdetection:                    
+                        regex_find_licenses =
re.compile(r'\{\{(?:[Tt]emplate:|)(.*?)(?:[|\n].*?|)\}\}', re.DOTALL)
+                        licenses_found = regex_find_licenses.findall(g)
+                        if licenses_found != []:                        
+                            for license_selected in licenses_found:
+                                #print template.exists()                        
+                                template = wikipedia.Page(site, 'Template:%s' %
license_selected)
+                                if template.isRedirectPage():
+                                    template = template.getRedirectTarget()
+                                license_found = license_selected
+                                if template in list_licenses:
+                                    seems_ok = True
+                                    break
+                        if not seems_ok:
+                            rep_text_license_faked = "\n*[[:Image:%s]] seems to have
a ''fake license'', license detected: %s." % (imageName,
license_found)
+                            regexFakedLicense = r"\* ?\[\[:Image:%s\]\] seems to
have a ''fake license'', license detected: %s." % (imageName,
license_found)
+                            printWithTimeZone(u"%s seems to have a fake license: %s,
reporting..." % (imageName, license_found))
+                            mainClass.report_image(imageName, rep_text =
rep_text_license_faked,
+                                                   addings = False, regex =
regexFakedLicense)
+                    else:
+                        seems_ok = True
+                    if seems_ok:
+                        if license_found != None:
+                            printWithTimeZone(u"%s seems ok, license found:
%s..." % (imageName, license_found))
+                        else:
+                            printWithTimeZone(u"%s seems ok..." % imageName)   
                        
                     # It works also without this... but i want only to be sure ^^
-                    parentesi = False
+                    parentesi = False             
                     continue
                 elif delete == True:
                     wikipedia.output(u"%s is not a file!" % imageName)