[Pywikipedia-l] SVN: [6023] trunk/pywikipedia/checkimages.py

Sun Oct 26 14:09:24 UTC 2008

Revision: 6023
Author:   filnik
Date:     2008-10-26 14:09:24 +0000 (Sun, 26 Oct 2008)

Log Message:
-----------
Really big rewrite! Now the bot uses the API to do its work much better and much faster. I will polish it a bit over the next few days and add the comments, but most of the work is done.

Modified Paths:
--------------
    trunk/pywikipedia/checkimages.py

Modified: trunk/pywikipedia/checkimages.py
===================================================================

--- trunk/pywikipedia/checkimages.py	2008-10-25 23:48:32 UTC (rev 6022)
+++ trunk/pywikipedia/checkimages.py	2008-10-26 14:09:24 UTC (rev 6023)
@@ -22,8 +22,6 @@
 
     -duplicatesreport   - Report the duplicates in a log *AND* put the template in the images.
 
-    -smartdetection     - Check in a category if the license found exist in realit or not.
-
     -sendemail          - Send an email after tagging.
 
     -break            - To break the bot after the first check (default: recursive)
@@ -336,7 +334,7 @@
         'de':[u'information'],
         'en':[u'information'],
         'hu':[u'információ', u'enwiki', u'azonnali'],
-        'it':[u'edp', u'informazioni[ _]file', u'information', u'trademark', u'permissionotrs'], # Put the other in the page on the project defined below
+        'it':[u'edp', u'informazioni file', u'information', u'trademark', u'permissionotrs'], # Put the other in the page on the project defined below
         'ja':[u'Information'],
         'ko':[u'그림 정보'],
         'ta':[u'information'],
@@ -344,7 +342,7 @@
         }
 # A page where there's a list of template to skip.
 PageWithHiddenTemplates = {
-    'commons': u'User:Filbot/White_templates#White_templates',
+    'commons': u'User:Filbot/White_templates2#White_templates',
     'en':None,
     'it':u'Progetto:Coordinamento/Immagini/Bot/WhiteTemplates',
     'ko': u'User:Kwjbot_IV/whitetemplates/list',
@@ -387,7 +385,7 @@
         }
 # Message to put in the talk
 duplicates_user_talk_text = {
-        'commons': u'{{subst:User:Filnik/duplicates|Image:%s|Image:%s}}',
+        'commons': u'{{subst:User:Filnik/duplicates|Image:%s|Image:%s}}', # FIXME: it doesn't exist
         'en'     : None,
         'it'     : u"{{subst:Progetto:Coordinamento/Immagini/Bot/Messaggi/Duplicati|%s|%s|__botnick__}} --~~~~",
         }
@@ -502,10 +500,69 @@
             wikipedia.output(u'No data found.')
             return False
 
+def categoryElementsNumber(CatName):
+    #action=query&prop=categoryinfo&titles=Category:License_tags
+    """
+    """
+    params = {
+        'action'    :'query',
+        'prop'      :'categoryinfo',
+        'titles'    :CatName,
+        }
+
+    data = query.GetData(params,
+                    useAPI = True, encodeTitle = False)
+    pageid = data['query']['pages'].keys()[0]
+    elements = data['query']['pages'][pageid]['categoryinfo']['size']
+    return elements
+
+def categoryAllElements(CatName):
+    #action=query&list=categorymembers&cmlimit=500&cmtitle=Category:License_tags
+    """
+    """
+    wikipedia.output("Loading %s..." % CatName)
+    elements = int(categoryElementsNumber(CatName))
+    elements += 20 # better to be sure that all the elements are loaded
+    if (elements - 20) > 5000:
+        raise wikipedia.Error(u'The category selected as more than 5.000 elements, limit reached')
+    elif elements > 5000: # if they are less then 5000, but for few elements
+        elements = 5000
+    params = {
+        'action'    :'query',
+        'list'      :'categorymembers',
+        'cmlimit'   :str(elements),
+        'cmtitle'   :CatName,
+        }
+
+    data = query.GetData(params,
+                    useAPI = True, encodeTitle = False)
+    
+    members = data['query']['categorymembers']
+    allmembers = members
+    results = list()
+    for subcat in members:
+        ns = subcat['ns']
+        pageid = subcat['pageid']
+        title = subcat['title']
+        if ns == 14:
+            allmembers.extend(categoryAllElements(title))
+            members.remove(subcat)
+    for member in allmembers:
+        ns = member['ns']
+        pageid = member['pageid']
+        title = member['title']
+        results.append(member)
+    return results
+def categoryAllPageObjects(CatName):
+    final = list()
+    for element in categoryAllElements(CatName):
+        final.append(wikipedia.Page(wikipedia.getSite(), element['title']))
+    return final
+
 # Here there is the main class.
 class main:
     def __init__(self, site, logFulNumber = 25000, sendemailActive = False,
-                 duplicatesReport = False, smartdetection = False):
+                 duplicatesReport = False):
         """ Constructor, define some global variable """
         self.site = site
         self.logFulNumber = logFulNumber
@@ -513,7 +570,7 @@
         self.rep_page = wikipedia.translate(self.site, report_page)
         self.rep_text = wikipedia.translate(self.site, report_text)
         self.com = wikipedia.translate(self.site, comm10)
-        self.hiddentemplate = wikipedia.translate(self.site, HiddenTemplate)
+        self.hiddentemplates = wikipedia.translate(self.site, HiddenTemplate)
         self.pageHidden = wikipedia.translate(self.site, PageWithHiddenTemplates)
         self.pageAllowed = wikipedia.translate(self.site, PageWithAllowedTemplates)        
         # Commento = Summary in italian
@@ -533,9 +590,7 @@
         image_n = self.site.image_namespace()
         self.image_namespace = u"%s:" % image_n # Example: "Image:"
         # Load the licenses only once, so do it once
-        self.smartdetection = smartdetection
-        if self.smartdetection:
-            self.list_licenses = self.load_licenses()
+        self.list_licenses = self.load_licenses()
     def setParameters(self, imageName, timestamp, uploader):
         """ Function to set parameters, now only image but maybe it can be used for others in "future" """
         self.imageName = imageName
@@ -736,17 +791,18 @@
         """ Function to load the white templates """
         # A template as {{en is not a license! Adding also them in the whitelist template...
         for langK in wikipedia.Family(u'wikipedia').langs.keys():
-            self.hiddentemplate.append(u'%s' % langK)
+            self.hiddentemplates.append(wikipedia.Page(self.site, u'Template:%s' % langK))
         # The template #if: and #switch: aren't something to care about
-        self.hiddentemplate.extend([u'#if:', u'#switch:'])
+        #self.hiddentemplates.extend([u'#if:', u'#switch:']) FIXME
         # Hidden template loading
         if self.pageHidden != None:
             try:
                 pageHiddenText = wikipedia.Page(self.site, self.pageHidden).get()
             except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                 pageHiddenText = ''
-            self.hiddentemplate.extend(self.load(pageHiddenText))
-        return self.hiddentemplate
+            for element in self.load(pageHiddenText):
+                self.hiddentemplates.append(wikipedia.Page(self.site, element))
+        return self.hiddentemplates
 
     def returnOlderTime(self, listGiven, timeListGiven):
         """ Get some time and return the oldest of them """
@@ -1029,6 +1085,7 @@
 
     def load_licenses(self):
         """ Load the list of the licenses """
+        """
         catName = wikipedia.translate(self.site, category_with_licenses)
         cat = catlib.Category(wikipedia.getSite(), catName)
         categories = [page.title() for page in pagegenerators.SubCategoriesPageGenerator(cat)]
@@ -1040,6 +1097,10 @@
             gen = pagegenerators.CategorizedPageGenerator(cat)
             pages = [page for page in gen]
             list_licenses.extend(pages)
+        """
+        catName = wikipedia.translate(self.site, category_with_licenses)
+        wikipedia.output(u'\n\t...Loading the licenses allowed...\n')
+        list_licenses = categoryAllPageObjects(catName)
 
         # Add the licenses set in the default page as licenses
         # to check
@@ -1049,95 +1110,57 @@
             except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                 pageAllowedText = ''
             for nameLicense in self.load(pageAllowedText):
-                if not 'template:' in nameLicense.lower():
-                    nameLicense = u'Template:%s' % nameLicense
                 pageLicense = wikipedia.Page(self.site, nameLicense)
                 if pageLicense not in list_licenses:
                     list_licenses.append(pageLicense) # the list has wiki-pages
         return list_licenses
 
-    def giveMeTheTemplate(self, license_selected):
-        """ From the name of a template see if it's template:something or just
-            an inclusion of another namespace != template. If it's a redirect
-            gets the real page, if there's a NoPage, return None.
-        """
-        #print template.exists()
-        template = wikipedia.Page(self.site, u'Template:%s' % license_selected)
-        try:
-            template.pageAPInfo()
-        except wikipedia.NoPage:
-            try:
-                template = wikipedia.Page(self.site, license_selected)
-                template.pageAPInfo()
-            except (wikipedia.NoPage, wikipedia.IsRedirectPage):
-                return None # break and exit
-        except wikipedia.IsRedirectPage:
-            template = template.getRedirectTarget()
-        return template
-
-    def smartDetection(self, image_text):
+    def smartDetection(self):
         """ The bot instead of checking if there's a simple template in the
             image's description, checks also if that template is a license or
             something else. In this sense this type of check is smart.
             """
         seems_ok = False
         license_found = None
-        regex_find_licenses = re.compile(r'\{\{(?:[Tt]emplate:|)(.*?)(?:[|\n<].*?|)\}\}', re.DOTALL)
-        licenses_found = regex_find_licenses.findall(image_text)
-        second_round = False
-
-        exit_cicle = False # howTo exit from both the for and the while cicle
-        while 1:
-            if exit_cicle: # howTo exit from the while
-                break
-            if licenses_found != []:
-                for license_selected in licenses_found:
-                    # put the first, if there is problem, this will be reported in the log
-                    if license_found == None:
-                        license_found = license_selected
+        self.hiddentemplates = self.loadHiddenTemplates()      
+        self.licenses_found = self.image.getTemplates()
+        whiteTemplatesFound = False
+        regex_find_licenses = re.compile(r'(?<!\{)\{\{(?:[Tt]emplate:|)([^{]*?)[|\n<}]', re.DOTALL)
+        templatesInTheImageRaw = regex_find_licenses.findall(self.imageCheckText)
+        allLicenses = list()
+        # Found the templates ONLY in the image's description
+        for template_selected in templatesInTheImageRaw:
+            for templateReal in self.licenses_found:
+                if self.convert_to_url(template_selected).lower().replace('template:', '') == \
+                       self.convert_to_url(templateReal.title().lower().replace('template:', '')):
+                    allLicenses.append(templateReal)
+        if self.licenses_found != []:
+            for template in self.licenses_found:
+                license_selected = template.title().replace('Template:', '')
+                if template in self.list_licenses: # the list_licenses are loaded in the __init__ (not to load them multimple times)
+                    seems_ok = True
+                    license_found = license_selected # let the last "fake" license normally detected
+                    break
+                if template in self.hiddentemplates:
+                    # if the whitetemplate is not in the images description, we don't care
                     try:
-                        template = self.giveMeTheTemplate(license_selected)
-                        if template == None:
-                            continue
-                    except wikipedia.BadTitle:
-                        # Template with wrong name, no need to report, simply skip
+                        allLicenses.remove(template)
+                    except ValueError:
                         continue
-                    if template in self.list_licenses: # the list_licenses are loaded in the __init__ (not to load them multimple times)
-                        seems_ok = True
-                        exit_cicle = True
-                        license_found = license_selected # let the last "fake" license normally detected
-                        break
-                # previous block was unsuccessful? Try with the next one
-                for license_selected in licenses_found:
-                    try:
-                        template = self.giveMeTheTemplate(license_selected)
-                        if template == None:
-                            continue # ok, this template it's not ok, continue..                          
-                    except wikipedia.BadTitle:
-                        # Template with wrong name, no need to report, simply skip
-                        continue                          
-                    try:                         
-                        template_text = template.get()            
-                    except wikipedia.NoPage:
-                        continue # ok, this template it's not ok, continue..
-                    regex_noinclude = re.compile(r'<noinclude>(.*?)</noinclude>', re.DOTALL)
-                    template_text = regex_noinclude.sub('', template_text)
-                    if second_round == False:
-                        licenses_found = regex_find_licenses.findall(template_text)
-                        second_round = True
-                        break # only exit from the for, not from the while
                     else:
-                        exit_cicle = True
-                        break
-        if not seems_ok:
+                        whiteTemplatesFound = True
+                        continue
+            if license_found == None and allLicenses != list():
+                license_found = license_selected
+        if not seems_ok and license_found != None:
             rep_text_license_fake = u"\n*[[:Image:%s]] seems to have a ''fake license'', license detected: <nowiki>%s</nowiki>" % (self.imageName, license_found)
             regexFakeLicense = r"\* ?\[\[:Image:%s\]\] seems to have a ''fake license'', license detected: <nowiki>%s</nowiki>$" % (self.imageName, license_found)
             printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (self.imageName, license_found))
             self.report_image(self.imageName, rep_text = rep_text_license_fake,
                                    addings = False, regex = regexFakeLicense)
-        else:
+        elif license_found != None:
             printWithTimeZone(u"%s seems ok, license found: %s..." % (self.imageName, license_found))
-        return license_found
+        return (license_found, whiteTemplatesFound)
 
     def load(self, raw):
         """ Load a list of object from a string using regex. """
@@ -1252,31 +1275,8 @@
                     return True
             elif i.lower() in self.imageCheckText:
                 return True
-        return False # Nothing Found? Ok: False
+        return False # Nothing Found? Ok: False      
 
-    def whiteTemplateEraser(self):
-        """ Erase the white template from the checking text and return how many have been found. """
-        # Load the white templates(hidden template is the same as white template, regarding the meaning)
-        white_templates_found = 0
-        hiddentemplate = self.loadHiddenTemplates()
-        for regexWhiteLicense in hiddentemplate:
-            fullRegexWL = r'\{\{(?:template:|)(?:%s[ \n]*?(?:\n|\||\}|<)|creator:)' % regexWhiteLicense.lower()
-            if self.tagged == False:
-                # why creator? Because on commons there's a template such as {{creator:name}} that.. works
-                res = re.findall(fullRegexWL, self.imageCheckText.lower())
-                if res != []:
-                    for element in res: # if a regex gives more than 1 results, are more than 1 template found.
-                        white_templates_found += 1
-                    if regexWhiteLicense != '' and regexWhiteLicense != ' ': # Check that regexWhiteLicense is not nothing or a space
-                        # Deleting! (replace the template with nothing)
-                        regex_white_template = re.compile(fullRegexWL, re.IGNORECASE)
-                        self.imageCheckText = regex_white_template.sub(r'', self.imageCheckText)
-        if white_templates_found == 1:
-            wikipedia.output(u'A white template found, skipping the template...')
-        elif white_templates_found > 1:
-            wikipedia.output(u'White templates found: %s; skipping those templates...' % white_templates_found)
-        return white_templates_found        
-
     def findAdditionalProblems(self):
         # In every tupla there's a setting configuration
         for tupla in self.settingsData:
@@ -1322,7 +1322,7 @@
                         self.mex_used = mexCatched
                         continue
 
-    def checkStep(self, smartdetection):
+    def checkStep(self):
         # nothing = Defining an empty image description
         nothing = ['', ' ', '  ', '   ', '\n', '\n ', '\n  ', '\n\n', '\n \n', ' \n', ' \n ', ' \n \n']
         # something = Minimal requirements for an image description.
@@ -1369,11 +1369,10 @@
         # Deleting the useless template from the description (before adding something
         # in the image the original text will be reloaded, don't worry).
         self.tagged = self.isTagged()
-        white_templates_found = self.whiteTemplateEraser()
-        if white_templates_found != 0:
-            hiddenTemplateFound = True
-        else:
-            hiddenTemplateFound = False
+        if self.tagged == True:
+            # Tagged? Yes, skip.
+            printWithTimeZone(u'%s is already tagged...' % self.imageName)
+            return True        
         for a_word in something: # something is the array with {{, MIT License and so on.
             if a_word in self.imageCheckText:
                 # There's a template, probably a license (or I hope so)
@@ -1382,6 +1381,7 @@
         for parl in notallowed:
             if parl.lower() in extension.lower():
                 delete = True
+        (license_found, hiddenTemplateFound) = self.smartDetection()
         self.some_problem = False # If it has "some_problem" it must check
                   # the additional settings.
         # if self.settingsData, use addictional settings
@@ -1392,10 +1392,6 @@
         #if p.exists(): <-- improve thebot, better to make as
         #                   less call to the server as possible
         # Here begins the check block.
-        if self.tagged == True:
-            # Tagged? Yes, skip.
-            printWithTimeZone(u'%s is already tagged...' % self.imageName)
-            return True
         if self.some_problem == True:
             if self.mex_used in self.imageCheckText:
                 wikipedia.output(u'Image already fixed. Skip.')
@@ -1414,13 +1410,8 @@
                 wikipedia.output(u"Skipping the image...")
             self.some_problem = False
             return True
-        elif brackets == True:
+        elif brackets == True and license_found != None:
             seems_ok = False
-            license_found = None
-            if smartdetection:
-                license_found = self.smartDetection(self.imageCheckText)
-            else:
-                printWithTimeZone(u"%s seems ok..." % self.imageName)
             # It works also without this... but i want only to be sure ^^
             brackets = False
             return True
@@ -1469,7 +1460,6 @@
     duplicatesActive = False # Use the duplicate option
     duplicatesReport = False # Use the duplicate-report option
     sendemailActive = False # Use the send-email
-    smartdetection = False # Enable the smart detection
 
     # Here below there are the parameters.
     for arg in wikipedia.handleArgs():
@@ -1497,8 +1487,6 @@
             duplicatesReport = True
         elif arg == '-sendemail':
             sendemailActive = True
-        elif arg == '-smartdetection':
-            smartdetection = True
         elif arg.startswith('-skip'):
             if len(arg) == 5:
                 skip = True
@@ -1597,7 +1585,7 @@
     # Main Loop
     while 1:
         # Defing the Main Class.
-        mainClass = main(site, sendemailActive = sendemailActive, duplicatesReport = duplicatesReport, smartdetection = smartdetection)
+        mainClass = main(site, sendemailActive = sendemailActive, duplicatesReport = duplicatesReport)
         # Untagged is True? Let's take that generator
         if untagged == True:
             generator =  mainClass.untaggedGenerator(projectUntagged, limit)
@@ -1674,7 +1662,7 @@
                 response2 = mainClass.checkImageDuplicated(duplicates_rollback)
                 if response2 == False:
                     continue
-            resultCheck = mainClass.checkStep(smartdetection)
+            resultCheck = mainClass.checkStep()
             if resultCheck:
                 continue
     # A little block to perform the repeat or to break.
@@ -1694,5 +1682,5 @@
         final = datetime.datetime.strptime(str(datetime.datetime.utcnow()).split('.')[0], "%Y-%m-%d %H:%M:%S") #timezones are UTC
         delta = final - old
         secs_of_diff = delta.seconds
-        print "seconds: %s" % secs_of_diff
+        wikipedia.output("Execution time: %s" % secs_of_diff)
         wikipedia.stopme()