[Pywikipedia-l] SVN: [5985] trunk/pywikipedia/checkimages.py - pywikibot

17 Oct 2008

Revision: 5985
Author:   filnik
Date:     2008-10-17 17:54:33 +0000 (Fri, 17 Oct 2008)

Log Message:
-----------
Some rewrite, skip -> in a function

Modified Paths:
--------------
    trunk/pywikipedia/checkimages.py

Modified: trunk/pywikipedia/checkimages.py
===================================================================

--- trunk/pywikipedia/checkimages.py	2008-10-16 20:47:02 UTC (rev 5984)
+++ trunk/pywikipedia/checkimages.py	2008-10-17 17:54:33 UTC (rev 5985)
@@ -518,16 +518,19 @@
         botolist.append(botnick)
         self.botolist = botolist
         self.sendemailActive = sendemailActive
+        self.skip_list = list() # Inizialize the skip list used below
         self.duplicatesReport = duplicatesReport
         image_n = self.site.image_namespace()
-        self.image_namespace = "%s:" % image_n # Example: "User_talk:"
+        self.image_namespace = "%s:" % image_n # Example: "Image:"
         # Load the licenses only once, so do it once
         self.smartdetection = smartdetection
         if self.smartdetection:
             self.list_licenses = self.load_licenses()
-    def setParameters(self, image):
+    def setParameters(self, imageName):
         """ Function to set parameters, now only image but maybe it can be used for others in "future" """
-        self.image = image
+        self.imageName = imageName
+        # Defing the image's Page Object
+        self.image = wikipedia.ImagePage(self.site, '%s%s' % (self.image_namespace, self.imageName))
     def report(self, newtext, image_to_report, notification = None, head = None,
                notification2 = None, unver = True, commTalk = None, commImage = None):
         """ Function to make the reports easier (or I hope so). """
@@ -590,23 +593,22 @@
     def tag_image(self, put = True):
         """ Function to add the template in the image and to find out
         who's the user that has uploaded the image. """
-        # Defing the image's Page Object
-        p = wikipedia.ImagePage(self.site, 'Image:%s' % self.image_to_report)
         # Get the image's description
+        reportPageObject = wikipedia.ImagePage(self.site, self.image_namespace + self.image_to_report)
         try:
-            testoa = p.get()
+            reportPageText = reportPageObject.get()
         except wikipedia.NoPage:
-            wikipedia.output(u'%s has been deleted...' % p.title())
+            wikipedia.output(u'%s has been deleted...' % self.imageName)
             # We have a problem! Report and exit!
             return False
         # You can use this function also to find only the user that
         # has upload the image (FixME: Rewrite a bit this part)
         if put:
-            p.put(testoa + self.newtext, comment = self.commImage, minorEdit = True)
+            reportPageObject.put(reportPageText + self.newtext, comment = self.commImage, minorEdit = True)
         # paginetta it's the image page object.
-        paginetta = wikipedia.ImagePage(self.site, self.image_namespace + self.image_to_report)
+        
         try:
-            nick = paginetta.getLatestUploader()[0]
+            nick = reportPageObject.getLatestUploader()[0]
         except wikipedia.NoPage:
             wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image_to_report)
             repme = "\n*[[:Image:%s]] problems '''with the APIs'''"
@@ -614,9 +616,7 @@
             self.report_image(self.image_to_report, self.rep_page, self.com, repme)
             return False
         luser = wikipedia.url2link(nick, self.site, self.site)
-        pagina_discussione = "%s:%s" % (self.site.namespace(3), luser)
-        # Defing the talk page (pagina_discussione = talk_page ^__^ )
-        talk_page = wikipedia.Page(self.site, pagina_discussione)
+        talk_page = wikipedia.Page(self.site, "%s:%s" % (self.site.namespace(3), luser))
         self.talk_page = talk_page
         self.luser = luser
         return True
@@ -787,34 +787,34 @@
 
     def checkImageOnCommons(self):
         """ Checking if the image is on commons """
-        wikipedia.output(u'Checking if %s is on commons...' % self.image)
+        wikipedia.output(u'Checking if %s is on commons...' % self.imageName)
         commons_site = wikipedia.getSite('commons', 'commons')
-        regexOnCommons = r"\n\*\[\[:Image:%s\]\] is also on '''Commons''': \[\[commons:Image:.*?\]\](?: \(same name\)|)$" % self.image
-        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
+        regexOnCommons = r"\n\*\[\[:Image:%s\]\] is also on '''Commons''': \[\[commons:Image:.*?\]\](?: \(same name\)|)$" % self.imageName
+        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.imageName)
         hash_found = imagePage.getHash()
         if hash_found == None:
             return False # Problems? Yes! Image deleted, no hash found. Skip the image.
         else:
             commons_image_with_this_hash = commons_site.getImagesFromAnHash(hash_found)
             if commons_image_with_this_hash != []:
-                wikipedia.output(u'%s is on commons!' % self.image)
-                imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
+                wikipedia.output(u'%s is on commons!' % self.imageName)
+                imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.imageName)
                 on_commons_text = imagePage.getImagePageHtml()
                 if "<div class='sharedUploadNotice'>" in on_commons_text:
                     wikipedia.output(u"But, the image doesn't exist on your project! Skip...")
                     # Problems? Yes! We have to skip the check part for that image!
                     # Because it's on commons but someone has added something on your project.
                     return False
-                elif re.findall(r'\bstemma\b', self.image.lower()) != [] and self.site.lang == 'it':
-                    wikipedia.output(u'%s has "stemma" inside, means that it\'s ok.' % self.image)
+                elif re.findall(r'\bstemma\b', self.imageName.lower()) != [] and self.site.lang == 'it':
+                    wikipedia.output(u'%s has "stemma" inside, means that it\'s ok.' % self.imageName)
                     return True # Problems? No, it's only not on commons but the image needs a check
                 else:
                     # the second usually is a url or something like that. Compare the two in equal way, both url.
-                    if self.convert_to_url(self.image) == self.convert_to_url(commons_image_with_this_hash[0]):
-                        repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]] (same name)" % (self.image, commons_image_with_this_hash[0])
+                    if self.convert_to_url(self.imageName) == self.convert_to_url(commons_image_with_this_hash[0]):
+                        repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]] (same name)" % (self.imageName, commons_image_with_this_hash[0])
                     else:
-                        repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]" % (self.image, commons_image_with_this_hash[0])
-                    self.report_image(self.image, self.rep_page, self.com, repme, addings = False, regex = regexOnCommons)
+                        repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]" % (self.imageName, commons_image_with_this_hash[0])
+                    self.report_image(self.imageName, self.rep_page, self.com, repme, addings = False, regex = regexOnCommons)
                     # Problems? No, return True
                     return True
             else:
@@ -825,7 +825,7 @@
         """ Function to check the duplicated images. """
         # {{Dupe|Image:Blanche_Montel.jpg}}
         # Skip the stub images
-        #if 'stub' in self.image.lower() and self.project == 'wikipedia' and self.site.lang == 'it':
+        #if 'stub' in self.imageName.lower() and self.project == 'wikipedia' and self.site.lang == 'it':
         #    return True # Skip the stub, ok
         dupText = wikipedia.translate(self.site, duplicatesText)
         dupRegex = wikipedia.translate(self.site, duplicatesRegex)
@@ -833,17 +833,17 @@
         dupTalkText = wikipedia.translate(self.site, duplicates_user_talk_text)
         dupComment_talk = wikipedia.translate(self.site, duplicates_comment_talk)
         dupComment_image = wikipedia.translate(self.site, duplicates_comment_image)
-        duplicateRegex = r'\n\*(?:\[\[:Image:%s\]\] has the following duplicates(?: \(\'\'\'forced mode\'\'\'\)|):|\*\[\[:Image:%s\]\])$' % (self.convert_to_url(self.image), self.convert_to_url(self.image))
-        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
+        duplicateRegex = r'\n\*(?:\[\[:Image:%s\]\] has the following duplicates(?: \(\'\'\'forced mode\'\'\'\)|):|\*\[\[:Image:%s\]\])$' % (self.convert_to_url(self.imageName), self.convert_to_url(self.imageName))
+        imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.imageName)
         hash_found = imagePage.getHash()
         duplicates = self.site.getImagesFromAnHash(hash_found)
         if duplicates == None:
             return False # Error, image deleted, no hash found. Skip the image.
         if len(duplicates) > 1:
             if len(duplicates) == 2:
-                wikipedia.output(u'%s has a duplicate! Reporting it...' % self.image)
+                wikipedia.output(u'%s has a duplicate! Reporting it...' % self.imageName)
             else:
-                wikipedia.output(u'%s has %s duplicates! Reporting them...' % (self.image, len(duplicates) - 1))
+                wikipedia.output(u'%s has %s duplicates! Reporting them...' % (self.imageName, len(duplicates) - 1))
             if not dupText == None and not dupRegex == None:
                 time_image_list = list()
                 time_list = list()
@@ -919,17 +919,17 @@
                                 commImage = dupComment_image, unver = True)
             if self.duplicatesReport or only_report:
                 if only_report:
-                    repme = "\n*[[:Image:%s]] has the following duplicates ('''forced mode'''):" % self.convert_to_url(self.image)
+                    repme = "\n*[[:Image:%s]] has the following duplicates ('''forced mode'''):" % self.convert_to_url(self.imageName)
                 else:
-                    repme = "\n*[[:Image:%s]] has the following duplicates:" % self.convert_to_url(self.image)
+                    repme = "\n*[[:Image:%s]] has the following duplicates:" % self.convert_to_url(self.imageName)
                 for duplicate in duplicates:
-                    if self.convert_to_url(duplicate) == self.convert_to_url(self.image):
+                    if self.convert_to_url(duplicate) == self.convert_to_url(self.imageName):
                         continue # the image itself, not report also this as duplicate
                     repme += "\n**[[:Image:%s]]" % self.convert_to_url(duplicate)
-                result = self.report_image(self.image, self.rep_page, self.com, repme, addings = False, regex = duplicateRegex)
+                result = self.report_image(self.imageName, self.rep_page, self.com, repme, addings = False, regex = duplicateRegex)
                 if not result:
                     return True # If Errors, exit (but continue the check)               

-            if older_image != self.image:
+            if older_image != self.imageName:
                 return False # The image is a duplicate, it will be deleted. So skip the check-part, useless
         return True # Ok - No problem. Let's continue the checking phase
 
@@ -1091,13 +1091,13 @@
                         exit_cicle = True
                         break
         if not seems_ok:
-            rep_text_license_fake = "\n*[[:Image:%s]] seems to have a ''fake license'', license detected: {{tl|%s}}." % (self.image, license_found)
-            regexFakeLicense = r"\* ?\[\[:Image:%s\]\] seems to have a ''fake license'', license detected: \{\{tl\|%s\}\}\.$" % (self.image, license_found)
-            printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (self.image, license_found))
-            self.report_image(self.image, rep_text = rep_text_license_fake,
+            rep_text_license_fake = "\n*[[:Image:%s]] seems to have a ''fake license'', license detected: {{tl|%s}}." % (self.imageName, license_found)
+            regexFakeLicense = r"\* ?\[\[:Image:%s\]\] seems to have a ''fake license'', license detected: \{\{tl\|%s\}\}\.$" % (self.imageName, license_found)
+            printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (self.imageName, license_found))
+            self.report_image(self.imageName, rep_text = rep_text_license_fake,
                                    addings = False, regex = regexFakeLicense)
         else:
-            printWithTimeZone(u"%s seems ok, license found: %s..." % (self.image, license_found))
+            printWithTimeZone(u"%s seems ok, license found: %s..." % (self.imageName, license_found))
         return license_found
 
     def load(self, raw):
@@ -1115,6 +1115,29 @@
                 list_loaded.append(word)
         return list_loaded
 
+    def skipImages(self, skip_number, limit):
+        # If the images to skip are more the images to check, make them the same number
+        if skip_number == 0:
+            wikipedia.output(u'\t\t>> No images to skip...<<')
+            return False
+        if skip_number > limit: skip_number = limit
+        # Print a starting message only if no images has been skipped
+        if self.skip_list == []:
+            if skip_number == 1:
+                wikipedia.output(u'Skipping the first image:\n')
+            else:
+                wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
+        # If we still have pages to skip:
+        if len(self.skip_list) < skip_number:
+            wikipedia.output(u'Skipping %s...' % self.imageName)
+            self.skip_list.append(self.imageName)
+            if skip_number == 1:
+                wikipedia.output('')
+            return True
+        else:
+            wikipedia.output('') # Print a blank line.
+            return False
+
 def checkbot():
     """ Main function """
     # Command line configurable parameters
@@ -1127,8 +1150,7 @@
     normal = False # Check the new images or use another generator?
     urlUsed = False # Use the url-related function instead of the new-pages generator
     regexGen = False # Use the regex generator
-    untagged = False # Use the untagged generator
-    skip_list = list() # Inizialize the skip list used below
+    untagged = False # Use the untagged generator   
     duplicatesActive = False # Use the duplicate option
     duplicatesReport = False # Use the duplicate-report option
     sendemailActive = False # Use the send-email
@@ -1181,7 +1203,7 @@
                 firstPageTitle = str(wikipedia.input(u'From witch page do you want to start?'))
             elif len(arg) > 6:
                 firstPageTitle = str(arg[7:])
-            generator = wikipedia.getSite().allpages(start=firstPageTitle ,namespace=6)
+            generator = wikipedia.getSite().allpages(start=firstPageTitle, namespace=6)
             repeat = False
         elif arg.startswith('-page'):
             if len(arg) == 5:
@@ -1355,38 +1377,21 @@
             mainClass.setParameters(imageName) # Setting the image for the main class
             # Skip block
             if skip == True:
-                # If the images to skip are more the images to check, make them the same number
-                if skip_number > limit: skip_number = limit
-                # Print a starting message only if no images has been skipped
-                if skip_list == []:
-                    if skip_number == 1:
-                        wikipedia.output(u'Skipping the first image:\n')
-                    else:
-                        wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
-                # If we still have pages to skip:
-                if len(skip_list) < skip_number:
-                    wikipedia.output(u'Skipping %s...' % imageName)
-                    skip_list.append(imageName)
-                    if skip_number == 1:
-                        wikipedia.output('')
-                        skip = False
-                    continue
-                else:
-                    wikipedia.output('') # Print a blank line.
-                    skip = False
-            elif skip_list == []: # Skip must be false if we are here but
-                       # the user has set 0 as images to skip
-                wikipedia.output(u'\t\t>> No images to skip...<<')
-                skip_list.append('skip = Off') # Only to print it once
+                skip = mainClass.skipImages(skip_number, limit)
+                if skip == True:
+                    continue             
             parentesi = False # parentesi are these in italian: { ( ) } []
             delete = False
             tagged = False
             extension = imageName.split('.')[-1] # get the extension from the image's name
             # Page => ImagePage
             p = wikipedia.ImagePage(site, image.title())
-            # Get the text in the image (called g)
+            # Get the text in the image (called imageCheckText)
             try:
-                g = p.get()
+                # the checkText will be modified in order to make the check phase easier
+                # the imageFullText will be used when the full text is needed without changes
+                imageCheckText = p.get()
+                imageFullText = imageCheckText
             except wikipedia.NoPage:
                 wikipedia.output(u"Skipping %s because it has been deleted." % imageName)
                 continue
@@ -1396,7 +1401,7 @@
             # Delete the fields where the templates cannot be loaded
             regex_nowiki = re.compile(r'<nowiki>(.*?)</nowiki>', re.DOTALL)
             regex_pre = re.compile(r'<pre>(.*?)</pre>', re.DOTALL)
-            g = regex_nowiki.sub('', g); g = regex_pre.sub('', g)
+            imageCheckText = regex_nowiki.sub('', imageCheckText); imageCheckText = regex_pre.sub('', imageCheckText)
             # Check on commons if there's already an image with the same name
             if commonsActive == True:
                 response = mainClass.checkImageOnCommons()
@@ -1413,10 +1418,10 @@
                 # and the regex will be wrong)
                 if '{{' in i:
                     regexP = re.compile('\{\{(?:template|)%s ?(?:\||\n|\}|<) ?' % i.split('{{')[1].replace(' ', '[ _]'), re.I)
-                    result = regexP.findall(g)
+                    result = regexP.findall(imageCheckText)
                     if result != []:
                         tagged = True
-                elif i.lower() in g:
+                elif i.lower() in imageCheckText:
                     tagged = True
             # Deleting the useless template from the description (before adding something
             # in the image the original text will be reloaded, don't worry).
@@ -1425,13 +1430,13 @@
             for l in hiddentemplate:
                 if tagged == False:
                     # why creator? Because on commons there's a template such as {{creator:name}} that.. works
-                    res = re.findall(r'\{\{(?:[Tt]emplate:|)(?:%s[ \n]*?(?:\n|\||\}|<)|creator:)' % l.lower(), g.lower())
+                    res = re.findall(r'\{\{(?:[Tt]emplate:|)(?:%s[ \n]*?(?:\n|\||\}|<)|creator:)' % l.lower(), imageCheckText.lower())
                     if res != []:
                         white_template_found += 1
                         if l != '' and l != ' ': # Check that l is not nothing or a space
                             # Deleting! (replace the template with nothing)
                             regex_white_template = re.compile(r'\{\{(?:template:|)(?:%s|creator)' % l, re.IGNORECASE)
-                            g = regex_white_template.sub(r'', g)
+                            imageCheckText = regex_white_template.sub(r'', imageCheckText)
                             hiddenTemplateFound = True
             if white_template_found == 1:
                 wikipedia.output(u'A white template found, skipping the template...')
@@ -1440,7 +1445,7 @@
             else:
                 wikipedia.output(u'White templates found: %s; skipping those templates...' % white_template_found)
             for a_word in something: # something is the array with {{, MIT License and so on.
-                if a_word in g:
+                if a_word in imageCheckText:
                     # There's a template, probably a license (or I hope so)
                     parentesi = True
             # Is the extension allowed? (is it an image or f.e. a .xls file?)
@@ -1474,7 +1479,7 @@
                     wikipedia.setAction(summary)
                     for k in find_list:
                         if find_tipe.lower() == 'findonly':
-                            if k.lower() == g.lower():
+                            if k.lower() == imageCheckText.lower():
                                 some_problem = True
                                 text_used = text
                                 head_used = head_2
@@ -1484,7 +1489,7 @@
                                 mex_used = mexCatched
                                 break
                         elif find_tipe.lower() == 'find':
-                            if k.lower() in g.lower():
+                            if k.lower() in imageCheckText.lower():
                                 some_problem = True
                                 text_used = text
                                 head_used = head_2
@@ -1503,7 +1508,7 @@
                 printWithTimeZone(u'%s is already tagged...' % imageName)
                 continue
             if some_problem == True:
-                if mex_used in g:
+                if mex_used in imageCheckText:
                     wikipedia.output(u'Image already fixed. Skip.')
                     continue
                 wikipedia.output(u"The image description for %s contains %s..." % (imageName, name_used))
@@ -1524,7 +1529,7 @@
                 seems_ok = False
                 license_found = None
                 if smartdetection:
-                    license_found = mainClass.smartDetection(g)
+                    license_found = mainClass.smartDetection(imageCheckText)
                 else:
                     printWithTimeZone(u"%s seems ok..." % imageName)
                 # It works also without this... but i want only to be sure ^^
@@ -1540,7 +1545,7 @@
                 mainClass.report(canctext, imageName, notification, head)
                 delete = False
                 continue
-            elif g in nothing:
+            elif imageCheckText in nothing:
                 wikipedia.output(u"The image description for %s does not contain a license template!" % imageName)
                 if hiddenTemplateFound and HiddenTN != None and HiddenTN != '' and HiddenTN != ' ':
                     notification = HiddenTN % imageName