[Pywikipedia-l] SVN: [4957] trunk/pywikipedia/checkimages.py

filnik at svn.wikimedia.org filnik at svn.wikimedia.org
Thu Jan 31 17:23:45 UTC 2008

Revision: 4957
Author:   filnik
Date:     2008-01-31 17:23:45 +0000 (Thu, 31 Jan 2008)

Log Message:
Some bugfixes, some rewriting, and some added comments (but the script still needs better documentation)

Modified Paths:

Modified: trunk/pywikipedia/checkimages.py
--- trunk/pywikipedia/checkimages.py	2008-01-31 16:32:39 UTC (rev 4956)
+++ trunk/pywikipedia/checkimages.py	2008-01-31 17:23:45 UTC (rev 4957)
@@ -315,6 +315,10 @@
 	def __init__(self, site, logFulNumber = 25000):
 		self.site = site
 		self.logFulNumber = logFulNumber
+                self.settings = wikipedia.translate(site, page_with_settings)
+                self.rep_page = wikipedia.translate(site, report_page)
+                self.rep_text = wikipedia.translate(site, report_text)
+                self.com = wikipedia.translate(site, comm10)
 	def general(self, newtext, image, notification, head, botolist):
 		""" This class can be called for two reason. So I need two different __init__, one with common data
 			and another with the data that I required... maybe it can be added on the other function, but in this way
@@ -349,7 +353,7 @@
 		if imagedata == list():
 			wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image)
 			repme = "\n*[[:Image:%s]] seems to have problems ('''no data found in the image''')"
-			self.report_image(rep_page, self.image, com, repme)
+			self.report_image(self.image, self.rep_page, self.com, repme)
 			# We have a problem! Report and exit!         
 			return False
@@ -358,7 +362,7 @@
 			wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image)
 			repme = "\n*[[:Image:%s]] seems to have problems ('''no data found in the image''')"
 			# We have a problem! Report and exit!
-			self.report_image(rep_page, self.image, com, repme)
+			self.report_image(self.image, self.rep_page, self.com, repme)
 			return False
 		luser = wikipedia.url2link(nick, self.site, self.site)
 		pagina_discussione = "%s:%s" % (self.site.namespace(3), luser)
@@ -420,7 +424,7 @@
 		elif second_text == False:
 			talk_page.put(testoattuale + head + notification, comment = commentox, minorEdit = False)
-	def untaggedGenerator(self, untaggedProject, rep_page, com):
+	def untaggedGenerator(self, untaggedProject):
 		lang = untaggedProject.split('.', 1)[0]
 		project = '.%s' % untaggedProject.split('.', 1)[1]
 		if lang == 'commons':
@@ -457,6 +461,7 @@
 	def checkImage(self, image):
+                self.image = image
 		# Search regular expression to find links like this (and the class attribute is optional too)
 		# title="Immagine:Nvidia.jpg"
 		wikipedia.output(u'Checking if %s is on commons...' % image)
@@ -472,17 +477,22 @@
 				return False
 			elif 'stemma' in image.lower() and self.site.lang == 'it':
 				wikipedia.output(u'%s has "stemma" inside, means that it\'s ok.' % image)
-				return False
+				return True # Problems? No, it's only not on commons but the image needs a check
 				repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]"
-				self.report_image(rep_page, image, com, repme)
+				self.report_image(self.image, self.rep_page, self.com, repme)
 				# Problems? No, return True
 				return True
 			# Problems? No, return True
 			return True
-	def report_image(self, rep_page, image, com, rep):
+	def report_image(self, image, rep_page = None, com = None, rep_text = None):
+                if rep_page == None:
+                        rep_page = self.rep_page
+                if com == None:
+                        com = self.com
+                if rep_text == None:
+                        rep_text = self.rep_text
 		another_page = wikipedia.Page(self.site, rep_page)
 		if another_page.exists():      
@@ -498,10 +508,10 @@
 		y = n.search(text_get, pos)
 		if y == None:
 			# Adding the log :)
-			if "\'\'\'Commons\'\'\'" in rep:
-				rep_text = rep % (image, image)
+			if "\'\'\'Commons\'\'\'" in rep_text:
+				rep_text = rep_text % (image, image)
-				rep_text = rep % image
+				rep_text = rep_text % image
 			another_page.put(text_get + rep_text, comment = com, minorEdit = False)
 			reported = True
@@ -511,11 +521,11 @@
 			reported = False
 		return reported
-	def takesettings(self, settings):
+	def takesettings(self):
 		pos = 0
-		if settings == None: lista = None
+		if self.settings == None: lista = None
-                        x = wikipedia.Page(self.site, settings)
+                        x = wikipedia.Page(self.site, self.settings)
                         lista = list()
                                 testo = x.get()
@@ -724,10 +734,6 @@
         nn = wikipedia.translate(site, nothing_notification)
         dels = wikipedia.translate(site, del_comm)
         smwl = wikipedia.translate(site, second_message_without_license)
-        settings = wikipedia.translate(site, page_with_settings)
-        rep_page = wikipedia.translate(site, report_page)
-        rep_text = wikipedia.translate(site, report_text)
-        com = wikipedia.translate(site, comm10)
         TextFind = wikipedia.translate(site, txt_find)
         hiddentemplate = wikipedia.translate(site, HiddenTemplate)
         # A template as {{en is not a license! Adding also them in the whitelist template...
@@ -768,7 +774,7 @@
                 mainClass = main(site)
                 # Untagged is True? Let's take that generator
                 if untagged == True:
-                        generator =  mainClass.untaggedGenerator(projectUntagged, rep_page, com)
+                        generator =  mainClass.untaggedGenerator(projectUntagged)
                         normal = False # Ensure that normal is False
                 # Normal True? Take the default generator
                 if normal == True:
@@ -790,7 +796,7 @@
                 # Ok, We (should) have a generator, so let's go on.
                         # Take the additional settings for the Project
-                        tupla_written = mainClass.takesettings(settings)
+                        tupla_written = mainClass.takesettings()
                 except wikipedia.Error:
                         # Error? Settings = None
                         wikipedia.output(u'Problems with loading the settigs, run without them.')
@@ -798,19 +804,12 @@
                         some_problem = False
                 # Ensure that if the list given is empty it will be converted to "None"
                 # (but it should be already done in the takesettings() function)
-                if tupla_written == []:
-                        tupla_written = None
-                if tupla_written != None:
-                        wikipedia.output(u'\t   >> Loaded the real-time page... <<')
-                        # Save the settings not to lose them (FixMe: Make that part better)
-                        # The name is to avoid mistakes when the same bot is run in multiple projects.
-                        filename = "settings-%s.data" % str(site).replace(':', '-')
-                        f = file(filename, 'w')
-                        cPickle.dump(tupla_written, f)
-                        f.close()
-                else:
-                        # No settings found, No problem, continue.
-                        wikipedia.output(u'\t   >> No additional settings found! <<')
+                if tupla_written == []: tupla_written = None
+                # Real-Time page loaded
+                if tupla_written != None: wikipedia.output(u'\t   >> Loaded the real-time page... <<')
+                # No settings found, No problem, continue.
+                else: wikipedia.output(u'\t   >> No additional settings found! <<')
+                # Not the main, but the most important loop.
                 for image in generator:
                         # If I don't inizialize the generator, wait part and skip part are useless
                         if wait:
@@ -830,11 +829,13 @@
                         if skip == True:
                                 # If the images to skip are more the images to check, make them the same number
                                 if skip_number > limit: skip_number = limit
+                                # Print a starting message only if no images have been skipped yet
                                 if skip_list == []:
                                         if skip_number == 1:
                                                 wikipedia.output(u'Skipping the first image:\n')
                                                 wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
+                                # If we still have pages to skip:
                                 if len(skip_list) < skip_number:
                                         wikipedia.output(u'Skipping %s...' % imageName)
@@ -843,25 +844,24 @@
                                                 skip = False 
-                                        wikipedia.output('1\n')
+                                        wikipedia.output('') # Print a blank line.
                                         skip = False					                                               
-                        elif skip_list == []:
+                        elif skip_list == []: # Skip must be false if we are here but
+                                              # the user has set 0 as images to skip
                                 wikipedia.output(u'\t\t>> No images to skip...<<')
                                 skip_list.append('skip = Off') # Only to print it once
+                        # Check on commons if there's already an image with the same name
                         if commonsActive == True:
                                 response = mainClass.checkImage(imageName)
                                 if response == False:
-                        if tupla_written != None:
-                                f = file(filename)
-                                tuplaList = cPickle.load(f)
-                        parentesi = False
+                        parentesi = False # parentesi are these in italian: { ( ) } []
                         delete = False
                         tagged = False
-                        extension = imageName.split('.')[-1]
+                        extension = imageName.split('.')[-1] # get the extension from the image's name
                         # Page => ImagePage
                         p = wikipedia.ImagePage(site, image.title())
-                        # Skip deleted images
+                        # Get the text in the image (called g)
                                 g = p.get()
                         except wikipedia.NoPage:
@@ -870,33 +870,40 @@
                         except wikipedia.IsRedirectPage:
                                 wikipedia.output(u"The file description for %s is a redirect?!" % imageName )
+                        # Is the image already tagged? If yes, no need to double-check, skip
                         for i in TextFind:
+                                # If the entry contains {{ use a regex, otherwise do a plain substring check
+                                # (without the {{ it may not be a template and the regex would be wrong)
                                 if '{{' in i:
                                         regexP = re.compile('\{\{(?:template|)%s ?(?:\||\n|\}) ?' % i.split('{{')[1].replace(' ', '[ _]'), re.I)
                                         result = regexP.findall(g)
                                         if result != []:
                                                 tagged = True
                                 elif i.lower() in g:
-                                        tagged = True				
+                                        tagged = True
+                        # Deleting the useless template from the description (before adding something
+                        # in the image the original text will be reloaded, don't worry).
                         for l in hiddentemplate:
                                 if tagged == False:
                                         res = re.findall(r'\{\{(?:[Tt]emplate:|)%s(?: \n|\||\n|\})' % l.lower(), g.lower())
                                         if res != []:
-                                                #print res
                                                 wikipedia.output(u'A white template found, skipping the template...')
-                                                # I don't delete the template, because if there is something to change the image page
-                                                # will be reloaded. I delete it only for the next check part.
-                                                if l != '' and l != ' ':
+                                                if l != '' and l != ' ': # Check that l is not nothing or a space
+                                                        # Deleting! (replace the template with nothing)
                                                         g = g.lower().replace('{{%s' % l, '')
-                        for a_word in something:
+                        for a_word in something: # something is the array with {{, MIT License and so on.
                                 if a_word in g:
+                                        # There's a template, probably a license (or I hope so)
                                         parentesi = True
+                        # Is the extension allowed? (is it an image or f.e. a .xls file?)
                         for parl in notallowed:
                                 if parl.lower() in extension.lower():
                                         delete = True
-                        some_problem = False
+                        some_problem = False # If it has "some_problem" it must check
+                                             # the additional settings.
+                        # If tupla_written is set, use the additional settings
                         if tupla_written != None:                 
-                                for tupla in tuplaList:
+                                for tupla in tupla_written:
                                         name = tupla[1]
                                         find_tipe = tupla[2]
                                         find = tupla[3]
@@ -916,7 +923,6 @@
                                         text = text % imageName
                                         mexCatched = tupla[8]
-                                        del tupla[0:8]
                                         for k in find_list:
                                                 if find_tipe.lower() == 'findonly':
                                                         if k.lower() == g.lower():
@@ -938,9 +944,12 @@
                                                                 summary_used = summary
                                                                 mex_used = mexCatched
+                        # If the image still exists (maybe it has been deleted during the other
+                        # checking parts or something, who knows? ;-))
                         if p.exists():
                                 # Here begins the check block.
                                 if tagged == True:
+                                        # Tagged? Yes, skip.
                                         printWithTimeZone(u'%s is already tagged...' % imageName)
                                 if some_problem == True:
@@ -951,7 +960,7 @@
                                         if mex_used.lower() == 'default':
                                                 mex_used = unvertext
                                         if imagestatus_used == False:
-                                                reported = mainClass.report_image(rep_page, imageName, com, rep_text)
+                                                reported = mainClass.report_image(imageName)
                                                 reported = True
                                         if reported == True:
@@ -1014,4 +1023,3 @@
-		sys.exit() # Be sure that the Bot will stop

More information about the Pywikipedia-l mailing list