Revision: 4864 Author: filnik Date: 2008-01-13 14:08:31 +0000 (Sun, 13 Jan 2008)
Log Message: ----------- Some rewrite, adding comments and (I hope) a bugfix: #1868451 (I can't test it because my Time Zone is UTC and I can't change it). By the way, I'm still adding comments and rewriting...
Modified Paths: -------------- trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py =================================================================== --- trunk/pywikipedia/checkimages.py 2008-01-12 22:14:39 UTC (rev 4863) +++ trunk/pywikipedia/checkimages.py 2008-01-13 14:08:31 UTC (rev 4864) @@ -53,12 +53,11 @@ * Text= This is the template that the bot will use when it will report the image's problem.
---- Known issues/FIXMEs: ---- -* In repeat mode, skip images already checked. (critical for use on Commons - too many uploads there) * Fix the "real-time" regex and function * Add the "catch the language" function for commons. -* see /home/daniel/public_html/WikiSense/UntaggedImages.php * Add new documentation * Add a report for the image tagged. +* Fix the settings part when the bot save the data (make it better) """
# @@ -283,6 +282,24 @@ class NothingFound(wikipedia.Error): """ An exception indicating that a regex has return [] instead of results."""
+def printWithTimeZone(message): + """ Function to print the messages followed by the TimeZone encoded correctly. """ + if message[-1] != ' ': + message = '%s ' % message + time_zone = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime()) + try: + wikipedia.output(u"%s%s" % (message, time_zone)) + except UnicodeDecodeError: + try: + wikipedia.output(u"%s%s" % (message, time_zone.decode('utf-8'))) + except UnicodeDecodeError: + try: + wikipedia.output(u"%s%s" % (message, time_zone.encode(wikipedia.getSite().encoding()))) + except Exception, e: + # There's some strange error! Skip time_zone printing the error. + print e # Print the Error (not encode/decode, that won't give problem) + wikipedia.output(message) + # When the page is not a wiki-page (as for untagged generator) you need that function def pageText(url): try: @@ -294,7 +311,7 @@ response.close() # When you load to many users, urllib2 can give this error. except urllib2.HTTPError: - wikipedia.output(u"Server error. Pausing for 10 seconds... %s" % time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()) ) + printWithTimeZone(u"Server error. Pausing for 10 seconds... ") time.sleep(10) request = urllib2.Request(url) user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7' @@ -505,7 +522,7 @@ lista = list() try: testo = x.get() - rxp = "<------- ------->\n*[Nn]ame=['"](.*?)['"]\n*([Ff]ind|[Ff]indonly)=(.*?)\n*[Ii]magechanges=(.*?)\n*[Ss]ummary=['"](.*?)['"]\n*[Hh]ead=['"](.*?)['"]\n*[Tt]ext ?= ?['"](.*?)['"]\n*[Mm]ex ?= ?['"]?(.*?)['"]?$" + rxp = "<------- ------->\n*[Nn]ame ?= ?['"](.*?)['"]\n*([Ff]ind|[Ff]indonly)=(.*?)\n*[Ii]magechanges=(.*?)\n*[Ss]ummary=['"](.*?)['"]\n*[Hh]ead=['"](.*?)['"]\n*[Tt]ext ?= ?['"](.*?)['"]\n*[Mm]ex ?= ?['"]?(.*?)['"]?$" r = re.compile(rxp, re.UNICODE|re.M) number = 1 while 1: @@ -530,6 +547,7 @@ lista += [tupla] number += 1 except wikipedia.NoPage: + wikipedia.output(u"The settings' page doesn't exist!") lista = None return lista @@ -595,22 +613,25 @@ wikipedia.output(u"Another error... skipping the user..") break break - + # Here there is the main loop. I'll take all the (name of the) images and then i'll check them. if __name__ == "__main__": try: + + # Command line configurable parameters - repeat = True - limit = 80 - time_sleep = 30 - skip_number = 0 - wait_number = 0 - commonsActive = False - normal = False - urlUsed = False - regexGen = False - untagged = False - + repeat = True # Restart after having check all the images? + limit = 80 # How many images check? + time_sleep = 30 # How many time sleep after the check? + skip_number = 0 # How many images to skip before checking? + wait_number = 0 # How many time sleep before the check? + commonsActive = False # Check if on commons there's an image with the same name? + normal = False # Check the new images or use another generator? + urlUsed = False # Use the url-related function instead of the new-pages generator + regexGen = False # Use the regex generator + untagged = False # Use the untagged generator + skip_list = list() # Inizialize the skip list used below + # Here below there are the parameters. for arg in wikipedia.handleArgs(): if arg.startswith('-limit'): @@ -700,7 +721,7 @@
# Block of text to translate the parameters set above. image_n = site.image_namespace() - image_namespace = "%s:" % image_n + image_namespace = "%s:" % image_n # Example: "User_talk:" unvertext = wikipedia.translate(site, n_txt) commento = wikipedia.translate(site, comm) commento2 = wikipedia.translate(site, comm2) @@ -723,7 +744,8 @@ # A template as {{en is not a license! Adding also them in the whitelist template... for langK in wikipedia.Family('wikipedia').knownlanguages: hiddentemplate.append('%s' % langK) - + + # If the images to skip are 0, set the skip variable to False (the same for the wait time) if skip_number == 0: skip = False if wait_number == 0: @@ -731,8 +753,9 @@ # nothing = Defining an empty image description nothing = ['', ' ', ' ', ' ', '\n', '\n ', '\n ', '\n\n', '\n \n', ' \n', ' \n ', ' \n \n'] # something = Minimal requirements for an image description. - #If this fits, no tagging will take place - something = ['{{', 'MIT']#, '}}'] + # If this fits, no tagging will take place (if there aren't other issues) + # MIT license is ok on italian wikipedia, let also this here + something = ['{{', "'''MIT license'''"] # Don't put "}}" here, please. Useless and can give problems. # Unused file extensions. Does not contain PDF. notallowed = ("xcf", "xls", "sxw", "sxi", "sxc", "sxd", "djvu")
@@ -740,76 +763,101 @@ if lang not in project_inserted: wikipedia.output(u"Your project is not supported by this script. You have to edit the script and add it!") wikipedia.stopme() - + # Some formatting for delete immediately template di = '\n%s' % di dels = dels % di - # Reading the log of the new images + # Reading the log of the new images if another generator is not given. if normal == True: if limit == 1: wikipedia.output(u"Retrieving the latest file for checking...") else: wikipedia.output(u"Retrieving the latest %d files for checking..." % limit) - while 1: + # Main Loop + while 1: + # Defing the Main Class. mainClass = main(site) + # Untagged is True? Let's take that generator if untagged == True: generator = mainClass.untaggedGenerator(projectUntagged, rep_page, com) - normal = False + normal = False # Ensure that normal is False + # Normal True? Take the default generator if normal == True: generator = pagegenerators.NewimagesPageGenerator(number = limit, site = site) + # if urlUsed and regexGen, get the source for the generator if urlUsed == True and regexGen == True: textRegex = pagetext(regexPageUrl) + # Not an url but a wiki page as "source" for the regex elif regexGen == True: pageRegex = wikipedia.Page(site, regexPageName) try: textRegex = pageRegex.get() except wikipedia.NoPage: wikipedia.output(u"%s doesn't exist!" % page.title()) - textRegex = '' + textRegex = '' # No source, so the bot will quit later. + # If generator is the regex' one, use your own Generator using an url or page and a regex. if generator == 'regex' and regexGen == True: generator = mainClass.regexGenerator(regexpToUse, textRegex) + # Ok, We (should) have a generator, so let's go on. try: + # Take the additional settings for the Project tupla_written = mainClass.takesettings(settings) except wikipedia.Error: + # Error? Settings = None wikipedia.output(u'Problems with loading the settigs, run without them.') tupla_written = None some_problem = False + # Ensure that if the list given is empty it will be converted to "None" + # (but it should be already done in the takesettings() function) if tupla_written == []: tupla_written = None if tupla_written != None: wikipedia.output(u'\t >> Loaded the real-time page... <<') + # Save the settings not to lose them (FixMe: Make that part better) filename = "settings.data" f = file(filename, 'w') cPickle.dump(tupla_written, f) f.close() else: + # No settings found, No problem, continue. wikipedia.output(u'\t >> No additional settings found! <<') - if skip == True: - skip_list = list() - wikipedia.output(u'Skipping the first %s images:\n' % skip_number) - else: - wikipedia.output(u'\t\t>> No images to skip...<<') - skipok = False for image in generator: - # If I don't inizialize the generator, wait part has no sense. + # If I don't inizialize the generator, wait part and skip part are useless if wait: - wikipedia.output(u'Waiting %s seconds before checking the images, %s' % (wait_number, time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime()))) + printWithTimeZone(u'Waiting %s seconds before checking the images,' % wait_number) + # Let's sleep... time.sleep(wait_number) + # Never sleep again (we are in a loop) wait = False + # If the generator returns something that is not an image, simply skip it. if normal == False and regexGen == False: if image_namespace.lower() not in image.title().lower() and \ 'image:' not in image.title().lower(): + wikipedia.output(u'%s seems not an image, skip it...' % image.title()) continue - imageName = image.title().split(image_namespace)[1] + imageName = image.title().split(image_namespace)[1] # Deleting the namespace (useless here) + # Skip block if skip == True: + # If the images to skip are more the images to check, make them the same number + if skip_number > limit: skip_number = limit + if skip_list == []: + if skip_number == 1: + wikipedia.output(u'Skipping the first image:\n') + else: + wikipedia.output(u'Skipping the first %s images:\n' % skip_number) if len(skip_list) < skip_number: wikipedia.output(u'Skipping %s...' % imageName) skip_list.append(imageName) + if skip_number == 1: + wikipedia.output('') + skip = False continue else: - if skipok == False: - wikipedia.output('') - skipok = True + wikipedia.output('1\n') + skip = False + elif skip_list == []: + wikipedia.output(u'\t\t>> No images to skip...<<') + skip_list.append('skip = Off') # Only to print it once if commonsActive == True: response = mainClass.checkImage(imageName) if response == False: @@ -898,7 +946,7 @@ if p.exists(): # Here begins the check block. if tagged == True: - wikipedia.output(u'%s is already tagged... %s' % (imageName, time.strftime("%H:%M:%S", time.localtime()))) + printWithTimeZone(u'%s is already tagged...' % imageName) continue if some_problem == True: if mex_used in g: @@ -919,7 +967,7 @@ some_problem = False continue elif parentesi == True: - wikipedia.output(u"%s seems ok, %s" % (imageName, time.strftime("%H:%M:%S", time.localtime()))) + printWithTimeZone(u"%s seems ok," % imageName) # It works also without this... but i want only to be sure ^^ parentesi = False continue @@ -959,16 +1007,7 @@ continue # A little block to perform the repeat or to break. if repeat == True: - time_zone = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime()) - try: - wikipedia.output(u"Waiting for %s seconds, %s" % (time_sleep, time_zone)) - except UnicodeDecodeError: - try: - wikipedia.output(u"Waiting for %s seconds, %s" % (time_sleep, time_zone.decode('utf-8'))) - except Exception, e: - # There's some strange error! Skip time_zone printing the error. - print e - wikipedia.output(u"Waiting for %s seconds") + printWithTimeZone(u"Waiting for %s seconds," % time_sleep) time.sleep(time_sleep) elif repeat == False: wikipedia.output(u"\t\t\t>> STOP! <<")
pywikipedia-l@lists.wikimedia.org