[Pywikipedia-l] SVN: [4864] trunk/pywikipedia/checkimages.py
filnik at svn.wikimedia.org
Sun Jan 13 14:08:32 UTC 2008
Revision: 4864
Author: filnik
Date: 2008-01-13 14:08:31 +0000 (Sun, 13 Jan 2008)
Log Message:
-----------
Some rewriting, added comments, and (I hope) a fix for bug #1868451 (I can't test it because my time zone is UTC and I can't change it). I'm still adding comments and rewriting...
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2008-01-12 22:14:39 UTC (rev 4863)
+++ trunk/pywikipedia/checkimages.py 2008-01-13 14:08:31 UTC (rev 4864)
@@ -53,12 +53,11 @@
* Text= This is the template that the bot will use when it will report the image's problem.
---- Known issues/FIXMEs: ----
-* In repeat mode, skip images already checked. (critical for use on Commons - too many uploads there)
* Fix the "real-time" regex and function
* Add the "catch the language" function for commons.
-* see /home/daniel/public_html/WikiSense/UntaggedImages.php
* Add new documentation
* Add a report for the image tagged.
+* Fix the settings part when the bot saves the data (make it better)
"""
#
@@ -283,6 +282,24 @@
class NothingFound(wikipedia.Error):
""" An exception indicating that a regex has return [] instead of results."""
+def printWithTimeZone(message):
+ """ Function to print the messages followed by the TimeZone encoded correctly. """
+ if message[-1] != ' ':
+ message = '%s ' % message
+ time_zone = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime())
+ try:
+ wikipedia.output(u"%s%s" % (message, time_zone))
+ except UnicodeDecodeError:
+ try:
+ wikipedia.output(u"%s%s" % (message, time_zone.decode('utf-8')))
+ except UnicodeDecodeError:
+ try:
+ wikipedia.output(u"%s%s" % (message, time_zone.encode(wikipedia.getSite().encoding())))
+ except Exception, e:
+ # There's some strange error! Skip the time_zone and print the error.
+ print e # Print the error (not an encode/decode error, so it won't cause problems)
+ wikipedia.output(message)
+
# When the page is not a wiki-page (as for untagged generator) you need that function
def pageText(url):
try:
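
The new printWithTimeZone() helper above factors out a pattern that recurs later in this patch: append a formatted timestamp to a message and fall back through decode/encode attempts when wikipedia.output() raises. A stripped-down sketch of the same idea (Python 2, like the script itself), using plain print so it can be tried outside the pywikipedia framework; print_with_timezone is a hypothetical stand-alone name, not part of the script:

import time

def print_with_timezone(message):
    # Simplified sketch of printWithTimeZone() above.
    if not message.endswith(' '):
        message = '%s ' % message
    # Formats the local time; the "(UTC)" label is literal text and is only
    # accurate when the machine's clock actually runs in UTC, as the log
    # message notes.
    time_zone = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime())
    print '%s%s' % (message, time_zone)

print_with_timezone(u"Server error. Pausing for 10 seconds...")
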
@@ -294,7 +311,7 @@
response.close()
# When you load to many users, urllib2 can give this error.
except urllib2.HTTPError:
- wikipedia.output(u"Server error. Pausing for 10 seconds... %s" % time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()) )
+ printWithTimeZone(u"Server error. Pausing for 10 seconds... ")
time.sleep(10)
request = urllib2.Request(url)
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
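
The hunk above only swaps the logging call; the surrounding code is a plain pause-and-retry pattern around urllib2. A self-contained sketch of that pattern under the same assumptions (Python 2's urllib2, a placeholder User-Agent, a caller-supplied URL), not the script's actual pageText():

import time
import urllib2

def fetch_with_retry(url, pause=10, attempts=3):
    # On an HTTP error, wait a bit and retry with the same headers;
    # give up and return None after the last attempt.
    user_agent = 'Mozilla/5.0 (compatible; example-bot)'
    for attempt in range(attempts):
        request = urllib2.Request(url, headers={'User-Agent': user_agent})
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read()
            finally:
                response.close()
        except urllib2.HTTPError, error:
            print 'Server error (%s). Pausing for %d seconds...' % (error.code, pause)
            time.sleep(pause)
    return None
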
@@ -505,7 +522,7 @@
lista = list()
try:
testo = x.get()
- rxp = "<------- ------->\n\*[Nn]ame=['\"](.*?)['\"]\n\*([Ff]ind|[Ff]indonly)=(.*?)\n\*[Ii]magechanges=(.*?)\n\*[Ss]ummary=['\"](.*?)['\"]\n\*[Hh]ead=['\"](.*?)['\"]\n\*[Tt]ext ?= ?['\"](.*?)['\"]\n\*[Mm]ex ?= ?['\"]?(.*?)['\"]?$"
+ rxp = "<------- ------->\n\*[Nn]ame ?= ?['\"](.*?)['\"]\n\*([Ff]ind|[Ff]indonly)=(.*?)\n\*[Ii]magechanges=(.*?)\n\*[Ss]ummary=['\"](.*?)['\"]\n\*[Hh]ead=['\"](.*?)['\"]\n\*[Tt]ext ?= ?['\"](.*?)['\"]\n\*[Mm]ex ?= ?['\"]?(.*?)['\"]?$"
r = re.compile(rxp, re.UNICODE|re.M)
number = 1
while 1:
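
The only change in this hunk is that the Name line of a settings block may now carry optional spaces around the '=' sign. A quick illustration of what the relaxed fragment accepts (the sample lines are made up, not taken from any real settings page):

import re

# Old and new regex fragments for the "*Name=..." line of a settings block.
old_fragment = re.compile(r"\*[Nn]ame=['\"](.*?)['\"]")
new_fragment = re.compile(r"\*[Nn]ame ?= ?['\"](.*?)['\"]")

samples = ["*Name='No license'", "*Name = 'No license'"]  # hypothetical lines
for line in samples:
    print line, bool(old_fragment.match(line)), bool(new_fragment.match(line))
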
@@ -530,6 +547,7 @@
lista += [tupla]
number += 1
except wikipedia.NoPage:
+ wikipedia.output(u"The settings' page doesn't exist!")
lista = None
return lista
@@ -595,22 +613,25 @@
wikipedia.output(u"Another error... skipping the user..")
break
break
-
+
# Here there is the main loop. I'll take all the (name of the) images and then i'll check them.
if __name__ == "__main__":
try:
+
+
# Command line configurable parameters
- repeat = True
- limit = 80
- time_sleep = 30
- skip_number = 0
- wait_number = 0
- commonsActive = False
- normal = False
- urlUsed = False
- regexGen = False
- untagged = False
-
+ repeat = True # Restart after having checked all the images?
+ limit = 80 # How many images to check?
+ time_sleep = 30 # How long to sleep after the check?
+ skip_number = 0 # How many images to skip before checking?
+ wait_number = 0 # How long to sleep before the check?
+ commonsActive = False # Check whether Commons has an image with the same name?
+ normal = False # Check the new images or use another generator?
+ urlUsed = False # Use the URL-related function instead of the new-pages generator
+ regexGen = False # Use the regex generator
+ untagged = False # Use the untagged generator
+ skip_list = list() # Initialize the skip list used below
+
# Here below there are the parameters.
for arg in wikipedia.handleArgs():
if arg.startswith('-limit'):
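
The defaults above are then overridden in the wikipedia.handleArgs() loop that follows, with options in the usual pywikipedia -name:value form. A stand-alone sketch of that kind of loop; apply_args is a hypothetical helper, -limit is taken from the diff, and -sleep is a made-up option shown purely to illustrate the pattern:

def apply_args(args, limit=80, time_sleep=30):
    # Turn "-name:value" strings into overrides for two of the defaults above.
    for arg in args:
        if arg.startswith('-limit:'):
            limit = int(arg[len('-limit:'):])
        elif arg.startswith('-sleep:'):  # made-up option, for illustration only
            time_sleep = int(arg[len('-sleep:'):])
    return limit, time_sleep

print apply_args(['-limit:25', '-sleep:60'])  # prints (25, 60)
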
@@ -700,7 +721,7 @@
# Block of text to translate the parameters set above.
image_n = site.image_namespace()
- image_namespace = "%s:" % image_n
+ image_namespace = "%s:" % image_n # Example: "User_talk:"
unvertext = wikipedia.translate(site, n_txt)
commento = wikipedia.translate(site, comm)
commento2 = wikipedia.translate(site, comm2)
@@ -723,7 +744,8 @@
# A template as {{en is not a license! Adding also them in the whitelist template...
for langK in wikipedia.Family('wikipedia').knownlanguages:
hiddentemplate.append('%s' % langK)
-
+
+ # If the images to skip are 0, set the skip variable to False (the same for the wait time)
if skip_number == 0:
skip = False
if wait_number == 0:
@@ -731,8 +753,9 @@
# nothing = Defining an empty image description
nothing = ['', ' ', ' ', ' ', '\n', '\n ', '\n ', '\n\n', '\n \n', ' \n', ' \n ', ' \n \n']
# something = Minimal requirements for an image description.
- #If this fits, no tagging will take place
- something = ['{{', 'MIT']#, '}}']
+ # If this fits, no tagging will take place (if there aren't other issues)
+ # The MIT license is OK on the Italian Wikipedia, so keep it here as well
+ something = ['{{', "'''MIT license'''"] # Don't put "}}" here, please: it's useless and can cause problems.
# Unused file extensions. Does not contain PDF.
notallowed = ("xcf", "xls", "sxw", "sxi", "sxc", "sxd", "djvu")
@@ -740,76 +763,101 @@
if lang not in project_inserted:
wikipedia.output(u"Your project is not supported by this script. You have to edit the script and add it!")
wikipedia.stopme()
-
+ # Some formatting for the delete-immediately template
di = '\n%s' % di
dels = dels % di
- # Reading the log of the new images
+ # Reading the log of the new images if another generator is not given.
if normal == True:
if limit == 1:
wikipedia.output(u"Retrieving the latest file for checking...")
else:
wikipedia.output(u"Retrieving the latest %d files for checking..." % limit)
- while 1:
+ # Main Loop
+ while 1:
+ # Instantiate the main class.
mainClass = main(site)
+ # Untagged is True? Let's take that generator
if untagged == True:
generator = mainClass.untaggedGenerator(projectUntagged, rep_page, com)
- normal = False
+ normal = False # Ensure that normal is False
+ # Normal True? Take the default generator
if normal == True:
generator = pagegenerators.NewimagesPageGenerator(number = limit, site = site)
+ # if urlUsed and regexGen, get the source for the generator
if urlUsed == True and regexGen == True:
textRegex = pagetext(regexPageUrl)
+ # Not a URL but a wiki page as the "source" for the regex
elif regexGen == True:
pageRegex = wikipedia.Page(site, regexPageName)
try:
textRegex = pageRegex.get()
except wikipedia.NoPage:
wikipedia.output(u"%s doesn't exist!" % page.title())
- textRegex = ''
+ textRegex = '' # No source, so the bot will quit later.
+ # If the regex generator was chosen, build the bot's own generator from a URL or page plus a regex.
if generator == 'regex' and regexGen == True:
generator = mainClass.regexGenerator(regexpToUse, textRegex)
+ # Ok, We (should) have a generator, so let's go on.
try:
+ # Take the additional settings for the Project
tupla_written = mainClass.takesettings(settings)
except wikipedia.Error:
+ # Error? Settings = None
wikipedia.output(u'Problems with loading the settigs, run without them.')
tupla_written = None
some_problem = False
+ # Ensure that an empty list is converted to "None"
+ # (although that should already be done in the takesettings() function)
if tupla_written == []:
tupla_written = None
if tupla_written != None:
wikipedia.output(u'\t >> Loaded the real-time page... <<')
+ # Save the settings so they aren't lost (FIXME: make this part better)
filename = "settings.data"
f = file(filename, 'w')
cPickle.dump(tupla_written, f)
f.close()
else:
+ # No settings found, No problem, continue.
wikipedia.output(u'\t >> No additional settings found! <<')
- if skip == True:
- skip_list = list()
- wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
- else:
- wikipedia.output(u'\t\t>> No images to skip...<<')
- skipok = False
for image in generator:
- # If I don't inizialize the generator, wait part has no sense.
+ # If I don't initialize the generator, the wait part and skip part are useless
if wait:
- wikipedia.output(u'Waiting %s seconds before checking the images, %s' % (wait_number, time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime())))
+ printWithTimeZone(u'Waiting %s seconds before checking the images,' % wait_number)
+ # Let's sleep...
time.sleep(wait_number)
+ # Never sleep again (we are in a loop)
wait = False
+ # If the generator returns something that is not an image, simply skip it.
if normal == False and regexGen == False:
if image_namespace.lower() not in image.title().lower() and \
'image:' not in image.title().lower():
+ wikipedia.output(u"%s doesn't seem to be an image, skipping it..." % image.title())
continue
- imageName = image.title().split(image_namespace)[1]
+ imageName = image.title().split(image_namespace)[1] # Strip the namespace (not needed here)
+ # Skip block
if skip == True:
+ # If there are more images to skip than images to check, cap the skip count at the limit
+ if skip_number > limit: skip_number = limit
+ if skip_list == []:
+ if skip_number == 1:
+ wikipedia.output(u'Skipping the first image:\n')
+ else:
+ wikipedia.output(u'Skipping the first %s images:\n' % skip_number)
if len(skip_list) < skip_number:
wikipedia.output(u'Skipping %s...' % imageName)
skip_list.append(imageName)
+ if skip_number == 1:
+ wikipedia.output('')
+ skip = False
continue
else:
- if skipok == False:
- wikipedia.output('')
- skipok = True
+ wikipedia.output('1\n')
+ skip = False
+ elif skip_list == []:
+ wikipedia.output(u'\t\t>> No images to skip...<<')
+ skip_list.append('skip = Off') # Only to print it once
if commonsActive == True:
response = mainClass.checkImage(imageName)
if response == False:
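
The reworked skip block above consumes the first skip_number images from the generator, printing each skipped name, then switches skip off so the check continues with the remaining ones. Shown only as an illustration of the same effect, not as the script's code, the skip could also be expressed by dropping the leading items with itertools.islice; skip_first and the sample file names are made up:

import itertools

def skip_first(generator, skip_number):
    # Consume and report the first skip_number items; the remaining
    # items are still waiting in the same iterator.
    for name in itertools.islice(generator, skip_number):
        print 'Skipping %s...' % name
    return generator

images = iter(['A.jpg', 'B.png', 'C.svg', 'D.gif'])  # stand-in for the real generator
for image in skip_first(images, 2):
    print 'Checking %s...' % image
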
@@ -898,7 +946,7 @@
if p.exists():
# Here begins the check block.
if tagged == True:
- wikipedia.output(u'%s is already tagged... %s' % (imageName, time.strftime("%H:%M:%S", time.localtime())))
+ printWithTimeZone(u'%s is already tagged...' % imageName)
continue
if some_problem == True:
if mex_used in g:
@@ -919,7 +967,7 @@
some_problem = False
continue
elif parentesi == True:
- wikipedia.output(u"%s seems ok, %s" % (imageName, time.strftime("%H:%M:%S", time.localtime())))
+ printWithTimeZone(u"%s seems ok," % imageName)
# It works also without this... but i want only to be sure ^^
parentesi = False
continue
@@ -959,16 +1007,7 @@
continue
# A little block to perform the repeat or to break.
if repeat == True:
- time_zone = time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime())
- try:
- wikipedia.output(u"Waiting for %s seconds, %s" % (time_sleep, time_zone))
- except UnicodeDecodeError:
- try:
- wikipedia.output(u"Waiting for %s seconds, %s" % (time_sleep, time_zone.decode('utf-8')))
- except Exception, e:
- # There's some strange error! Skip time_zone printing the error.
- print e
- wikipedia.output(u"Waiting for %s seconds")
+ printWithTimeZone(u"Waiting for %s seconds," % time_sleep)
time.sleep(time_sleep)
elif repeat == False:
wikipedia.output(u"\t\t\t>> STOP! <<")