Revision: 4602 Author: filnik Date: 2007-11-27 15:01:26 +0000 (Tue, 27 Nov 2007)
Log Message: ----------- Adding a new script to check the new images but also the old ones
Added Paths: ----------- trunk/pywikipedia/checkimages.py
Added: trunk/pywikipedia/checkimages.py =================================================================== --- trunk/pywikipedia/checkimages.py (rev 0) +++ trunk/pywikipedia/checkimages.py 2007-11-27 15:01:26 UTC (rev 4602) @@ -0,0 +1,922 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +Script to check recently uploaded files. This script checks if a file +description is present and if there is only a {{PD}} tag in the description. +It will tag a file "no source" in the former case, and request the uploader +to choose a more specific license in the latter case. + +This script will have to be configured for each language. Please submit +translations as addition to the pywikipediabot framework. + +Everything that needs customisation is indicated by comments. + +This script understands the following command-line arguments: + + -limit - The number of images to check (default: 80) + + -commons - The Bot will check if an image on Commons has the same name + and if true it report the image. + + -break - To break the bot after the first check (default: recursive) + + -time[:#] - Time in seconds between repeat runs (default: 30) + + -skip[:#] - The bot skip the first [:#] images (default: 0) + + -start[:#] - Use allpages() as generator (it starts already form Image:[:#]) + + -cat[:#] - Use a category as generator + + -regex[:#] - Use regex, must be used with -url or -page + + -page[:#] - Define the name of the wikipage where are the images + + -url[:#] - Define the url where are the images + + -untagged[:#] - Use daniel's tool as generator ( http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php ) + +---- Istructions for the real-time settings ---- +* For every new block you have to add: + +<------- -------> + +In this way the Bot can understand where the block start to take the right parameter. + +* Name= Set the name of the block +* Find= Use it to define what search in the text of the image's description, +while Findonly= search only if the exactly text that you give is in the image's description. +* Summary= That's the summary that the bot will use when it will notify the problem. +* Head= That's the incipit that the bot will use for the message. +* Text= This is the template that the bot will use when it will report the image's problem. + +---- Known issues/FIXMEs: ---- +* In repeat mode, skip images already checked. (critical for use on Commons - too many uploads there) +* Fix the "real-time" regex and function +* Add the "catch the language" function for commons. +* see /home/daniel/public_html/WikiSense/UntaggedImages.php +* Add new documentation +* Add a report for the image tagged. +""" + +# +# (C) Kyle/Orgullomoore, 2006-2007 (newimage.py) +# (C) Siebrand Mazeland, 2007 +# (C) Filnik, 2007 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id: checkimages.py,v 1.0 2007/11/27 16:00:25 filnik Exp$' +# + +import re, time, urllib2 +import wikipedia, config, os +import cPickle, pagegenerators, catlib + +######################################################################################################################### +# <------------------------------------------- Change only below! -----------------------------------------------------># +######################################################################################################################### + +# That's what you want that will be added. (i.e. the {{no source}} with the right day/month/year ) +n_txt = { + 'commons':'\n{{subst:nld}}', + 'en' :'\n{{subst:nld}}', + 'it' :'\n{{subst:unverdata}}', + } + +txt_find = { + 'commons':['{{no license', '{{nld'], + 'en':['{{nld', '{{no license'], + 'it':['{{unverdata', '{{unverified'], + } + +# Summary for when the will add the no source +comm = { + 'commons':'Bot: Marking newly uploaded untagged file', + 'en' :'Bot: Marking newly uploaded untagged file', + 'it' :"Bot: Aggiungo unverified", + } + +# Summary that the bot use when it notify the problem with the image's license +comm2 = { + 'commons':"Bot: Requesting source information." , + 'en' :"Bot: Requesting source information." , + 'it' :"Bot: Notifico l'unverified", + } + +# When the Bot find that the usertalk is empty is not pretty to put only the no source without the welcome, isn't it? +empty = { + 'commons':'{{subst:welcome}}\n~~~~\n', + 'en' :'{{welcome}}\n~~~~\n', + 'it' :'{{benvenuto}}\n~~~~\n', + } + +# General summary +unver = { + 'commons':'Bot: no source', + 'en' :'Bot: no source', + 'it' :'Bot: Unverified!', + } + +# if the file has an unknown extension it will be tagged with this template. +# In reality, there aren't unknown extension, they are only not allewed... ^__^ +delete_immediately = { + 'commons':"{{db-meta|The file has .%s as extension.}}", + 'en' :"{{db-meta|The file has .%s as extension.}}", + 'it' :'{{cancella subito|motivo=Il file ha come estensione ".%s"}}', + } + +# The header of the Unknown extension's message. +delete_immediately_head = { + 'commons':"\n== Unknown extension! ==\n", + 'en' :"\n== Unknown extension! ==\n", + 'it' :'\n== File non specificato ==\n', + } + +# Text that will be add if the bot find a unknown extension. +delete_immediately_notification = { + 'commons':'The [[:Image:%s]] file has a wrong extension, please check. ~~~~', + 'en' :'The [[:Image:%s]] file has a wrong extension, please check. ~~~~', + 'it' :'{{subst:Utente:Filbot/Ext|%s}}', + } +# Summary of the delate immediately. (f.e: Adding {{db-meta|The file has .%s as extension.}}) +del_comm = { + 'commons':'Bot: Adding %s', + 'en' :'Bot: Adding %s', + 'it' :'Bot: Aggiungo %s', + } + +# This is the most important header, because it will be used a lot. That's the header that the bot +# will add if the image hasn't the license. +nothing_head = { + 'commons':"",# Nothing, the template has already the header inside. + 'en' :"\n== Image without license ==\n", + 'it' :"\n== Immagine senza licenza ==\n", + } +# That's the text that the bot will add if it doesn't find the license. +nothing_notification = { + 'commons':"{{subst:User:Filnik/untagged|Image:%s}}Image:%s}}\n\n''This message was '''added automatically by [[User:Filbot|Filbot]]''', if you need some help about it, ask [[User:Filnik|its master]] or go to the [[Commons:Help desk]]''. --~~~~", + 'en' :"{{subst:image source|Image:%s}} --~~~~", + 'it' :"{{subst:Utente:Filbot/Senza licenza|%s}} --~~~~", + } +# This is a list of what bots used this script in your project. +# NOTE: YOUR Botnick is automatically added. It's not required to add it twice. +bot_list = { + 'commons':['Siebot', 'CommonsDelinker'], + 'en' :['OrphanBot'], + 'it' :['Filbot', 'Nikbot', '.snoopyBot.'], + } + +# The message that the bot will add the second time that find another license problem. +second_message_without_license = { + 'commons':None, + 'en': None, + 'it':'{{subst:Utente:Filbot/Senza licenza2|%s}} --~~~~', + } +# You can add some settings to wikipedia. In this way, you can change them without touch the code. +# That's useful if you are running the bot on Toolserver. +page_with_settings = { + 'commons':None, + 'en':None, + 'it':'Utente:Nikbot/Settings#Settings', + } +# The bot can report some images (like the images that have the same name of an image on commons) +# This is the page where the bot will store them. +report_page = { + 'commons':'User:Filbot/Report', + 'en' :'User:Filnik/Report', + 'it' :'Utente:Nikbot/Report', + } +# Adding the date after the signature. +timeselected = u' ~~~~~' +# The text added in the report +report_text = { + 'commons':"\n*[[:Image:%s]] " + timeselected, + 'en':"\n*[[:Image:%s]] " + timeselected, + 'it':"\n*[[:Immagine:%s]] " + timeselected, + } +# The summary of the report +comm10 = { + 'commons':'Bot: Updating the log', + 'en':'Bot: Updating the log', + 'it':'Bot: Aggiorno il log', + } + +# If a template isn't a license but it's included on a lot of images, that can be skipped to +# analise the image without taking care of it. (the template must be in a list) +HiddenTemplate = { + 'commons':['{{information'], + 'en':['{{information'], + 'it':['{{edp', '{{informazioni file', '{{information'], + } + +# Add your project (in alphabetical order) if you want that the bot start +project_inserted = ['commons', 'en', 'it'] + +# Ok, that's all. What is below, is the rest of code, now the code is fixed and it will run correctly in your project. +######################################################################################################################### +# <------------------------------------------- Change only above! -----------------------------------------------------># +######################################################################################################################### + +class LogIsFull(wikipedia.Error): + """An exception indicating that the log is full and the Bot cannot add other data to prevent Errors.""" + +class NothingFound(wikipedia.Error): + """ An exception indicating that a regex has return [] instead of results.""" + +def pageText(url): + try: + request = urllib2.Request(url) + user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7' + request.add_header("User-Agent", user_agent) + response = urllib2.urlopen(request) + text = response.read() + response.close() + # When you load to many users, urllib2 can give this error. + except urllib2.HTTPError: + wikipedia.output(u"Server error. Pausing for 10 seconds... " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.gmtime()) ) + time.sleep(10) + request = urllib2.Request(url) + user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7' + request.add_header("User-Agent", user_agent) + response = urllib2.urlopen(request) + text = response.read() + response.close() + return text + +# Here there is the main class. +class main: + def __init__(self, site): + self.site = site + self.logFulNumber = 25000 + def general(self, newtext, image, notification, head, botolist): + """ This class can be called for two reason. So I need two different __init__, one with common data + and another with the data that I required... maybe it can be added on the other function, but in this way + seems more clear what parameters I need + """ + self.newtext = newtext + self.image = image + self.head = head + self.notification = notification + self.botolist = botolist + def put_mex(self, put = True): + # Adding no source. - I'm sure that the image exists, double check... but another can't be useless. + try: + testoa = p.get() + except wikipedia.NoPage: + wikipedia.output(u'%s has been deleted...' % p.title()) + if put: + p.put(testoa + self.newtext, comment = commento, minorEdit = True) + # paginetta it's the image page object. + paginetta = wikipedia.ImagePage(self.site, image_namespace + self.image) + # I take the data of the latest uploader and I take only the name + imagedata = paginetta.getFileVersionHistory() + #print imagedata # Let it so for de-buggin porpuse (wikipedia.output gives error) + # When an Image is deleted from Commons and someone has add something in the wikipedia page + # The bot doesn't catch the data properly :-) + if imagedata == list(): + wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image) + repme = "\n*[[:Image:%s]] seems to have problems ('''no data found in the image''')" + self.report_image(rep_page, self.image, com, repme) + # We have a problem! Report and exit! + return False + try: + nick = paginetta.getFileVersionHistory()[-1][1] + except IndexError: + wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image) + repme = "\n*[[:Image:%s]] seems to have problems ('''no data found in the image''')" + # We have a problem! Report and exit! + self.report_image(rep_page, self.image, com, repme) + return False + luser = wikipedia.url2link(nick, self.site, self.site) + pagina_discussione = self.site.namespace(3) + ':' + luser + # Defing the talk page (pagina_discussione = talk_page ^__^ ) + talk_page = wikipedia.Page(self.site, pagina_discussione) + self.talk_page = talk_page + return True + # There is the function to put the advise in talk page. + def put_talk(self, notification, head, notification2 = None, commx = None): + talk_page = self.talk_page + notification = self.notification + if notification2 == None: + notification2 = notification + else: + notification2 = notification2 % self.image + head = self.head + second_text = False + # Getting the talk page's history, to check if there is another advise... + # The try block is used to prevent error if you use an old wikipedia.py's version. + edit_to_load = 10 + if talk_page.exists(): + try: + history = talk_page.getVersionHistory(False, False, False, edit_to_load) + except TypeError: + history = talk_page.getVersionHistory(False, False, False) + latest_edit = history[0] + latest_user = latest_edit[2] + wikipedia.output(u'The latest user that has written something is: ' + latest_user) + else: + wikipedia.output(u'The user page is blank') + + if talk_page.exists(): + testoattuale = talk_page.get() + # Find out the list of Bots that add no source tags. + lang = config.mylang + # Standard language + self.lang = lang + project = config.family + bot = config.usernames[project] + botnick = bot[lang] + botolist = self.botolist + [botnick] + for i in botolist: + if latest_user == i: + second_text = True + # A block to prevent the second message if the bot also welcomed users... + if latest_edit == history[-1]: + second_text = False + else: + second_text = False + testoattuale = ti_es_ti + if commx == None: + commentox = commento2 + else: + commentox = commx + if second_text == True: + talk_page.put(testoattuale + "\n\n:" + notification2, comment = commentox, minorEdit = False) + elif second_text == False: + talk_page.put(testoattuale + head + notification, comment = commentox, minorEdit = False) + def run_bot(self, textrun, rep_page, com): + # Search regular expression to find links like this (and the class attribute is optional too) + # class="new" title="Immagine:Soldatino2.jpg">Immagine:Soldatino2.jpg</a>" <span class="comment"> + regexp = r'(class="new" |)title="' + image_namespace + '(.*?).(\w\w\w|jpeg)">.*?</a>".*?<span class="comment">' + pos = 0 + done = list() + ext_list = list() + r = re.compile(regexp, re.UNICODE) + while 1: + m = r.search(textrun, pos) + if m == None: + wikipedia.output(u"\t\t>> All images checked. <<") + break + pos = m.end() + new = m.group(1) + im = m.group(2) + ext = m.group(3) + # This prevent pages with strange characters. They will be loaded without problem. + image = im + "." + ext + if new != '': + wikipedia.output(u"Skipping %s because it has been deleted." % image) + done.append(image) + if image not in done: + done.append(image) + yield image + #continue + + def untaggedGenerator(self, untaggedProject, rep_page, com): + lang = untaggedProject.split('.', 1)[0] + project = '.' + untaggedProject.split('.', 1)[1] + if lang == 'commons': + link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikifam=commo...' + else: + link = 'http://tools.wikimedia.de/~daniel/WikiSense/UntaggedImages.php?wikilang=' + lang + '&wikifam=' + project + '&order=img_timestamp&max=' + str(limit) + '&ofs=0&max=' + str(limit) + text = pageText(link) + #print text + regexp = r"""<td valign='top' title='Name'><a href='http://.*?\..*?\.org/w/index\.php\?title=(.*?)'>.*?</a></td>""" + results = re.findall(regexp, text) + if results == []: + print link + raise NothingFound('Nothing found! Try to use the tool by yourself to be sure that it works!') + else: + for result in results: + yield wikipedia.Page(self.site, result) + + def regexGenerator(self, regexp, textrun): + pos = 0 + done = list() + ext_list = list() + r = re.compile(r'%s' % regexp, re.UNICODE|re.M) + while 1: + m = r.search(textrun, pos) + if m == None: + wikipedia.output(u"\t\t>> All images checked. <<") + break + pos = m.end() + image = m.group(1) + if image not in done: + done.append(image) + yield image + #continue + + def checkImage(self, image): + # Search regular expression to find links like this (and the class attribute is optional too) + # title="Immagine:Nvidia.jpg" + wikipedia.output(u'Checking if %s is on commons...' % image) + commons = wikipedia.getSite('commons', 'commons') + if wikipedia.Page(commons, u'Image:' + image).exists(): + wikipedia.output(u'%s is on commons!' % image) + imagePage = wikipedia.ImagePage(self.site, 'Image:' + image) + on_commons_text = imagePage.getImagePageHtml() + if "<div class='sharedUploadNotice'>" in on_commons_text: + wikipedia.output(u"But, the image doesn't exist on your project! Skip...") + # Problems? Yes! We have to skip the check part for that image! + # Because it's on commons but someone has added something on your project. + return False + elif 'stemma' in image.lower() and self.site.lang == 'it': + wikipedia.output(u'%s has "stemma" inside, means that it's ok.' % image) + return False + else: + repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]" + self.report_image(rep_page, image, com, repme) + # Problems? No, return True + return True + else: + # Problems? No, return True + return True + + def report_image(self, rep_page, image, com, rep): + another_page = wikipedia.Page(self.site, rep_page) + + if another_page.exists(): + text_get = another_page.get() + else: + text_get = str() + if len(text_get) >= self.logFulNumber: + raise LogIsFull("The log page (%s) is full! Please delete the old images reported." % another_page.title()) + pos = 0 + # The talk page includes "_" between the two names, in this way i replace them to " " + regex = image + n = re.compile(regex, re.UNICODE) + y = n.search(text_get, pos) + if y == None: + # Adding the log :) + if "'''Commons'''" in rep: + rep_text = rep % (image, image) + else: + rep_text = rep % image + another_page.put(text_get + rep_text, comment = com, minorEdit = False) + wikipedia.output(u"...Reported...") + reported = True + else: + pos = y.end() + wikipedia.output(u"%s is already in the report page." % image) + reported = False + return reported + + def takesettings(self, settings): + pos = 0 + x = wikipedia.Page(self.site, settings) + lista = list() + try: + testo = x.get() + rxp = "<------- ------->\n*[Nn]ame=['"](.*?)['"]\n*([Ff]ind|[Ff]indonly)=(.*?)\n*[Ii]magechanges=(.*?)\n*[Ss]ummary=['"](.*?)['"]\n*[Hh]ead=['"](.*?)['"]\n*[Tt]ext ?= ?['"](.*?)['"]\n*[Mm]ex ?= ?['"]?(.*?)['"]?$" + r = re.compile(rxp, re.UNICODE|re.M) + number = 1 + while 1: + m = r.search(testo, pos) + if m == None: + if lista == list(): + wikipedia.output(u"You've set wrongly your settings, please take a look to the relative page. (run without them)") + lista = None + else: + break + else: + pos = m.end() + name = str(m.group(1)) + find_tipe = str(m.group(2)) + find = str(m.group(3)) + imagechanges = str(m.group(4)) + summary = str(m.group(5)) + head = str(m.group(6)) + text = str(m.group(7)) + mexcatched = str(m.group(8)) + tupla = [number, name, find_tipe, find, imagechanges, summary, head, text, mexcatched] + lista += [tupla] + number += 1 + except wikipedia.NoPage: + lista = None + return lista + + def load(self, raw): + list_loaded = list() + pos = 0 + load_2 = True + # I search with a regex how many user have not the talk page + # and i put them in a list (i find it more easy and secure) + while 1: + regl = "("|')(.*?)("|')(, |])" + pl = re.compile(regl, re.UNICODE) + xl = pl.search(raw, pos) + if xl == None: + if len(list_loaded) >= 1: + return list_loaded + break + elif len(done) == 0: + break + pos = xl.end() + word = xl.group(2) + if word not in list_loaded: + list_loaded.append(word) + +# I've seen that the report class before (the main) was to long to be called so, +# here there is a function that has all the settings, so i can call it once ^__^ +def report(newtext, image, notification, head, notification2 = None, unver = True, commx = None): + global botolist + while 1: + run = main(site = wikipedia.getSite()) + secondrun = run.general(newtext, image, notification, head, botolist) + if unver == True: + try: + resPutMex = run.put_mex() + except wikipedia.NoPage: + wikipedia.output(u"The page has been deleted! Skip!") + break + except wikipedia.EditConflict: + wikipedia.output(u"Edit conflict! Skip!") + break + else: + if resPutMex == False: + break + else: + try: + resPutMex = run.put_mex(False) + except wikipedia.NoPage: + wikipedia.output(u"The page has been deleted!") + break + except wikipedia.EditConflict: + wikipedia.output(u"Edit conflict! Skip!") + break + else: + if resPutMex == False: + break + try: + run.put_talk(notification, head, notification2, commx) + except wikipedia.EditConflict: + wikipedia.output(u"Edit Conflict! Retrying...") + try: + run.put_talk(notification, head, notification2, commx) + except: + wikipedia.output(u"Another error... skipping the user..") + break + break + +# Here there is the main loop. I'll take all the (name of the) images and then i'll check them. +if __name__ == "__main__": + try: + # Command line configurable parameters + repeat = True + limit = 80 + time_sleep = 30 + skip_number = 0 + commonsActive = False + normal = False + urlUsed = False + regexGen = False + untagged = False + + # Here below there are the parameters. + for arg in wikipedia.handleArgs(): + if arg.startswith('-limit'): + if len(arg) == 7: + limit = int(wikipedia.input(u'How many images do you want to check?')) + else: + limit = int(arg[7:]) + if arg.startswith('-time'): + if len(arg) == 5: + time_sleep = int(wikipedia.input(u'How many seconds do you want runs to be apart?')) + else: + time_sleep = int(arg[6:]) + elif arg == '-break': + repeat = False + elif arg == '-commons': + commonsActive = True + elif arg.startswith('-skip'): + if len(arg) == 5: + skip = True + skip_number = int(wikipedia.input(u'How many images do you want to skip?')) + elif len(arg) > 5: + skip = True + skip_number = int(arg[6:]) + elif arg.startswith('-start'): + if len(arg) == 6: + firstPageTitle = str(wikipedia.input(u'From witch page do you want to start?')) + elif len(arg) > 6: + firstPageTitle = str(arg[7:]) + generator = wikipedia.getSite().allpages(start='Image:'+firstPageTitle) + repeat = False + elif arg.startswith('-page:'): + if len(arg) == 6: + regexPageName = str(wikipedia.input(u'Which page do you want to use for the regex?')) + elif len(arg) > 6: + regexPageName = str(arg[6:]) + repeat = False + regexGen = True + elif arg.startswith('-url:'): + if len(arg) == 5: + regexPageUrl = str(wikipedia.input(u'Which url do you want to use for the regex?')) + elif len(arg) > 5: + regexPageUrl = str(arg[5:]) + urlUsed = True + repeat = False + regexGen = True + elif arg.startswith('-regex:'): + if len(arg) == 7: + regexpToUse = str(wikipedia.input(u'Which regex do you want to use?')) + elif len(arg) > 7: + regexpToUse = str(arg[7:]) + generator = 'regex' + repeat = False + elif arg.startswith('-cat'): + if len(arg) == 4: + catName = str(wikipedia.input(u'In which category do I work?')) + elif len(arg) > 4: + catName = str(arg[5:]) + catSelected = catlib.Category(wikipedia.getSite(), 'Category:'+catName) + generator = pagegenerators.CategorizedPageGenerator(catSelected) + repeat = False + elif arg.startswith('-untagged'): + untagged = True + if len(arg) == 9: + projectUntagged = str(wikipedia.input(u'In which project do I work?')) + elif len(arg) > 9: + projectUntagged = str(arg[10:]) + + # Understand if the generator it's the default or not. + try: + generator + except NameError: + normal = True + + # URL of the log of newimages. (http:/en.wikipedia.org/ will generated according to the project... you won't see it in the url) + url = "/w/index.php?title=Special:Log&type=upload&user=&page=&pattern=&limit=%d&offset=0" % limit + + # Define the site. + site = wikipedia.getSite() + + # In this way i find what language, project and what bot do you use. + lang = config.mylang + project = config.family + + # Block of text to translate the parameters set above. + image_n = site.image_namespace() + image_namespace = image_n + ":" + unvertext = wikipedia.translate(site, n_txt) + commento = wikipedia.translate(site, comm) + commento2 = wikipedia.translate(site, comm2) + ti_es_ti = wikipedia.translate(site, empty) + unverf = wikipedia.translate(site, unver) + di = wikipedia.translate(site, delete_immediately) + dih = wikipedia.translate(site, delete_immediately_head) + din = wikipedia.translate(site, delete_immediately_notification) + nh = wikipedia.translate(site, nothing_head) + nn = wikipedia.translate(site, nothing_notification) + dels = wikipedia.translate(site, del_comm) + botolist = wikipedia.translate(site, bot_list) + smwl = wikipedia.translate(site, second_message_without_license) + settings = wikipedia.translate(site, page_with_settings) + rep_page = wikipedia.translate(site, report_page) + rep_text = wikipedia.translate(site, report_text) + com = wikipedia.translate(site, comm10) + TextFind = wikipedia.translate(site, txt_find) + hiddentemplate = wikipedia.translate(site, HiddenTemplate) + + if skip_number == 0: + skip = False + # nothing = Defining an empty image description + nothing = ['', ' ', ' ', ' ', '\n', '\n ', '\n ', '\n\n', '\n \n', ' \n', ' \n ', ' \n \n'] + # something = Minimal requirements for an image description. + #If this fits, no tagging will take place + something = ['{{', 'MIT']#, '}}'] + # Unused file extensions. Does not contain PDF. + notallowed = ("xcf", "xls", "sxw", "sxi", "sxc", "sxd", "djvu") + + # A little block-statement to ensure that the bot will not start with en-parameters + if lang not in project_inserted: + wikipedia.output(u"Your project is not supported by this script. You have to edit the script and add it!") + wikipedia.stopme() + + di = '\n' + di + dels = dels % di + + # Reading the log of the new images + if normal == True: + if limit == 1: + wikipedia.output(u"Retrieving the lastest file for checking...") + else: + wikipedia.output(u"Retrieving the lastest %d files for checking..." % limit) + + while 1: + # If I use the standard way, I have to download the page to parse it. + if normal == True: + textrun = site.getUrl(url) + + mainClass = main(site) + if untagged == True: + generator = mainClass.untaggedGenerator(projectUntagged, rep_page, com) + normal = False + if normal == True: + generator = mainClass.run_bot(textrun, rep_page, com) + + if urlUsed == True and regexGen == True: + textRegex = pagetext(regexPageUrl) + elif regexGen == True: + pageRegex = wikipedia.Page(site, regexPageName) + try: + textRegex = pageRegex.get() + except wikipedia.NoPage: + wikipedia.output(u"%s doesn't exist!" % page.title()) + textRegex = '' + if generator == 'regex' and regexGen == True: + generator = mainClass.regexGenerator(regexpToUse, textRegex) + try: + tupla_written = mainClass.takesettings(settings) + except wikipedia.Error: + wikipedia.output(u'Problems with loading the settigs, run without them.') + tupla_written = None + some_problem = False + if tupla_written != None: + wikipedia.output(u'\t >> Loaded the real-time page... <<') + filename = "settings.data" + f = file(filename, 'w') + cPickle.dump(tupla_written, f) + f.close() + else: + wikipedia.output(u"No additional settings found!") + if skip == True: + skip_list = list() + wikipedia.output(u'Skipping the first ' + str(skip_number) + u' images:\n') + else: + wikipedia.output(u'\t\t>> No images to skip...<<') + skipok = False + for image in generator: + if normal == False and regexGen == False: + if image_namespace not in image.title(): + continue + image = image.title().split(image_namespace)[1] + elif regexGen == True: + image = image.split(image_namespace)[1] + if skip == True: + if len(skip_list) < skip_number: + wikipedia.output(u'Skipping %s...' % image) + skip_list.append(image) + continue + else: + if skipok == False: + wikipedia.output('') + skipok = True + if commonsActive == True: + response = mainClass.checkImage(image) + if response == False: + continue + if tupla_written != None: + f = file(filename) + tuplaList = cPickle.load(f) + parentesi = False + delete = False + tagged = False + extension = image.split('.')[-1] + page = image_namespace + image + p = wikipedia.ImagePage(site, page) + # Skip deleted images + try: + g = p.get() + except wikipedia.NoPage: + wikipedia.output(u"Skipping %s because it has been deleted." % image) + continue + except wikipedia.IsRedirectPage: + wikipedia.output(u"The file description for %s is a redirect?!" % image ) + continue + for l in hiddentemplate: + if l.lower() in g.lower(): + wikipedia.output(u'A white template found, skipping the template...') + #whiteTemplate = True + #final_text = g + g = g.lower().replace(l, '') + #print g + for a_word in something: + if a_word in g: + parentesi = True + for parl in notallowed: + if parl.lower() in extension.lower(): + delete = True + for i in TextFind: + if i.lower() in g: + tagged = True + some_problem = False + if tupla_written != None: + for tupla in tuplaList: + name = tupla[1] + find_tipe = tupla[2] + find = tupla[3] + find_list = mainClass.load(find) + imagechanges = tupla[4] + if imagechanges.lower() == 'false': + imagestatus = False + elif imagechanges.lower() == 'true': + imagestatus = True + else: + wikipedia.output(u"Error! Imagechanges set wrongly!") + tupla_written = None + break + summary = tupla[5] + head_2 = tupla[6] + text = tupla[7] + text = text % image + mexCatched = tupla[8] + wikipedia.setAction(summary) + del tupla[0:8] + for k in find_list: + if find_tipe.lower() == 'findonly': + if k.lower() == g.lower(): + some_problem = True + text_used = text + head_used = head_2 + imagestatus_used = imagestatus + name_used = name + summary_used = summary + mex_used = mexCatched + break + elif find_tipe.lower() == 'find': + if k.lower() in g.lower(): + some_problem = True + text_used = text + head_used = head_2 + imagestatus_used = imagestatus + name_used = name + summary_used = summary + mex_used = mexCatched + continue + if p.exists(): + # Here there is the checkin ^^ + if tagged == True: + wikipedia.output(image + u' is already tagged... ' + time.strftime("%H:%M:%S", time.localtime())) + continue + if some_problem == True: + if mex_used in g: + wikipedia.output(u'Image already fixed. Skip.') + continue + wikipedia.output(u"The image description for %s contains %s..." % (image, name_used)) + if mex_used.lower() == 'default': + mex_used = unvertext + if imagestatus_used == False: + reported = mainClass.report_image(rep_page, image, com, rep_text) + else: + reported = True + if reported == True: + #if imagestatus_used == True: + report(mex_used, image, text_used, "\n" + head_used + "\n", None, imagestatus_used, summary_used) + else: + wikipedia.output(u"Skipping the image...") + some_problem = False + continue + elif parentesi == True: + wikipedia.output(image + u" seems ok, " + time.strftime("%H:%M:%S", time.localtime())) + # It works also without this... but i want only to be sure ^^ + parentesi = False + continue + elif delete == True: + wikipedia.output(u"%s is not a file!" % image) + # Modify summary text + wikipedia.setAction(dels) + canctext = di % extension + notification = din % image + head = dih + report(canctext, image, notification, head) + delete = False + continue + elif g in nothing: + wikipedia.output(u"The image description for %s does not contain a license template!" % image) + if lang == 'commons': + head = nh % image + else: + head = nh + if lang == 'commons': + notification = nn + else: + notification = nn % image + report(unvertext, image, notification, head, smwl) + continue + else: + wikipedia.output(u"%s has only text and not the specific license..." % image) + if lang == 'commons': + head = nh % image + else: + head = nh + if lang == 'commons': + notification = nn + else: + notification = nn % image + report(unvertext, image, notification, head, smwl) + continue + # A little block to perform the repeat or to break. + if repeat == True: + wikipedia.output(u"Waiting for " + str(time_sleep) + u" seconds, " + time.strftime("%d %b %Y %H:%M:%S (UTC)", time.localtime()) ) + time.sleep(time_sleep) + elif repeat == False: + wikipedia.output(u"\t\t\t>> STOP! <<") + wikipedia.stopme() + break + except wikipedia.BadTitle: + wikipedia.output(u"Wikidown or server's problem, quit") + wikipedia.stopme() + finally: + wikipedia.stopme()
pywikipedia-l@lists.wikimedia.org