Revision: 5676 Author: filnik Date: 2008-07-05 20:26:02 +0000 (Sat, 05 Jul 2008)
Log Message: ----------- Beta version of the smart detection feature; not final yet, but working pretty well.
Modified Paths: -------------- trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py =================================================================== --- trunk/pywikipedia/checkimages.py 2008-07-05 18:21:03 UTC (rev 5675) +++ trunk/pywikipedia/checkimages.py 2008-07-05 20:26:02 UTC (rev 5676) @@ -22,6 +22,8 @@
-duplicatesreport - Report the duplicates in a log *AND* put the template in the images.
+ -smartdetection - Check in a category if the license found exists in reality or not. + -sendemail - Send an email after tagging.
-break - To break the bot after the first check (default: recursive) @@ -308,7 +310,7 @@ u'dupe', u'duplicate', u'uncat', u'uncategorized', u'watermark', u'nocat', u'imageupload'], 'de':[u'information'], 'en':[u'information'], - 'it':[u'edp', u'informazioni[ _]file', u'information', u'trademark'], + 'it':[u'edp', u'informazioni[ _]file', u'information', u'trademark', u'permissionotrs'], 'ja':[u'Information'], 'hu':[u'információ', u'enwiki', u'azonnali'], 'ta':[u'information'], @@ -356,6 +358,11 @@ 'it':r'{{(?:[Tt]emplate:|)[Cc]ancella[ _]subito[|}]', }
+category_with_licenses = { + 'commons':'Category:License tags', + 'it':'Categoria:Template Licenze copyright', + } + ## Put None if you don't use this option or simply add nothing if en ## is still None. # Page where is stored the message to send as email to the users @@ -447,14 +454,14 @@ """ Constructor, define some global variable """ self.site = site self.logFulNumber = logFulNumber - self.settings = wikipedia.translate(site, page_with_settings) - self.rep_page = wikipedia.translate(site, report_page) - self.rep_text = wikipedia.translate(site, report_text) - self.com = wikipedia.translate(site, comm10) + self.settings = wikipedia.translate(self.site, page_with_settings) + self.rep_page = wikipedia.translate(self.site, report_page) + self.rep_text = wikipedia.translate(self.site, report_text) + self.com = wikipedia.translate(self.site, comm10) # Commento = Summary in italian self.commento = wikipedia.translate(self.site, comm) # Adding the bot's nickname at the notification text if needed. - botolist = wikipedia.translate(wikipedia.getSite(), bot_list) + botolist = wikipedia.translate(self.site, bot_list) project = wikipedia.getSite().family.name bot = config.usernames[project] botnick = bot[self.site.lang] @@ -807,13 +814,13 @@ return False # The image is a duplicate, it will be deleted. return True # Ok - No problem. Let's continue the checking phase
- def report_image(self, image, rep_page = None, com = None, rep_text = None, addings = True, regex = None): + def report_image(self, image_to_report, rep_page = None, com = None, rep_text = None, addings = True, regex = None): """ Function to report the images in the report page when needed. """ if rep_page == None: rep_page = self.rep_page if com == None: com = self.com if rep_text == None: rep_text = self.rep_text another_page = wikipedia.Page(self.site, rep_page) - if regex == None: regex = image + if regex == None: regex = image_to_report if another_page.exists(): text_get = another_page.get() else: @@ -821,25 +828,24 @@ if len(text_get) >= self.logFulNumber: raise LogIsFull("The log page (%s) is full! Please delete the old images reported." % another_page.title()) pos = 0 - # The talk page includes "_" between the two names, in this way i replace them to " " + # The talk page includes "_" between the two names, in this way i replace them to " " n = re.compile(regex, re.UNICODE|re.M) y = n.search(text_get, pos) if y == None: # Adding the log if addings: - rep_text = rep_text % image # Adding the name of the image in the report if not done already + rep_text = rep_text % image_to_report # Adding the name of the image in the report if not done already another_page.put(text_get + rep_text, comment = com, minorEdit = False) wikipedia.output(u"...Reported...") reported = True else: pos = y.end() - wikipedia.output(u"%s is already in the report page." % image) + wikipedia.output(u"%s is already in the report page." % image_to_report) reported = False return reported def takesettings(self): """ Function to take the settings from the wiki. 
""" - pos = 0 if self.settings == None: lista = None else: x = wikipedia.Page(self.site, self.settings) @@ -849,32 +855,41 @@ rxp = r"<------- ------->\n*[Nn]ame ?= ?['"](.*?)['"]\n*([Ff]ind|[Ff]indonly)=(.*?)\n*[Ii]magechanges=(.*?)\n*[Ss]ummary=['"](.*?)['"]\n*[Hh]ead=['"](.*?)['"]\n*[Tt]ext ?= ?['"](.*?)['"]\n*[Mm]ex ?= ?['"]?(.*?)['"]?$" r = re.compile(rxp, re.UNICODE|re.M) number = 1 - while 1: - m = r.search(testo, pos) - if m == None: - if lista == list(): - wikipedia.output(u"You've set wrongly your settings, please take a look to the relative page. (run without them)") - lista = None - else: - break - else: - pos = m.end() - name = str(m.group(1)) - find_tipe = str(m.group(2)) - find = str(m.group(3)) - imagechanges = str(m.group(4)) - summary = str(m.group(5)) - head = str(m.group(6)) - text = str(m.group(7)) - mexcatched = str(m.group(8)) - tupla = [number, name, find_tipe, find, imagechanges, summary, head, text, mexcatched] - lista += [tupla] - number += 1 + for m in r.finditer(testo): + name = str(m.group(1)) + find_tipe = str(m.group(2)) + find = str(m.group(3)) + imagechanges = str(m.group(4)) + summary = str(m.group(5)) + head = str(m.group(6)) + text = str(m.group(7)) + mexcatched = str(m.group(8)) + tupla = [number, name, find_tipe, find, imagechanges, summary, head, text, mexcatched] + lista += [tupla] + number += 1 + if lista == list(): + wikipedia.output(u"You've set wrongly your settings, please take a look to the relative page. 
(run without them)") + lista = None except wikipedia.NoPage: wikipedia.output(u"The settings' page doesn't exist!") lista = None return lista - + + def load_licenses(self): + """ Load the list of the licenses """ + catName = wikipedia.translate(self.site, category_with_licenses) + cat = catlib.Category(wikipedia.getSite(), catName) + categories = [page.title() for page in pagegenerators.SubCategoriesPageGenerator(cat)] + categories.append(catName) + list_licenses = list() + wikipedia.output(u'\n\t...Loading the names of the licenses allowed...\n') + for catName in categories: + cat = catlib.Category(wikipedia.getSite(), catName) + gen = pagegenerators.CategorizedPageGenerator(cat) + pages = [page for page in gen] + list_licenses.extend(pages) + return list_licenses + def load(self, raw): """ Load a list of object from a string using regex. """ list_loaded = list() @@ -885,11 +900,6 @@ regl = r"(?:"|')(.*?)(?:"|')(?:, |])" pl = re.compile(regl, re.UNICODE) for xl in pl.finditer(raw): - if xl == None: - if len(list_loaded) >= 1: - return list_loaded - break - pos = xl.end() word = xl.group(1) if word not in list_loaded: list_loaded.append(word) @@ -911,8 +921,9 @@ skip_list = list() # Inizialize the skip list used below duplicatesActive = False # Use the duplicate option duplicatesReport = False # Use the duplicate-report option - sendemailActive = False # Use the send-email option - + sendemailActive = False # Use the send-email + smartdetection = False # Enable the smart detection + # Here below there are the parameters. 
for arg in wikipedia.handleArgs(): if arg.startswith('-limit'): @@ -935,6 +946,8 @@ duplicatesReport = True elif arg == '-sendemail': sendemailActive = True + elif arg == '-smartdetection': + smartdetection = True elif arg.startswith('-skip'): if len(arg) == 5: skip = True @@ -999,7 +1012,7 @@ projectUntagged = str(wikipedia.input(u'In which project should I work?')) elif len(arg) > 9: projectUntagged = str(arg[10:]) - + # Understand if the generator it's the default or not. try: generator @@ -1090,6 +1103,8 @@ wikipedia.output(u'Problems with loading the settigs, run without them.') tupla_written = None some_problem = False + # Load the list of licenses allowed for our project + list_licenses = mainClass.load_licenses() # Ensure that if the list given is empty it will be converted to "None" # (but it should be already done in the takesettings() function) if tupla_written == []: tupla_written = None @@ -1200,7 +1215,8 @@ white_template_found += 1 if l != '' and l != ' ': # Check that l is not nothing or a space # Deleting! 
(replace the template with nothing) - g = re.sub(r'{{(?:template:|)%s' % l.lower(), r'', g.lower()) + regex_white_template = re.compile(r'{{(?:template:|)%s' % l, re.IGNORECASE) + g = regex_white_template.sub(r'', g) hiddenTemplateFound = True if white_template_found == 1: wikipedia.output(u'A white template found, skipping the template...') @@ -1289,9 +1305,36 @@ some_problem = False continue elif parentesi == True: - printWithTimeZone(u"%s seems ok," % imageName) + seems_ok = False + license_found = None + if smartdetection: + regex_find_licenses = re.compile(r'{{(?:[Tt]emplate:|)(.*?)(?:[|\n].*?|)}}', re.DOTALL) + licenses_found = regex_find_licenses.findall(g) + if licenses_found != []: + for license_selected in licenses_found: + #print template.exists() + template = wikipedia.Page(site, 'Template:%s' % license_selected) + if template.isRedirectPage(): + template = template.getRedirectTarget() + license_found = license_selected + if template in list_licenses: + seems_ok = True + break + if not seems_ok: + rep_text_license_faked = "\n*[[:Image:%s]] seems to have a ''fake license'', license detected: %s." % (imageName, license_found) + regexFakedLicense = r"* ?[[:Image:%s]] seems to have a ''fake license'', license detected: %s." % (imageName, license_found) + printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (imageName, license_found)) + mainClass.report_image(imageName, rep_text = rep_text_license_faked, + addings = False, regex = regexFakedLicense) + else: + seems_ok = True + if seems_ok: + if license_found != None: + printWithTimeZone(u"%s seems ok, license found: %s..." % (imageName, license_found)) + else: + printWithTimeZone(u"%s seems ok..." % imageName) # It works also without this... but i want only to be sure ^^ - parentesi = False + parentesi = False continue elif delete == True: wikipedia.output(u"%s is not a file!" % imageName)
pywikipedia-l@lists.wikimedia.org