Revision: 6023 Author: filnik Date: 2008-10-26 14:09:24 +0000 (Sun, 26 Oct 2008)
Log Message: ----------- Really big rewrite! now the bot uses APIs to do the work really better and really faster! I will fix it a bit in the next days and add the comments, but the most is done
Modified Paths: -------------- trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py =================================================================== --- trunk/pywikipedia/checkimages.py 2008-10-25 23:48:32 UTC (rev 6022) +++ trunk/pywikipedia/checkimages.py 2008-10-26 14:09:24 UTC (rev 6023) @@ -22,8 +22,6 @@
-duplicatesreport - Report the duplicates in a log *AND* put the template in the images.
- -smartdetection - Check in a category if the license found exist in realit or not. - -sendemail - Send an email after tagging.
-break - To break the bot after the first check (default: recursive) @@ -336,7 +334,7 @@ 'de':[u'information'], 'en':[u'information'], 'hu':[u'információ', u'enwiki', u'azonnali'], - 'it':[u'edp', u'informazioni[ _]file', u'information', u'trademark', u'permissionotrs'], # Put the other in the page on the project defined below + 'it':[u'edp', u'informazioni file', u'information', u'trademark', u'permissionotrs'], # Put the other in the page on the project defined below 'ja':[u'Information'], 'ko':[u'그림 정보'], 'ta':[u'information'], @@ -344,7 +342,7 @@ } # A page where there's a list of template to skip. PageWithHiddenTemplates = { - 'commons': u'User:Filbot/White_templates#White_templates', + 'commons': u'User:Filbot/White_templates2#White_templates', 'en':None, 'it':u'Progetto:Coordinamento/Immagini/Bot/WhiteTemplates', 'ko': u'User:Kwjbot_IV/whitetemplates/list', @@ -387,7 +385,7 @@ } # Message to put in the talk duplicates_user_talk_text = { - 'commons': u'{{subst:User:Filnik/duplicates|Image:%s|Image:%s}}', + 'commons': u'{{subst:User:Filnik/duplicates|Image:%s|Image:%s}}', # FIXME: it doesn't exist 'en' : None, 'it' : u"{{subst:Progetto:Coordinamento/Immagini/Bot/Messaggi/Duplicati|%s|%s|__botnick__}} --~~~~", } @@ -502,10 +500,69 @@ wikipedia.output(u'No data found.') return False
+def categoryElementsNumber(CatName): + #action=query&prop=categoryinfo&titles=Category:License_tags + """ + """ + params = { + 'action' :'query', + 'prop' :'categoryinfo', + 'titles' :CatName, + } + + data = query.GetData(params, + useAPI = True, encodeTitle = False) + pageid = data['query']['pages'].keys()[0] + elements = data['query']['pages'][pageid]['categoryinfo']['size'] + return elements + +def categoryAllElements(CatName): + #action=query&list=categorymembers&cmlimit=500&cmtitle=Category:License_tags + """ + """ + wikipedia.output("Loading %s..." % CatName) + elements = int(categoryElementsNumber(CatName)) + elements += 20 # better to be sure that all the elements are loaded + if (elements - 20) > 5000: + raise wikipedia.Error(u'The category selected as more than 5.000 elements, limit reached') + elif elements > 5000: # if they are less then 5000, but for few elements + elements = 5000 + params = { + 'action' :'query', + 'list' :'categorymembers', + 'cmlimit' :str(elements), + 'cmtitle' :CatName, + } + + data = query.GetData(params, + useAPI = True, encodeTitle = False) + + members = data['query']['categorymembers'] + allmembers = members + results = list() + for subcat in members: + ns = subcat['ns'] + pageid = subcat['pageid'] + title = subcat['title'] + if ns == 14: + allmembers.extend(categoryAllElements(title)) + members.remove(subcat) + for member in allmembers: + ns = member['ns'] + pageid = member['pageid'] + title = member['title'] + results.append(member) + return results +def categoryAllPageObjects(CatName): + final = list() + for element in categoryAllElements(CatName): + final.append(wikipedia.Page(wikipedia.getSite(), element['title'])) + return final + # Here there is the main class. class main: def __init__(self, site, logFulNumber = 25000, sendemailActive = False, - duplicatesReport = False, smartdetection = False): + duplicatesReport = False): """ Constructor, define some global variable """ self.site = site self.logFulNumber = logFulNumber @@ -513,7 +570,7 @@ self.rep_page = wikipedia.translate(self.site, report_page) self.rep_text = wikipedia.translate(self.site, report_text) self.com = wikipedia.translate(self.site, comm10) - self.hiddentemplate = wikipedia.translate(self.site, HiddenTemplate) + self.hiddentemplates = wikipedia.translate(self.site, HiddenTemplate) self.pageHidden = wikipedia.translate(self.site, PageWithHiddenTemplates) self.pageAllowed = wikipedia.translate(self.site, PageWithAllowedTemplates) # Commento = Summary in italian @@ -533,9 +590,7 @@ image_n = self.site.image_namespace() self.image_namespace = u"%s:" % image_n # Example: "Image:" # Load the licenses only once, so do it once - self.smartdetection = smartdetection - if self.smartdetection: - self.list_licenses = self.load_licenses() + self.list_licenses = self.load_licenses() def setParameters(self, imageName, timestamp, uploader): """ Function to set parameters, now only image but maybe it can be used for others in "future" """ self.imageName = imageName @@ -736,17 +791,18 @@ """ Function to load the white templates """ # A template as {{en is not a license! Adding also them in the whitelist template... for langK in wikipedia.Family(u'wikipedia').langs.keys(): - self.hiddentemplate.append(u'%s' % langK) + self.hiddentemplates.append(wikipedia.Page(self.site, u'Template:%s' % langK)) # The template #if: and #switch: aren't something to care about - self.hiddentemplate.extend([u'#if:', u'#switch:']) + #self.hiddentemplates.extend([u'#if:', u'#switch:']) FIXME # Hidden template loading if self.pageHidden != None: try: pageHiddenText = wikipedia.Page(self.site, self.pageHidden).get() except (wikipedia.NoPage, wikipedia.IsRedirectPage): pageHiddenText = '' - self.hiddentemplate.extend(self.load(pageHiddenText)) - return self.hiddentemplate + for element in self.load(pageHiddenText): + self.hiddentemplates.append(wikipedia.Page(self.site, element)) + return self.hiddentemplates
def returnOlderTime(self, listGiven, timeListGiven): """ Get some time and return the oldest of them """ @@ -1029,6 +1085,7 @@
def load_licenses(self): """ Load the list of the licenses """ + """ catName = wikipedia.translate(self.site, category_with_licenses) cat = catlib.Category(wikipedia.getSite(), catName) categories = [page.title() for page in pagegenerators.SubCategoriesPageGenerator(cat)] @@ -1040,6 +1097,10 @@ gen = pagegenerators.CategorizedPageGenerator(cat) pages = [page for page in gen] list_licenses.extend(pages) + """ + catName = wikipedia.translate(self.site, category_with_licenses) + wikipedia.output(u'\n\t...Loading the licenses allowed...\n') + list_licenses = categoryAllPageObjects(catName)
# Add the licenses set in the default page as licenses # to check @@ -1049,95 +1110,57 @@ except (wikipedia.NoPage, wikipedia.IsRedirectPage): pageAllowedText = '' for nameLicense in self.load(pageAllowedText): - if not 'template:' in nameLicense.lower(): - nameLicense = u'Template:%s' % nameLicense pageLicense = wikipedia.Page(self.site, nameLicense) if pageLicense not in list_licenses: list_licenses.append(pageLicense) # the list has wiki-pages return list_licenses
- def giveMeTheTemplate(self, license_selected): - """ From the name of a template see if it's template:something or just - an inclusion of another namespace != template. If it's a redirect - gets the real page, if there's a NoPage, return None. - """ - #print template.exists() - template = wikipedia.Page(self.site, u'Template:%s' % license_selected) - try: - template.pageAPInfo() - except wikipedia.NoPage: - try: - template = wikipedia.Page(self.site, license_selected) - template.pageAPInfo() - except (wikipedia.NoPage, wikipedia.IsRedirectPage): - return None # break and exit - except wikipedia.IsRedirectPage: - template = template.getRedirectTarget() - return template - - def smartDetection(self, image_text): + def smartDetection(self): """ The bot instead of checking if there's a simple template in the image's description, checks also if that template is a license or something else. In this sense this type of check is smart. """ seems_ok = False license_found = None - regex_find_licenses = re.compile(r'{{(?:[Tt]emplate:|)(.*?)(?:[|\n<].*?|)}}', re.DOTALL) - licenses_found = regex_find_licenses.findall(image_text) - second_round = False - - exit_cicle = False # howTo exit from both the for and the while cicle - while 1: - if exit_cicle: # howTo exit from the while - break - if licenses_found != []: - for license_selected in licenses_found: - # put the first, if there is problem, this will be reported in the log - if license_found == None: - license_found = license_selected + self.hiddentemplates = self.loadHiddenTemplates() + self.licenses_found = self.image.getTemplates() + whiteTemplatesFound = False + regex_find_licenses = re.compile(r'(?<!{){{(?:[Tt]emplate:|)([^{]*?)[|\n<}]', re.DOTALL) + templatesInTheImageRaw = regex_find_licenses.findall(self.imageCheckText) + allLicenses = list() + # Found the templates ONLY in the image's description + for template_selected in templatesInTheImageRaw: + for templateReal in self.licenses_found: + if self.convert_to_url(template_selected).lower().replace('template:', '') == \ + self.convert_to_url(templateReal.title().lower().replace('template:', '')): + allLicenses.append(templateReal) + if self.licenses_found != []: + for template in self.licenses_found: + license_selected = template.title().replace('Template:', '') + if template in self.list_licenses: # the list_licenses are loaded in the __init__ (not to load them multimple times) + seems_ok = True + license_found = license_selected # let the last "fake" license normally detected + break + if template in self.hiddentemplates: + # if the whitetemplate is not in the images description, we don't care try: - template = self.giveMeTheTemplate(license_selected) - if template == None: - continue - except wikipedia.BadTitle: - # Template with wrong name, no need to report, simply skip + allLicenses.remove(template) + except ValueError: continue - if template in self.list_licenses: # the list_licenses are loaded in the __init__ (not to load them multimple times) - seems_ok = True - exit_cicle = True - license_found = license_selected # let the last "fake" license normally detected - break - # previous block was unsuccessful? Try with the next one - for license_selected in licenses_found: - try: - template = self.giveMeTheTemplate(license_selected) - if template == None: - continue # ok, this template it's not ok, continue.. - except wikipedia.BadTitle: - # Template with wrong name, no need to report, simply skip - continue - try: - template_text = template.get() - except wikipedia.NoPage: - continue # ok, this template it's not ok, continue.. - regex_noinclude = re.compile(r'<noinclude>(.*?)</noinclude>', re.DOTALL) - template_text = regex_noinclude.sub('', template_text) - if second_round == False: - licenses_found = regex_find_licenses.findall(template_text) - second_round = True - break # only exit from the for, not from the while else: - exit_cicle = True - break - if not seems_ok: + whiteTemplatesFound = True + continue + if license_found == None and allLicenses != list(): + license_found = license_selected + if not seems_ok and license_found != None: rep_text_license_fake = u"\n*[[:Image:%s]] seems to have a ''fake license'', license detected: <nowiki>%s</nowiki>" % (self.imageName, license_found) regexFakeLicense = r"* ?[[:Image:%s]] seems to have a ''fake license'', license detected: <nowiki>%s</nowiki>$" % (self.imageName, license_found) printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (self.imageName, license_found)) self.report_image(self.imageName, rep_text = rep_text_license_fake, addings = False, regex = regexFakeLicense) - else: + elif license_found != None: printWithTimeZone(u"%s seems ok, license found: %s..." % (self.imageName, license_found)) - return license_found + return (license_found, whiteTemplatesFound)
def load(self, raw): """ Load a list of object from a string using regex. """ @@ -1252,31 +1275,8 @@ return True elif i.lower() in self.imageCheckText: return True - return False # Nothing Found? Ok: False + return False # Nothing Found? Ok: False
- def whiteTemplateEraser(self): - """ Erase the white template from the checking text and return how many have been found. """ - # Load the white templates(hidden template is the same as white template, regarding the meaning) - white_templates_found = 0 - hiddentemplate = self.loadHiddenTemplates() - for regexWhiteLicense in hiddentemplate: - fullRegexWL = r'{{(?:template:|)(?:%s[ \n]*?(?:\n|||}|<)|creator:)' % regexWhiteLicense.lower() - if self.tagged == False: - # why creator? Because on commons there's a template such as {{creator:name}} that.. works - res = re.findall(fullRegexWL, self.imageCheckText.lower()) - if res != []: - for element in res: # if a regex gives more than 1 results, are more than 1 template found. - white_templates_found += 1 - if regexWhiteLicense != '' and regexWhiteLicense != ' ': # Check that regexWhiteLicense is not nothing or a space - # Deleting! (replace the template with nothing) - regex_white_template = re.compile(fullRegexWL, re.IGNORECASE) - self.imageCheckText = regex_white_template.sub(r'', self.imageCheckText) - if white_templates_found == 1: - wikipedia.output(u'A white template found, skipping the template...') - elif white_templates_found > 1: - wikipedia.output(u'White templates found: %s; skipping those templates...' % white_templates_found) - return white_templates_found - def findAdditionalProblems(self): # In every tupla there's a setting configuration for tupla in self.settingsData: @@ -1322,7 +1322,7 @@ self.mex_used = mexCatched continue
- def checkStep(self, smartdetection): + def checkStep(self): # nothing = Defining an empty image description nothing = ['', ' ', ' ', ' ', '\n', '\n ', '\n ', '\n\n', '\n \n', ' \n', ' \n ', ' \n \n'] # something = Minimal requirements for an image description. @@ -1369,11 +1369,10 @@ # Deleting the useless template from the description (before adding something # in the image the original text will be reloaded, don't worry). self.tagged = self.isTagged() - white_templates_found = self.whiteTemplateEraser() - if white_templates_found != 0: - hiddenTemplateFound = True - else: - hiddenTemplateFound = False + if self.tagged == True: + # Tagged? Yes, skip. + printWithTimeZone(u'%s is already tagged...' % self.imageName) + return True for a_word in something: # something is the array with {{, MIT License and so on. if a_word in self.imageCheckText: # There's a template, probably a license (or I hope so) @@ -1382,6 +1381,7 @@ for parl in notallowed: if parl.lower() in extension.lower(): delete = True + (license_found, hiddenTemplateFound) = self.smartDetection() self.some_problem = False # If it has "some_problem" it must check # the additional settings. # if self.settingsData, use addictional settings @@ -1392,10 +1392,6 @@ #if p.exists(): <-- improve thebot, better to make as # less call to the server as possible # Here begins the check block. - if self.tagged == True: - # Tagged? Yes, skip. - printWithTimeZone(u'%s is already tagged...' % self.imageName) - return True if self.some_problem == True: if self.mex_used in self.imageCheckText: wikipedia.output(u'Image already fixed. Skip.') @@ -1414,13 +1410,8 @@ wikipedia.output(u"Skipping the image...") self.some_problem = False return True - elif brackets == True: + elif brackets == True and license_found != None: seems_ok = False - license_found = None - if smartdetection: - license_found = self.smartDetection(self.imageCheckText) - else: - printWithTimeZone(u"%s seems ok..." % self.imageName) # It works also without this... but i want only to be sure ^^ brackets = False return True @@ -1469,7 +1460,6 @@ duplicatesActive = False # Use the duplicate option duplicatesReport = False # Use the duplicate-report option sendemailActive = False # Use the send-email - smartdetection = False # Enable the smart detection
# Here below there are the parameters. for arg in wikipedia.handleArgs(): @@ -1497,8 +1487,6 @@ duplicatesReport = True elif arg == '-sendemail': sendemailActive = True - elif arg == '-smartdetection': - smartdetection = True elif arg.startswith('-skip'): if len(arg) == 5: skip = True @@ -1597,7 +1585,7 @@ # Main Loop while 1: # Defing the Main Class. - mainClass = main(site, sendemailActive = sendemailActive, duplicatesReport = duplicatesReport, smartdetection = smartdetection) + mainClass = main(site, sendemailActive = sendemailActive, duplicatesReport = duplicatesReport) # Untagged is True? Let's take that generator if untagged == True: generator = mainClass.untaggedGenerator(projectUntagged, limit) @@ -1674,7 +1662,7 @@ response2 = mainClass.checkImageDuplicated(duplicates_rollback) if response2 == False: continue - resultCheck = mainClass.checkStep(smartdetection) + resultCheck = mainClass.checkStep() if resultCheck: continue # A little block to perform the repeat or to break. @@ -1694,5 +1682,5 @@ final = datetime.datetime.strptime(str(datetime.datetime.utcnow()).split('.')[0], "%Y-%m-%d %H:%M:%S") #timezones are UTC delta = final - old secs_of_diff = delta.seconds - print "seconds: %s" % secs_of_diff + wikipedia.output("Execution time: %s" % secs_of_diff) wikipedia.stopme()
pywikipedia-l@lists.wikimedia.org