Revision: 5985 Author: filnik Date: 2008-10-17 17:54:33 +0000 (Fri, 17 Oct 2008)
Log Message: ----------- Some rewrite, skip -> in a function
Modified Paths: -------------- trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py =================================================================== --- trunk/pywikipedia/checkimages.py 2008-10-16 20:47:02 UTC (rev 5984) +++ trunk/pywikipedia/checkimages.py 2008-10-17 17:54:33 UTC (rev 5985) @@ -518,16 +518,19 @@ botolist.append(botnick) self.botolist = botolist self.sendemailActive = sendemailActive + self.skip_list = list() # Inizialize the skip list used below self.duplicatesReport = duplicatesReport image_n = self.site.image_namespace() - self.image_namespace = "%s:" % image_n # Example: "User_talk:" + self.image_namespace = "%s:" % image_n # Example: "Image:" # Load the licenses only once, so do it once self.smartdetection = smartdetection if self.smartdetection: self.list_licenses = self.load_licenses() - def setParameters(self, image): + def setParameters(self, imageName): """ Function to set parameters, now only image but maybe it can be used for others in "future" """ - self.image = image + self.imageName = imageName + # Defing the image's Page Object + self.image = wikipedia.ImagePage(self.site, '%s%s' % (self.image_namespace, self.imageName)) def report(self, newtext, image_to_report, notification = None, head = None, notification2 = None, unver = True, commTalk = None, commImage = None): """ Function to make the reports easier (or I hope so). """ @@ -590,23 +593,22 @@ def tag_image(self, put = True): """ Function to add the template in the image and to find out who's the user that has uploaded the image. """ - # Defing the image's Page Object - p = wikipedia.ImagePage(self.site, 'Image:%s' % self.image_to_report) # Get the image's description + reportPageObject = wikipedia.ImagePage(self.site, self.image_namespace + self.image_to_report) try: - testoa = p.get() + reportPageText = reportPageObject.get() except wikipedia.NoPage: - wikipedia.output(u'%s has been deleted...' % p.title()) + wikipedia.output(u'%s has been deleted...' % self.imageName) # We have a problem! Report and exit! return False # You can use this function also to find only the user that # has upload the image (FixME: Rewrite a bit this part) if put: - p.put(testoa + self.newtext, comment = self.commImage, minorEdit = True) + reportPageObject.put(reportPageText + self.newtext, comment = self.commImage, minorEdit = True) # paginetta it's the image page object. - paginetta = wikipedia.ImagePage(self.site, self.image_namespace + self.image_to_report) + try: - nick = paginetta.getLatestUploader()[0] + nick = reportPageObject.getLatestUploader()[0] except wikipedia.NoPage: wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image_to_report) repme = "\n*[[:Image:%s]] problems '''with the APIs'''" @@ -614,9 +616,7 @@ self.report_image(self.image_to_report, self.rep_page, self.com, repme) return False luser = wikipedia.url2link(nick, self.site, self.site) - pagina_discussione = "%s:%s" % (self.site.namespace(3), luser) - # Defing the talk page (pagina_discussione = talk_page ^__^ ) - talk_page = wikipedia.Page(self.site, pagina_discussione) + talk_page = wikipedia.Page(self.site, "%s:%s" % (self.site.namespace(3), luser)) self.talk_page = talk_page self.luser = luser return True @@ -787,34 +787,34 @@
def checkImageOnCommons(self): """ Checking if the image is on commons """ - wikipedia.output(u'Checking if %s is on commons...' % self.image) + wikipedia.output(u'Checking if %s is on commons...' % self.imageName) commons_site = wikipedia.getSite('commons', 'commons') - regexOnCommons = r"\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:.*?]](?: (same name)|)$" % self.image - imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image) + regexOnCommons = r"\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:.*?]](?: (same name)|)$" % self.imageName + imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.imageName) hash_found = imagePage.getHash() if hash_found == None: return False # Problems? Yes! Image deleted, no hash found. Skip the image. else: commons_image_with_this_hash = commons_site.getImagesFromAnHash(hash_found) if commons_image_with_this_hash != []: - wikipedia.output(u'%s is on commons!' % self.image) - imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image) + wikipedia.output(u'%s is on commons!' % self.imageName) + imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.imageName) on_commons_text = imagePage.getImagePageHtml() if "<div class='sharedUploadNotice'>" in on_commons_text: wikipedia.output(u"But, the image doesn't exist on your project! Skip...") # Problems? Yes! We have to skip the check part for that image! # Because it's on commons but someone has added something on your project. return False - elif re.findall(r'\bstemma\b', self.image.lower()) != [] and self.site.lang == 'it': - wikipedia.output(u'%s has "stemma" inside, means that it's ok.' % self.image) + elif re.findall(r'\bstemma\b', self.imageName.lower()) != [] and self.site.lang == 'it': + wikipedia.output(u'%s has "stemma" inside, means that it's ok.' % self.imageName) return True # Problems? No, it's only not on commons but the image needs a check else: # the second usually is a url or something like that. Compare the two in equal way, both url. - if self.convert_to_url(self.image) == self.convert_to_url(commons_image_with_this_hash[0]): - repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]] (same name)" % (self.image, commons_image_with_this_hash[0]) + if self.convert_to_url(self.imageName) == self.convert_to_url(commons_image_with_this_hash[0]): + repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]] (same name)" % (self.imageName, commons_image_with_this_hash[0]) else: - repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]" % (self.image, commons_image_with_this_hash[0]) - self.report_image(self.image, self.rep_page, self.com, repme, addings = False, regex = regexOnCommons) + repme = "\n*[[:Image:%s]] is also on '''Commons''': [[commons:Image:%s]]" % (self.imageName, commons_image_with_this_hash[0]) + self.report_image(self.imageName, self.rep_page, self.com, repme, addings = False, regex = regexOnCommons) # Problems? No, return True return True else: @@ -825,7 +825,7 @@ """ Function to check the duplicated images. """ # {{Dupe|Image:Blanche_Montel.jpg}} # Skip the stub images - #if 'stub' in self.image.lower() and self.project == 'wikipedia' and self.site.lang == 'it': + #if 'stub' in self.imageName.lower() and self.project == 'wikipedia' and self.site.lang == 'it': # return True # Skip the stub, ok dupText = wikipedia.translate(self.site, duplicatesText) dupRegex = wikipedia.translate(self.site, duplicatesRegex) @@ -833,17 +833,17 @@ dupTalkText = wikipedia.translate(self.site, duplicates_user_talk_text) dupComment_talk = wikipedia.translate(self.site, duplicates_comment_talk) dupComment_image = wikipedia.translate(self.site, duplicates_comment_image) - duplicateRegex = r'\n*(?:[[:Image:%s]] has the following duplicates(?: ('''forced mode''')|):|*[[:Image:%s]])$' % (self.convert_to_url(self.image), self.convert_to_url(self.image)) - imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image) + duplicateRegex = r'\n*(?:[[:Image:%s]] has the following duplicates(?: ('''forced mode''')|):|*[[:Image:%s]])$' % (self.convert_to_url(self.imageName), self.convert_to_url(self.imageName)) + imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.imageName) hash_found = imagePage.getHash() duplicates = self.site.getImagesFromAnHash(hash_found) if duplicates == None: return False # Error, image deleted, no hash found. Skip the image. if len(duplicates) > 1: if len(duplicates) == 2: - wikipedia.output(u'%s has a duplicate! Reporting it...' % self.image) + wikipedia.output(u'%s has a duplicate! Reporting it...' % self.imageName) else: - wikipedia.output(u'%s has %s duplicates! Reporting them...' % (self.image, len(duplicates) - 1)) + wikipedia.output(u'%s has %s duplicates! Reporting them...' % (self.imageName, len(duplicates) - 1)) if not dupText == None and not dupRegex == None: time_image_list = list() time_list = list() @@ -919,17 +919,17 @@ commImage = dupComment_image, unver = True) if self.duplicatesReport or only_report: if only_report: - repme = "\n*[[:Image:%s]] has the following duplicates ('''forced mode'''):" % self.convert_to_url(self.image) + repme = "\n*[[:Image:%s]] has the following duplicates ('''forced mode'''):" % self.convert_to_url(self.imageName) else: - repme = "\n*[[:Image:%s]] has the following duplicates:" % self.convert_to_url(self.image) + repme = "\n*[[:Image:%s]] has the following duplicates:" % self.convert_to_url(self.imageName) for duplicate in duplicates: - if self.convert_to_url(duplicate) == self.convert_to_url(self.image): + if self.convert_to_url(duplicate) == self.convert_to_url(self.imageName): continue # the image itself, not report also this as duplicate repme += "\n**[[:Image:%s]]" % self.convert_to_url(duplicate) - result = self.report_image(self.image, self.rep_page, self.com, repme, addings = False, regex = duplicateRegex) + result = self.report_image(self.imageName, self.rep_page, self.com, repme, addings = False, regex = duplicateRegex) if not result: return True # If Errors, exit (but continue the check) - if older_image != self.image: + if older_image != self.imageName: return False # The image is a duplicate, it will be deleted. So skip the check-part, useless return True # Ok - No problem. Let's continue the checking phase
@@ -1091,13 +1091,13 @@ exit_cicle = True break if not seems_ok: - rep_text_license_fake = "\n*[[:Image:%s]] seems to have a ''fake license'', license detected: {{tl|%s}}." % (self.image, license_found) - regexFakeLicense = r"* ?[[:Image:%s]] seems to have a ''fake license'', license detected: {{tl|%s}}.$" % (self.image, license_found) - printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (self.image, license_found)) - self.report_image(self.image, rep_text = rep_text_license_fake, + rep_text_license_fake = "\n*[[:Image:%s]] seems to have a ''fake license'', license detected: {{tl|%s}}." % (self.imageName, license_found) + regexFakeLicense = r"* ?[[:Image:%s]] seems to have a ''fake license'', license detected: {{tl|%s}}.$" % (self.imageName, license_found) + printWithTimeZone(u"%s seems to have a fake license: %s, reporting..." % (self.imageName, license_found)) + self.report_image(self.imageName, rep_text = rep_text_license_fake, addings = False, regex = regexFakeLicense) else: - printWithTimeZone(u"%s seems ok, license found: %s..." % (self.image, license_found)) + printWithTimeZone(u"%s seems ok, license found: %s..." % (self.imageName, license_found)) return license_found
def load(self, raw): @@ -1115,6 +1115,29 @@ list_loaded.append(word) return list_loaded
+ def skipImages(self, skip_number, limit): + # If the images to skip are more the images to check, make them the same number + if skip_number == 0: + wikipedia.output(u'\t\t>> No images to skip...<<') + return False + if skip_number > limit: skip_number = limit + # Print a starting message only if no images has been skipped + if self.skip_list == []: + if skip_number == 1: + wikipedia.output(u'Skipping the first image:\n') + else: + wikipedia.output(u'Skipping the first %s images:\n' % skip_number) + # If we still have pages to skip: + if len(self.skip_list) < skip_number: + wikipedia.output(u'Skipping %s...' % self.imageName) + self.skip_list.append(self.imageName) + if skip_number == 1: + wikipedia.output('') + return True + else: + wikipedia.output('') # Print a blank line. + return False + def checkbot(): """ Main function """ # Command line configurable parameters @@ -1127,8 +1150,7 @@ normal = False # Check the new images or use another generator? urlUsed = False # Use the url-related function instead of the new-pages generator regexGen = False # Use the regex generator - untagged = False # Use the untagged generator - skip_list = list() # Inizialize the skip list used below + untagged = False # Use the untagged generator duplicatesActive = False # Use the duplicate option duplicatesReport = False # Use the duplicate-report option sendemailActive = False # Use the send-email @@ -1181,7 +1203,7 @@ firstPageTitle = str(wikipedia.input(u'From witch page do you want to start?')) elif len(arg) > 6: firstPageTitle = str(arg[7:]) - generator = wikipedia.getSite().allpages(start=firstPageTitle ,namespace=6) + generator = wikipedia.getSite().allpages(start=firstPageTitle, namespace=6) repeat = False elif arg.startswith('-page'): if len(arg) == 5: @@ -1355,38 +1377,21 @@ mainClass.setParameters(imageName) # Setting the image for the main class # Skip block if skip == True: - # If the images to skip are more the images to check, make them the same number - if skip_number > limit: skip_number = limit - # Print a starting message only if no images has been skipped - if skip_list == []: - if skip_number == 1: - wikipedia.output(u'Skipping the first image:\n') - else: - wikipedia.output(u'Skipping the first %s images:\n' % skip_number) - # If we still have pages to skip: - if len(skip_list) < skip_number: - wikipedia.output(u'Skipping %s...' % imageName) - skip_list.append(imageName) - if skip_number == 1: - wikipedia.output('') - skip = False - continue - else: - wikipedia.output('') # Print a blank line. - skip = False - elif skip_list == []: # Skip must be false if we are here but - # the user has set 0 as images to skip - wikipedia.output(u'\t\t>> No images to skip...<<') - skip_list.append('skip = Off') # Only to print it once + skip = mainClass.skipImages(skip_number, limit) + if skip == True: + continue parentesi = False # parentesi are these in italian: { ( ) } [] delete = False tagged = False extension = imageName.split('.')[-1] # get the extension from the image's name # Page => ImagePage p = wikipedia.ImagePage(site, image.title()) - # Get the text in the image (called g) + # Get the text in the image (called imageCheckText) try: - g = p.get() + # the checkText will be modified in order to make the check phase easier + # the imageFullText will be used when the full text is needed without changes + imageCheckText = p.get() + imageFullText = imageCheckText except wikipedia.NoPage: wikipedia.output(u"Skipping %s because it has been deleted." % imageName) continue @@ -1396,7 +1401,7 @@ # Delete the fields where the templates cannot be loaded regex_nowiki = re.compile(r'<nowiki>(.*?)</nowiki>', re.DOTALL) regex_pre = re.compile(r'<pre>(.*?)</pre>', re.DOTALL) - g = regex_nowiki.sub('', g); g = regex_pre.sub('', g) + imageCheckText = regex_nowiki.sub('', imageCheckText); imageCheckText = regex_pre.sub('', imageCheckText) # Check on commons if there's already an image with the same name if commonsActive == True: response = mainClass.checkImageOnCommons() @@ -1413,10 +1418,10 @@ # and the regex will be wrong) if '{{' in i: regexP = re.compile('{{(?:template|)%s ?(?:||\n|}|<) ?' % i.split('{{')[1].replace(' ', '[ _]'), re.I) - result = regexP.findall(g) + result = regexP.findall(imageCheckText) if result != []: tagged = True - elif i.lower() in g: + elif i.lower() in imageCheckText: tagged = True # Deleting the useless template from the description (before adding something # in the image the original text will be reloaded, don't worry). @@ -1425,13 +1430,13 @@ for l in hiddentemplate: if tagged == False: # why creator? Because on commons there's a template such as {{creator:name}} that.. works - res = re.findall(r'{{(?:[Tt]emplate:|)(?:%s[ \n]*?(?:\n|||}|<)|creator:)' % l.lower(), g.lower()) + res = re.findall(r'{{(?:[Tt]emplate:|)(?:%s[ \n]*?(?:\n|||}|<)|creator:)' % l.lower(), imageCheckText.lower()) if res != []: white_template_found += 1 if l != '' and l != ' ': # Check that l is not nothing or a space # Deleting! (replace the template with nothing) regex_white_template = re.compile(r'{{(?:template:|)(?:%s|creator)' % l, re.IGNORECASE) - g = regex_white_template.sub(r'', g) + imageCheckText = regex_white_template.sub(r'', imageCheckText) hiddenTemplateFound = True if white_template_found == 1: wikipedia.output(u'A white template found, skipping the template...') @@ -1440,7 +1445,7 @@ else: wikipedia.output(u'White templates found: %s; skipping those templates...' % white_template_found) for a_word in something: # something is the array with {{, MIT License and so on. - if a_word in g: + if a_word in imageCheckText: # There's a template, probably a license (or I hope so) parentesi = True # Is the extension allowed? (is it an image or f.e. a .xls file?) @@ -1474,7 +1479,7 @@ wikipedia.setAction(summary) for k in find_list: if find_tipe.lower() == 'findonly': - if k.lower() == g.lower(): + if k.lower() == imageCheckText.lower(): some_problem = True text_used = text head_used = head_2 @@ -1484,7 +1489,7 @@ mex_used = mexCatched break elif find_tipe.lower() == 'find': - if k.lower() in g.lower(): + if k.lower() in imageCheckText.lower(): some_problem = True text_used = text head_used = head_2 @@ -1503,7 +1508,7 @@ printWithTimeZone(u'%s is already tagged...' % imageName) continue if some_problem == True: - if mex_used in g: + if mex_used in imageCheckText: wikipedia.output(u'Image already fixed. Skip.') continue wikipedia.output(u"The image description for %s contains %s..." % (imageName, name_used)) @@ -1524,7 +1529,7 @@ seems_ok = False license_found = None if smartdetection: - license_found = mainClass.smartDetection(g) + license_found = mainClass.smartDetection(imageCheckText) else: printWithTimeZone(u"%s seems ok..." % imageName) # It works also without this... but i want only to be sure ^^ @@ -1540,7 +1545,7 @@ mainClass.report(canctext, imageName, notification, head) delete = False continue - elif g in nothing: + elif imageCheckText in nothing: wikipedia.output(u"The image description for %s does not contain a license template!" % imageName) if hiddenTemplateFound and HiddenTN != None and HiddenTN != '' and HiddenTN != ' ': notification = HiddenTN % imageName