Revision: 5634
Author: filnik
Date: 2008-06-26 14:26:46 +0000 (Thu, 26 Jun 2008)
Log Message:
-----------
Fixing the duplicate detector and some minor fixes
Modified Paths:
--------------
trunk/pywikipedia/checkimages.py
Modified: trunk/pywikipedia/checkimages.py
===================================================================
--- trunk/pywikipedia/checkimages.py 2008-06-26 14:26:14 UTC (rev 5633)
+++ trunk/pywikipedia/checkimages.py 2008-06-26 14:26:46 UTC (rev 5634)
@@ -80,7 +80,7 @@
#
import re, time, urllib, urllib2, os, locale, sys
-import wikipedia, config, pagegenerators, catlib
+import wikipedia, config, pagegenerators, catlib, query
locale.setlocale(locale.LC_ALL, '')
@@ -384,10 +384,6 @@
class NothingFound(wikipedia.Error):
""" An exception indicating that a regex has return [] instead of results."""
-class NoHash(wikipedia.Error):
- """ The APIs don't return any Hash for the image searched.
- Really Strange, better to raise an error. """
-
# Other common useful functions
def printWithTimeZone(message):
""" Function to print the messages followed by the TimeZone encoded correctly. """
@@ -400,23 +396,6 @@
time_zone = unicode(time.strftime(u"%d %b %Y %H:%M:%S (UTC)", time.gmtime()))
wikipedia.output(u"%s%s" % (message, time_zone))
-def returnOlderTime(listGiven, timeListGiven):
- """ Get some time and return the oldest of them """
- #print listGiven; print timeListGiven
- #Output:
- #[[1210596312.0, u'Autoritratto.png'], [1210590240.0, u'Duplicato.png'], [1210592052.0, u'Duplicato_2.png']]
- #[1210596312.0, 1210590240.0, 1210592052.0]
- for element in listGiven:
- time = element[0]
- imageName = element[1]
- not_the_oldest = False
- for time_selected in timeListGiven:
- if time > time_selected:
- not_the_oldest = True
- break
- if not_the_oldest == False:
- return imageName
-
class EmailSender(wikipedia.Page):
""" Class to send emails through the Wikipedia's dedicated page. """
def __init__(self, site, user):
@@ -424,6 +403,7 @@
self.user = user
page_special_name = u'Special:EmailUser'
self.page_special_name = page_special_name
+ # Special:EmailUser/Filnik
page = '%s/%s' % (self.page_special_name, self.user)
self.page = page
wikipedia.Page.__init__(self, site, page, None, 0)
@@ -563,7 +543,7 @@
# paginetta it's the image page object.
paginetta = wikipedia.ImagePage(self.site, self.image_namespace + self.image)
try:
- nick = paginetta.getLatestUploader()
+ nick = paginetta.getLatestUploader()[0]
except wikipedia.NoPage:
wikipedia.output(u"Seems that %s hasn't the image at all, but there is something in the description..." % self.image)
repme = "\n*[[:Image:%s]] problems '''with the APIs'''"
@@ -679,6 +659,44 @@
yield image
#continue
+    def returnOlderTime(self, listGiven, timeListGiven):
+        """ Get some time and return the oldest of them """
+        # Prefer the duplicate that is in use on the most pages; if none is
+        # used anywhere, fall back to the oldest upload. Example listGiven:
+        #[[1210596312.0, u'Autoritratto.png'], [1210590240.0, u'Duplicato.png'], [1210592052.0, u'Duplicato_2.png']]
+        #[1210596312.0, 1210590240.0, 1210592052.0]
+        num = 0
+        num_older = None
+        max_usage = 0
+        for element in listGiven:
+            imageName = element[1]
+            imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % imageName)
+            imageUsage = [page for page in imagePage.usingPages()]
+            # Compare usage *counts* (ints), not the page list itself.
+            if len(imageUsage) > max_usage:
+                max_usage = len(imageUsage)
+                num_older = num
+            num += 1
+        if num_older is not None:
+            return listGiven[num_older][1]
+        for element in listGiven:
+            upload_time = element[0]
+            imageName = element[1]
+            not_the_oldest = False
+            for time_selected in timeListGiven:
+                if upload_time > time_selected:
+                    not_the_oldest = True
+                    break
+            if not_the_oldest == False:
+                return imageName
+
+ def convert_to_url(self, page):
+ # Function stolen from wikipedia.py
+ """The name of the page this Page refers to, in a form suitable for the URL of the page."""
+ title = page.replace(" ", "_")
+ encodedTitle = title.encode(self.site.encoding())
+ return urllib.quote(encodedTitle)
+
def checkImageOnCommons(self, image):
""" Checking if the image is on commons """
self.image = image
@@ -706,13 +724,6 @@
# Problems? No, return True
return True
- def convert_to_url(self, page):
- # Function stolen from wikipedia.py
- """The name of the page this Page refers to, in a form suitable for the URL of the page."""
- title = page.replace(" ", "_")
- encodedTitle = title.encode(self.site.encoding())
- return urllib.quote(encodedTitle)
-
def checkImageDuplicated(self, image):
""" Function to check the duplicated images. """
# {{Dupe|Image:Blanche_Montel.jpg}}
@@ -722,23 +733,12 @@
dupTalkText = wikipedia.translate(self.site, duplicates_user_talk_text)
dupComment_talk = wikipedia.translate(self.site, duplicates_comment_talk)
dupComment_image = wikipedia.translate(self.site, duplicates_comment_image)
-
self.image = image
duplicateRegex = r'\n\*(?:\[\[:Image:%s\]\] has the following duplicates:|\*\[\[:Image:%s\]\])$' % (self.convert_to_url(self.image), self.convert_to_url(self.image))
- imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
- wikipedia.output(u'Checking if %s has duplicates...' % image)
- get_hash = self.site.getUrl(self.site.apipath() + '?action=query&format=xml&titles=Image:%s&prop=imageinfo&iiprop=sha1' % self.convert_to_url(self.image))
- hash_found_list = re.findall(r'<ii sha1="(.*?)" />', get_hash)
- if hash_found_list != []:
- hash_found = hash_found_list[0]
- else:
- if imagePage.exists():
- raise NoHash('No Hash found in the APIs! Maybe the regex to catch it is wrong or someone has changed the APIs structure.')
- else:
- wikipedia.output(u'Image deleted before getting the Hash. Skipping...')
- return False # Error, we need to skip the page.
- get_duplicates = self.site.getUrl(self.site.apipath() + '?action=query&format=xml&list=allimages&aisha1=%s' % hash_found)
- duplicates = re.findall(r'<img name="(.*?)".*?/>', get_duplicates)
+ imagePage = wikipedia.ImagePage(self.site, 'Image:%s' % self.image)
+ duplicates = imagePage.getDuplicates()
+ if duplicates == None:
+ return False # Error, we need to skip the page.
if len(duplicates) > 1:
if len(duplicates) == 2:
wikipedia.output(u'%s has a duplicate! Reporting it...' % self.image)
@@ -758,17 +758,13 @@
time_list = list()
for duplicate in duplicates:
DupePage = wikipedia.ImagePage(self.site, u'Image:%s' % duplicate)
- imagedata = DupePage.getFileVersionHistory()[-1][0]
- try:
- # Example: 21:15, 5 ott 2005
- data = time.strptime(imagedata, "%H:%M, %d %b %Y")
- except ValueError:
- # Example: 21:15, 5 Ottobre 2005
- data = time.strptime(imagedata, "%H:%M, %d %B %Y")
+ imagedata = DupePage.getLatestUploader()[1]
+ # '2008-06-18T08:04:29Z'
+ data = time.strptime(imagedata, "%Y-%m-%dT%H:%M:%SZ")
data_seconds = time.mktime(data)
time_image_list.append([data_seconds, duplicate])
time_list.append(data_seconds)
- older_image = returnOlderTime(time_image_list, time_list)
+ older_image = self.returnOlderTime(time_image_list, time_list)
# And if the images are more than two?
Page_oder_image = wikipedia.ImagePage(self.site, u'Image:%s' % older_image)
string = ''
@@ -895,6 +891,7 @@
list_loaded.append(word)
def checkbot():
+ """ Main function """
# Command line configurable parameters
repeat = True # Restart after having check all the images?
limit = 80 # How many images check?