Revision: 7190
Author:   multichill
Date:     2009-08-31 15:09:31 +0000 (Mon, 31 Aug 2009)
Log Message:
-----------
* Skipping dupes now
* Rearranged some code
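For context, the duplicate check added in this revision boils down to: download the photo once, take the SHA-1 of the raw bytes, and ask the MediaWiki API whether Commons already stores a file with that hash (list=allimages with aisha1). A minimal standalone sketch of the same idea, using plain urllib/json against the public Commons API instead of pywikipedia's query module (the helper name and endpoint handling below are illustrative, not part of this commit):

    import hashlib, json, urllib

    COMMONS_API = 'http://commons.wikimedia.org/w/api.php'  # assumed endpoint, not from the commit

    def find_duplicates_by_sha1(image_bytes):
        # MediaWiki indexes files by the SHA-1 of their contents, so an
        # exact byte-for-byte duplicate shows up as a hit for this query.
        sha1 = hashlib.sha1(image_bytes).hexdigest()
        params = urllib.urlencode({
            'action': 'query',
            'list':   'allimages',
            'aisha1': sha1,
            'format': 'json',
        })
        data = json.loads(urllib.urlopen(COMMONS_API + '?' + params).read())
        return [image['name'] for image in data['query']['allimages']]

The commit itself routes the same action/list/aisha1 parameters through query.GetData() and builds the hash with hashlib plus base64.b16encode(), as shown in the diff below.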
Modified Paths:
--------------
    trunk/pywikipedia/flickrripper.py
Modified: trunk/pywikipedia/flickrripper.py
===================================================================
--- trunk/pywikipedia/flickrripper.py	2009-08-31 11:00:07 UTC (rev 7189)
+++ trunk/pywikipedia/flickrripper.py	2009-08-31 15:09:31 UTC (rev 7190)
@@ -31,7 +31,7 @@
 #
 __version__ = '$Id$'
-import sys, urllib, re, StringIO
+import sys, urllib, re, StringIO, hashlib, base64
 import wikipedia, config, query, imagerecat, upload
 import flickrapi # see: http://stuvel.eu/projects/flickrapi
@@ -61,6 +61,37 @@
         #We don't accept other licenses
         return False
+def getPhotoUrl(photoSizes=None):
+    '''
+    Get the url of the jpg file with the highest resolution
+    '''
+    url = ''
+    # The assumption is that the largest image is last
+    for size in photoSizes.find('sizes').findall('size'):
+        url = size.attrib['source']
+    return url
+
+def downloadPhoto(photoUrl=''):
+    imageFile=urllib.urlopen(photoUrl).read()
+    return StringIO.StringIO(imageFile)
+
+def findDuplicateImages(photo=None, site=wikipedia.getSite()):
+    result = []
+    hashObject = hashlib.sha1()
+    hashObject.update(photo.getvalue())
+    sha1Hash = base64.b16encode(hashObject.digest())
+
+    params = {
+        'action' : 'query',
+        'list'   : 'allimages',
+        'aisha1' : sha1Hash,
+        'aiprop' : '',
+    }
+    data = query.GetData(params, wikipedia.getSite(), useAPI = True, encodeTitle = False)
+    for image in data['query']['allimages']:
+        result.append(image['name'])
+    return result
+
 def getTags(photoInfo = None):
     '''
     Get all the tags on a photo
@@ -116,17 +147,8 @@
     title = title.replace(" ", "_")
     return title
+
-def getPhotoUrl(photoSizes=None):
-    '''
-    Get the url of the jpg file with the highest resolution
-    '''
-    url = ''
-    # The assumption is that the largest image is last
-    for size in photoSizes.find('sizes').findall('size'):
-        url = size.attrib['source']
-    return url
-
 def buildDescription(flinfoDescription=u'', flickrreview=False, reviewer=u'', override=u''):
     '''
     Build the final description for the image. The description is based on the info from flickrinfo and improved.
@@ -143,57 +165,28 @@
         if(reviewer):
             description = description.replace(u'{{flickrreview}}', u'{{flickrreview|' + reviewer + '|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}')
     description = description.replace(u'\r\n', u'\n')
-    return description
+    return description
-def getPhotos(flickr=None, user_id=u'', group_id=u'', photoset_id=u'', tags=u''):
-    result = []
-    # http://www.flickr.com/services/api/flickr.groups.pools.getPhotos.html
-    if(group_id):
-        #First get the total number of photo's in the group
-        photos = flickr.groups_pools_getPhotos(group_id=group_id, user_id=user_id, tags=tags, per_page='100', page='1')
-        pages = photos.find('photos').attrib['pages']
-
-        for i in range(1, int(pages)):
-            for photo in flickr.groups_pools_getPhotos(group_id=group_id, user_id=user_id, tags=tags, per_page='100', page=i).find('photos').getchildren():
-                yield photo.attrib['id']
-
-    # http://www.flickr.com/services/api/flickr.photosets.getPhotos.html
-    elif(photoset_id):
-        photos = flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page='1')
-        pages = photos.find('photos').attrib['pages']
-
-        for i in range(1, int(pages)):
-            for photo in flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page=i).find('photos').getchildren():
-                yield photo.attrib['id']
-
-    # http://www.flickr.com/services/api/flickr.people.getPublicPhotos.html
-    elif(user_id):
-        photos = flickr.people_getPublicPhotos(user_id=user_id, per_page='100', page='1')
-        pages = photos.find('photos').attrib['pages']
-
-        for i in range(1, int(pages)):
-            for photo in flickr.people_getPublicPhotos(user_id=user_id, per_page='100', page=i).find('photos').getchildren():
-                yield photo.attrib['id']
-    return
-
-
 def processPhoto(flickr=None, photo_id=u'', flickrreview=False, reviewer=u'', override=u''):
     if(photo_id):
         print photo_id
     (photoInfo, photoSizes) = getPhoto(flickr=flickr, photo_id=photo_id)
     if (isAllowedLicense(photoInfo=photoInfo) or override):
-        # Tags not needed atm
-        #tags=getTags(photoInfo=photoInfo)
+        #Get the url of the largest photo
+        photoUrl = getPhotoUrl(photoSizes=photoSizes)
+        #Should download the photo only once
+        photo = downloadPhoto(photoUrl=photoUrl)
-        flinfoDescription = getFlinfoDescription(photo_id=photo_id)
-
-        filename = getFilename(photoInfo=photoInfo)
-        #print filename
-        photoUrl = getPhotoUrl(photoSizes=photoSizes)
-        #print photoUrl
-        photoDescription = buildDescription(flinfoDescription=flinfoDescription, flickrreview=flickrreview, reviewer=reviewer, override=override)
-        #wikipedia.output(photoDescription)
-        (newPhotoDescription, newFilename, skip)=Tkdialog(photoDescription, photoUrl, filename).run()
+        #Don't upload duplicate images, should add override option
+        duplicates = findDuplicateImages(photo=photo)
+        if duplicates:
+            wikipedia.output(u'Found duplicate image at %s' % duplicates.pop())
+        else:
+            filename = getFilename(photoInfo=photoInfo)
+            flinfoDescription = getFlinfoDescription(photo_id=photo_id)
+            photoDescription = buildDescription(flinfoDescription=flinfoDescription, flickrreview=flickrreview, reviewer=reviewer, override=override)
+            #wikipedia.output(photoDescription)
+            (newPhotoDescription, newFilename, skip)=Tkdialog(photoDescription, photo, filename).run()
         #wikipedia.output(newPhotoDescription)
         #if (wikipedia.Page(title=u'File:'+ filename, site=wikipedia.getSite()).exists()):
             # I should probably check if the hash is the same and if not upload it under a different name
@@ -202,13 +195,14 @@
             #Do the actual upload
             #Would be nice to check before I upload if the file is already at Commons
             #Not that important for this program, but maybe for derived programs
-        if not skip:
-            bot = upload.UploadRobot(url=photoUrl, description=newPhotoDescription, useFilename=newFilename, keepFilename=True, verifyDescription=False)
-            bot.upload_image(debug=False)
-            return 0
+            if not skip:
+                bot = upload.UploadRobot(url=photoUrl, description=newPhotoDescription, useFilename=newFilename, keepFilename=True, verifyDescription=False)
+                bot.upload_image(debug=False)
+                return 1
+    return 0
 class Tkdialog:
-    def __init__(self, photoDescription, photoUrl, filename):
+    def __init__(self, photoDescription, photo, filename):
         self.root=Tk()
         #"%dx%d%+d%+d" % (width, height, xoffset, yoffset)
         self.root.geometry("%ix%i+10-10"%(config.tkhorsize, config.tkvertsize))
@@ -216,13 +210,13 @@
         self.root.title(filename)
         self.photoDescription = photoDescription
         self.filename = filename
-        self.photoUrl = photoUrl
+        self.photo = photo
         self.skip=False
         self.exit=False
         ## Init of the widgets
         # The image
-        self.image=self.getImage(self.photoUrl, 800, 600)
+        self.image=self.getImage(self.photo, 800, 600)
         self.imagePanel=Label(self.root, image=self.image)
         self.imagePanel.image = self.image
@@ -262,12 +256,10 @@
         self.descriptionField.grid(row=14, column=1, columnspan=3)
         self.descriptionScrollbar.grid(row=14, column=5)
-    def getImage(self, url, width, height):
-        image=urllib.urlopen(url).read()
-        output = StringIO.StringIO(image)
-        image2 = Image.open(output)
-        image2.thumbnail((width, height))
-        imageTk = ImageTk.PhotoImage(image2)
+    def getImage(self, photo, width, height):
+        image = Image.open(photo)
+        image.thumbnail((width, height))
+        imageTk = ImageTk.PhotoImage(image)
         return imageTk
     def okFile(self):
@@ -292,7 +284,37 @@
         self.root.mainloop()
         return (self.photoDescription, self.filename, self.skip)
+def getPhotos(flickr=None, user_id=u'', group_id=u'', photoset_id=u'', tags=u''):
+    result = []
+    # http://www.flickr.com/services/api/flickr.groups.pools.getPhotos.html
+    if(group_id):
+        #First get the total number of photo's in the group
+        photos = flickr.groups_pools_getPhotos(group_id=group_id, user_id=user_id, tags=tags, per_page='100', page='1')
+        pages = photos.find('photos').attrib['pages']
+        for i in range(1, int(pages)):
+            for photo in flickr.groups_pools_getPhotos(group_id=group_id, user_id=user_id, tags=tags, per_page='100', page=i).find('photos').getchildren():
+                yield photo.attrib['id']
+
+    # http://www.flickr.com/services/api/flickr.photosets.getPhotos.html
+    elif(photoset_id):
+        photos = flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page='1')
+        pages = photos.find('photos').attrib['pages']
+
+        for i in range(1, int(pages)):
+            for photo in flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page=i).find('photos').getchildren():
+                yield photo.attrib['id']
+
+    # http://www.flickr.com/services/api/flickr.people.getPublicPhotos.html
+    elif(user_id):
+        photos = flickr.people_getPublicPhotos(user_id=user_id, per_page='100', page='1')
+        pages = photos.find('photos').attrib['pages']
+
+        for i in range(1, int(pages)):
+            for photo in flickr.people_getPublicPhotos(user_id=user_id, per_page='100', page=i).find('photos').getchildren():
+                yield photo.attrib['id']
+    return
+
 def usage():
     wikipedia.output(u"Flickrripper is a tool to transfer flickr photos to Wikimedia Commons")
     wikipedia.output(u"-group_id:<group_id>\n")
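As an aside, the relocated getPhotos() generator pages through the Flickr API 100 photos at a time and yields photo ids. A minimal sketch of the same paging pattern for a single photoset, written directly against the flickrapi package (the helper name is illustrative; it walks pages 1 through pages inclusive, since Flickr page numbers are 1-based):

    import flickrapi  # see: http://stuvel.eu/projects/flickrapi

    def photoset_photo_ids(flickr, photoset_id):
        # First request only to learn how many pages of 100 photos exist.
        first = flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page='1')
        pages = int(first.find('photos').attrib['pages'])
        # Page 1 is fetched again inside the loop, mirroring the structure above.
        for page in range(1, pages + 1):
            photos = flickr.photosets_getPhotos(photoset_id=photoset_id, per_page='100', page=page)
            for photo in photos.find('photos').getchildren():
                yield photo.attrib['id']

A caller would construct flickrapi.FlickrAPI with its own API key and loop over the generator, much as main() in flickrripper.py does with getPhotos().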