Revision: 7169 Author: multichill Date: 2009-08-23 16:13:23 +0000 (Sun, 23 Aug 2009)
Log Message: ----------- First version of flickrripper. It works, but I still have to do a lot to get a nice program.
Added Paths: ----------- trunk/pywikipedia/flickrripper.py
Added: trunk/pywikipedia/flickrripper.py =================================================================== --- trunk/pywikipedia/flickrripper.py (rev 0) +++ trunk/pywikipedia/flickrripper.py 2009-08-23 16:13:23 UTC (rev 7169) @@ -0,0 +1,389 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +''' +Tool to copy a flickr stream to Commons + +# Get a set to work on (start with just a username). +# * Make it possible to delimit the set (from/to) +#For each image +#*Check the license +#*Check if it isn't already on Commons +#*Build suggested filename +#**Check for name collision and maybe alter it +#*Pull description from Flinfo +#*Show image and description to user +#**Add a nice hotcat lookalike for the adding of categories +#**Filter the categories +#*Upload the image + +Todo: +*Check if the image is already uploaded (SHA hash) +*Check and prevent filename collisions +**Initial suggestion +**User input +*Filter the categories + +''' +# +# (C) Multichill, 2009 +# +# Distributed under the terms of the MIT license. 
#
__version__ = '$Id: upload_to_commons.py 69 2009-08-23 11:44:26Z multichill $'

import sys
import urllib
import re
import StringIO

import wikipedia, config, query, imagerecat, upload

import flickrapi
import xml.etree.ElementTree
from Tkinter import *
from PIL import Image, ImageTk


def getPhoto(flickr=None, photo_id=''):
    '''
    Get the photo info and the photo sizes so we can use these later on.

    Returns a (photoInfo, photoSizes) tuple with the responses of
    flickr.photos_getInfo and flickr.photos_getSizes for photo_id.
    '''
    photoInfo = flickr.photos_getInfo(photo_id=photo_id)
    photoSizes = flickr.photos_getSizes(photo_id=photo_id)
    return (photoInfo, photoSizes)


def isAllowedLicense(photoInfo=None):
    '''
    Check if the image contains the right license.

    Only license '4' and '5' (cc-by and cc-by-sa) are accepted.
    '''
    license = photoInfo.find('photo').attrib['license']
    return license in ('4', '5')


def getTags(photoInfo=None):
    '''
    Get all the tags on a photo as a list of lowercased strings.
    '''
    tagElements = photoInfo.find('photo').find('tags').findall('tag')
    return [tag.text.lower() for tag in tagElements]


def getFlinfoDescription(photo_id=0):
    '''
    Get the description from http://wikipedia.ramselehof.de/flinfo.php

    Returns the raw response body decoded as UTF-8.
    '''
    parameters = urllib.urlencode({'id': photo_id, 'raw': 'on'})
    response = urllib.urlopen(
        "http://wikipedia.ramselehof.de/flinfo.php?%s" % parameters)
    # Close the connection even when reading fails.
    try:
        rawDescription = response.read()
    finally:
        response.close()
    return rawDescription.decode('utf-8')


def getFilename(photoInfo=None):
    '''
    Build a good filename for the upload based on the username and the title.

    A photo without a title gets an empty title part.
    '''
    photo = photoInfo.find('photo')
    username = photo.find('owner').attrib['username']
    title = photo.find('title').text
    if title:
        title = cleanUpTitle(title)
    else:
        title = u''

    return u'Flickr - %s - %s.jpg' % (username, title)


def cleanUpTitle(title):
    '''
    Turn a Flickr title into something usable as part of a filename.

    Brackets are normalised to parentheses, whitespace is collapsed,
    characters that are awkward in a file title are replaced and finally
    spaces become underscores. The substitutions are applied in order;
    later ones rely on the result of earlier ones.
    '''
    title = title.strip()

    # Raw strings throughout, so that regex escapes and the \1
    # backreferences reach the re module untouched.
    substitutions = (
        (r"[<{\[]", "("),
        (r"[>}\]]", ")"),
        (r"[ _]?\(!\)", ""),
        (r",:[ _]", ", "),
        (r"[;:][ _]", ", "),
        (r"[\t\n ]+", " "),
        (r"[\r\n ]+", " "),
        (r"[\n]+", ""),
        # Drop a '?' or '!' directly before '.', '"' or the end of the title.
        (r'[?!]([."]|$)', r"\1"),
        (r"[&#%?!]", "^"),
        (r"[;]", ","),
        # Slash, plus, backslash and colon all become a dash.
        (r"[/+\\:]", "-"),
        (r"--+", "-"),
        (r",,+", ","),
        # Drop a trailing '-', ',' or '^' before '.' or the end of the title.
        (r"[-,^]([.]|$)", r"\1"),
    )
    for pattern, replacement in substitutions:
        title = re.sub(pattern, replacement, title)

    return title.replace(" ", "_")


def getPhotoUrl(photoSizes=None):
    '''
    Get the url of the jpg file with the highest resolution.

    Returns an empty string when no sizes are listed.
    '''
    url = ''
    # The assumption is that the largest image is last
    for size in photoSizes.find('sizes').findall('size'):
        url = size.attrib['source']
    return url


def buildDescription(flinfoDescription=u'', flickrreview=False, reviewer=u'',
                     override=u''):
    '''
    Build the final description for the image.

    The description is based on the info from flinfo and improved:
    * with override set, the license/review/copyvio templates are removed and
      the override text is placed under the license header;
    * otherwise, with flickrreview and a reviewer set, the {{flickrreview}}
      template is filled in with the reviewer and the current date.
    Windows line endings are always normalised to '\\n'.
    '''
    description = flinfoDescription

    if override:
        removals = (
            u'{{cc-by-sa-2.0}}\n',
            u'{{cc-by-2.0}}\n',
            u'{{flickrreview}}\n',
            u'{{copyvio|Flickr, licensed as "All Rights Reserved" which is not a free license --~~~~}}\n',
        )
        for template in removals:
            description = description.replace(template, u'')
        description = description.replace(
            u'=={{int:license}}==', u'=={{int:license}}==\n' + override)
    elif flickrreview:
        if reviewer:
            description = description.replace(
                u'{{flickrreview}}',
                u'{{flickrreview|' + reviewer + '|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}')
    description = description.replace(u'\r\n', u'\n')
    return description


def _iterPhotoIds(fetchPage, container):
    '''
    Yield the id of every photo in a paginated Flickr listing.

    fetchPage(page) must return the parsed response for the given 1-based
    page number; container is the tag of the element in that response which
    carries the 'pages' attribute and holds the <photo> elements.
    '''
    listing = fetchPage(1).find(container)
    pages = int(listing.attrib['pages'])
    page = 1
    while True:
        for photo in listing.findall('photo'):
            yield photo.attrib['id']
        page += 1
        # Pages are numbered 1..pages inclusive.
        if page > pages:
            break
        listing = fetchPage(page).find(container)


def getPhotos(flickr=None, user_id=u'', group_id=u'', photoset_id=u'',
              tags=u''):
    '''
    Generator yielding the ids of the photos to work on.

    The first of group_id, photoset_id and user_id that is set selects the
    source (group pool, photoset or public photos of a user). Nothing is
    yielded when none of them is set. user_id and tags are only passed on as
    filters for a group pool.
    '''
    # http://www.flickr.com/services/api/flickr.groups.pools.getPhotos.html
    if group_id:
        def fetchPage(page):
            return flickr.groups_pools_getPhotos(
                group_id=group_id, user_id=user_id, tags=tags,
                per_page='100', page=str(page))
        container = 'photos'

    # http://www.flickr.com/services/api/flickr.photosets.getPhotos.html
    elif photoset_id:
        def fetchPage(page):
            return flickr.photosets_getPhotos(
                photoset_id=photoset_id, per_page='100', page=str(page))
        # This call wraps its photos in <photoset>, not <photos>.
        container = 'photoset'

    # http://www.flickr.com/services/api/flickr.people.getPublicPhotos.html
    elif user_id:
        def fetchPage(page):
            return flickr.people_getPublicPhotos(
                user_id=user_id, per_page='100', page=str(page))
        container = 'photos'

    else:
        return

    for photo_id in _iterPhotoIds(fetchPage, container):
        yield photo_id


def processPhoto(flickr=None, photo_id=u'', flickrreview=False, reviewer=u'',
                 override=u''):
    '''
    Process a single Flickr photo.

    The photo is only handled when it has an allowed license or an override
    text is given. The suggested filename and description are shown to the
    user, who can edit them and confirm or skip the photo.

    Returns 1 when the upload was started, 0 when the photo was not handled
    (no photo_id, license not allowed or skipped by the user).
    '''
    if not photo_id:
        return 0

    print(photo_id)
    (photoInfo, photoSizes) = getPhoto(flickr=flickr, photo_id=photo_id)
    if not (isAllowedLicense(photoInfo=photoInfo) or override):
        return 0

    flinfoDescription = getFlinfoDescription(photo_id=photo_id)
    filename = getFilename(photoInfo=photoInfo)
    photoUrl = getPhotoUrl(photoSizes=photoSizes)
    photoDescription = buildDescription(flinfoDescription=flinfoDescription,
                                        flickrreview=flickrreview,
                                        reviewer=reviewer,
                                        override=override)
    (newPhotoDescription, newFilename, skip) = Tkdialog(
        photoDescription, photoUrl, filename).run()

    # TODO: check before uploading whether the file (same name or same hash)
    # is already at Commons and, if needed, pick a different name.
    if skip:
        return 0

    bot = upload.UploadRobot(url=photoUrl, description=newPhotoDescription,
                             useFilename=newFilename, keepFilename=True,
                             verifyDescription=False)
    bot.upload_image(debug=False)
    return 1


class Tkdialog:
    '''
    Dialog showing a photo with its suggested filename and description.

    The user can edit both fields and then press OK or Skip. Use run() to
    show the dialog and collect the result.
    '''

    def __init__(self, photoDescription, photoUrl, filename):
        self.root = Tk()
        # "%dx%d%+d%+d" % (width, height, xoffset, yoffset)
        self.root.geometry("1600x1000+10-10")

        self.root.title(filename)
        # Closing the window must not lead to an unconfirmed upload, so
        # treat it like the Skip button.
        self.root.protocol("WM_DELETE_WINDOW", self.skipFile)
        self.photoDescription = photoDescription
        self.filename = filename
        self.photoUrl = photoUrl
        self.skip = False
        self.exit = False

        ## Init of the widgets
        # The image
        self.image = self.getImage(self.photoUrl, 800, 600)
        self.imagePanel = Label(self.root, image=self.image)

        # Extra reference to the image, stored on the widget itself.
        self.imagePanel.image = self.image

        # The filename
        self.filenameLabel = Label(self.root, text=u"Suggested filename")
        self.filenameField = Entry(self.root, width=100)
        self.filenameField.insert(END, filename)

        # The description
        self.descriptionLabel = Label(self.root,
                                      text=u"Suggested description")
        self.descriptionScrollbar = Scrollbar(self.root, orient=VERTICAL)
        self.descriptionField = Text(self.root)
        self.descriptionField.insert(END, photoDescription)
        self.descriptionField.config(
            state=NORMAL, height=12, width=100, padx=0, pady=0, wrap=WORD,
            yscrollcommand=self.descriptionScrollbar.set)
        self.descriptionScrollbar.config(command=self.descriptionField.yview)

        # The buttons
        self.okButton = Button(self.root, text="OK", command=self.okFile)
        self.skipButton = Button(self.root, text="Skip",
                                 command=self.skipFile)

        ## Start grid
        # The image
        self.imagePanel.grid(row=0, column=0, rowspan=11, columnspan=4)

        # The filename
        self.filenameLabel.grid(row=11, column=0)
        self.filenameField.grid(row=11, column=1, columnspan=3)

        # The description
        self.descriptionLabel.grid(row=12, column=0)
        self.descriptionField.grid(row=12, column=1, columnspan=3)
        self.descriptionScrollbar.grid(row=12, column=5)

        # The buttons
        self.okButton.grid(row=13, column=1, rowspan=2)
        self.skipButton.grid(row=13, column=2, rowspan=2)

    def getImage(self, url, width, height):
        '''
        Download the image at url and return it as a Tk photo image scaled
        down to fit within width x height.
        '''
        response = urllib.urlopen(url)
        # Close the connection even when reading fails.
        try:
            data = response.read()
        finally:
            response.close()
        image = Image.open(StringIO.StringIO(data))
        image.thumbnail((width, height))
        return ImageTk.PhotoImage(image)

    def okFile(self):
        '''
        The user pressed the OK button.
        '''
        self.filename = self.filenameField.get()
        # '1.0' is the first character of a Text widget (line 1, column 0).
        self.photoDescription = self.descriptionField.get('1.0', END)
        self.root.destroy()

    def skipFile(self):
        '''
        The user pressed the Skip button.
        '''
        self.skip = True
        self.root.destroy()

    def run(self):
        '''
        Activate the dialog and return the new description, the new name and
        if the image is skipped.
        '''
        self.root.mainloop()
        return (self.photoDescription, self.filename, self.skip)


def usage():
    '''
    Show the supported command line options.
    '''
    wikipedia.output(u"Flickrripper is a tool to transfer flickr photos to Wikimedia Commons")
    wikipedia.output(u"-group_id:<group_id>\n")
    wikipedia.output(u"-photoset_id:<photoset_id>\n")
    wikipedia.output(u"-user_id:<user_id>\n")
    wikipedia.output(u"-tags:<tag>\n")
    return


def _argValue(arg, option, question):
    '''
    Return the value of an "<option>:<value>" argument; when only the bare
    option was given, ask the user for the value with question.
    '''
    if len(arg) == len(option):
        return wikipedia.input(question)
    return arg[len(option) + 1:]


def _commonsName(names):
    '''
    Return the account configured for Commons in a family -> language ->
    name mapping, or None when there is none.
    '''
    return names.get('commons', {}).get('commons')


def main():
    '''
    Read the configuration and the command line, then process every photo of
    the selected group pool, photoset or user and report the totals.
    '''
    site = wikipedia.getSite(u'commons', u'commons')
    wikipedia.setSite(site)

    # Missing settings are treated as not set rather than raising KeyError.
    flickrConfig = getattr(config, 'flickr', {})

    # Get the api key
    if flickrConfig.get('api_key'):
        flickr = flickrapi.FlickrAPI(flickrConfig['api_key'])
    else:
        wikipedia.output('Flickr api key not found! Get yourself an api key')
        wikipedia.output('Any flickr user can get a key at http://www.flickr.com/services/api/keys/apply/')
        return

    group_id = u''
    photoset_id = u''
    user_id = u''
    tags = u''
    totalPhotos = 0
    uploadedPhotos = 0

    # Do we mark the images as reviewed right away?
    flickrreview = flickrConfig.get('review') or False

    # Set the Flickr reviewer: explicit setting first, then the sysop
    # account, then the normal account at Commons.
    reviewer = (flickrConfig.get('reviewer')
                or _commonsName(getattr(config, 'sysopnames', {}))
                or _commonsName(getattr(config, 'usernames', {}))
                or u'')

    override = u''

    for arg in wikipedia.handleArgs():
        if arg.startswith('-group_id'):
            group_id = _argValue(arg, '-group_id',
                                 u'What is the group_id of the pool?')
        elif arg.startswith('-photoset_id'):
            photoset_id = _argValue(arg, '-photoset_id',
                                    u'What is the photoset_id)?')
        elif arg.startswith('-user_id'):
            user_id = _argValue(arg, '-user_id',
                                u'What is the user_id of the flickr user?')
        elif arg.startswith('-tags'):
            tags = _argValue(arg, '-tags',
                             u'What is the tag you want to filter out (currently only one supported)?')
        elif arg == '-flickrreview':
            flickrreview = True
        elif arg.startswith('-reviewer'):
            reviewer = _argValue(arg, '-reviewer', u'Who is the reviewer?')
        elif arg.startswith('-override'):
            override = _argValue(arg, '-override',
                                 u'What is the override text?')

    if user_id or group_id or photoset_id:
        for photo_id in getPhotos(flickr=flickr, user_id=user_id,
                                  group_id=group_id, photoset_id=photoset_id,
                                  tags=tags):
            uploadedPhotos = uploadedPhotos + processPhoto(
                flickr=flickr, photo_id=photo_id, flickrreview=flickrreview,
                reviewer=reviewer, override=override)
            totalPhotos = totalPhotos + 1
    else:
        usage()

    wikipedia.output(u'Finished running')
    wikipedia.output(u'Total photos: ' + str(totalPhotos))
    wikipedia.output(u'Uploaded photos: ' + str(uploadedPhotos))


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()
pywikipedia-svn@lists.wikimedia.org