Revision: 8669
Author: multichill
Date: 2010-10-20 14:41:13 +0000 (Wed, 20 Oct 2010)
Log Message:
-----------
First version of a tool to mass copy images from Panoramio.
Added Paths:
-----------
trunk/pywikipedia/panoramiopicker.py
Copied: trunk/pywikipedia/panoramiopicker.py (from rev 8668,
trunk/pywikipedia/flickrripper.py)
===================================================================
--- trunk/pywikipedia/panoramiopicker.py (rev 0)
+++ trunk/pywikipedia/panoramiopicker.py 2010-10-20 14:41:13 UTC (rev 8669)
@@ -0,0 +1,449 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Tool to copy a Panoramio set to Commons
+
+'''
+#
+# (C) Multichill, 2010
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import sys, urllib, re, StringIO, hashlib, base64, time, json
+import wikipedia as pywikibot
+import config, query, imagerecat, upload
+
+from Tkinter import *
+from PIL import Image, ImageTk # see:
http://www.pythonware.com/products/pil/
+from BeautifulSoup import BeautifulSoup
+
+def isAllowedLicense(photoInfo = None):
+ '''
+ Check if the image contains the right license
+
+ TODO: Maybe add more licenses
+ '''
+ allowed = [u'by-sa']
+ if photoInfo[u'license'] in allowed:
+ return True
+ else:
+ return False
+
+def downloadPhoto(photoUrl = ''):
+ '''
+ Download the photo and store it in a StrinIO.StringIO object.
+
+ TODO: Add exception handling
+
+ '''
+ imageFile=urllib.urlopen(photoUrl).read()
+ return StringIO.StringIO(imageFile)
+
+def findDuplicateImages(photo=None,
+ site=pywikibot.getSite(u'commons', u'commons')):
+ ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
+ for a list of duplicates.
+
+ TODO: Add exception handling, fix site thing
+
+ '''
+ hashObject = hashlib.sha1()
+ hashObject.update(photo.getvalue())
+ return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
+
+def getTags(photoInfo = None):
+ ''' Get all the tags on a photo '''
+ result = []
+
+ return result
+
+def getLicense(photoInfo=None):
+ '''
+ Currently the Panoramio API doesn't expose the license
+ Adding it with a beautiful soup hack
+ '''
+
+ photoInfo['license']=u'c'
+ page = urllib.urlopen(photoInfo.get(u'photo_url'))
+ data = page.read()
+ soup = BeautifulSoup(data)
+ if soup.find("div", {'id' : 'photo-info'}):
+ pointer = soup.find("div", {'id' : 'photo-info'})
+ if pointer.find("div", {'id' : 'photo-details'}):
+ pointer = pointer.find("div", {'id' :
'photo-details'})
+ if pointer.find("ul", {'id' : 'details'}):
+ pointer = pointer.find("ul", {'id' :
'details'})
+ if pointer.find("li", {'class' : 'license
by-sa'}):
+ photoInfo['license']=u'by-sa'
+ # Does Panoramio have more license options?
+
+ return photoInfo
+
+
+
+def getFilename(photoInfo=None, site=pywikibot.getSite(u'commons',
u'commons'),
+ project=u'Panoramio'):
+ ''' Build a good filename for the upload based on the username and the
+ title. Prevents naming collisions.
+
+ '''
+ username = photoInfo.get(u'owner_name')
+ title = photoInfo.get(u'photo_title')
+ if title:
+ title = cleanUpTitle(title)
+ else:
+ title = u''
+
+ if pywikibot.Page(site, u'File:%s - %s - %s.jpg'
+ % (project, username, title) ).exists():
+ i = 1
+ while True:
+ if (pywikibot.Page(site, u'File:%s - %s - %s (%s).jpg'
+ % (project, username, title, str(i))).exists()):
+ i = i + 1
+ else:
+ return u'%s - %s - %s (%s).jpg' % (project, username, title,
+ str(i))
+ else:
+ return u'%s - %s - %s.jpg' % (project, username, title)
+
+def cleanUpTitle(title):
+ ''' Clean up the title of a potential mediawiki page. Otherwise the title
of
+ the page might not be allowed by the software.
+
+ '''
+ title = title.strip()
+ title = re.sub(u"[<{\\[]", u"(", title)
+ title = re.sub(u"[>}\\]]", u")", title)
+ title = re.sub(u"[ _]?\\(!\\)", u"", title)
+ title = re.sub(u",:[ _]", u", ", title)
+ title = re.sub(u"[;:][ _]", u", ", title)
+ title = re.sub(u"[\t\n ]+", u" ", title)
+ title = re.sub(u"[\r\n ]+", u" ", title)
+ title = re.sub(u"[\n]+", u"", title)
+ title = re.sub(u"[?!]([.\"]|$)", u"\\1", title)
+ title = re.sub(u"[&#%?!]", u"^", title)
+ title = re.sub(u"[;]", u",", title)
+ title = re.sub(u"[/+\\\\:]", u"-", title)
+ title = re.sub(u"--+", u"-", title)
+ title = re.sub(u",,+", u",", title)
+ title = re.sub(u"[-,^]([.]|$)", u"\\1", title)
+ title = title.replace(u" ", u"_")
+ return title
+
+
+def getDescription(photoInfo=None, panoramioreview=False, reviewer=u'',
+ override=u'', addCategory=u''):
+ '''
+ Build description for the image.
+ '''
+
+ desc = u''
+ desc = desc + u'{{Information\n'
+ desc = desc + u'|description=%(photo_title)s\n'
+ desc = desc + u'|date=%(upload_date)s (upload date)\n'
+ desc = desc + u'|source=[%(photo_url)s Panoramio]\n'
+ desc = desc + u'|author=[%(owner_url)s?with_photo_id=%(photo_id)s %(owner_name)s]
\n'
+ desc = desc + u'|permission=\n'
+ desc = desc + u'|other_versions=\n'
+ desc = desc + u'|other_fields=\n'
+ desc = desc + u'}}\n'
+ if photoInfo.get(u'latitude') and photoInfo.get(u'longitude'):
+ desc = desc + u'{{Location
dec|%(latitude)s|%(longitude)s|source:Panoramio}}\n'
+ desc = desc + u'\n'
+ desc = desc + u'=={{int:license-header}}==\n'
+
+ if override:
+ desc = desc + override
+ else:
+ if photoInfo.get(u'license')==u'by-sa':
+ desc = desc + u'{{Cc-by-sa-3.0}}\n'
+ if panoramioreview:
+ desc = desc +
u'{{Panoramioreview|%s|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}\n'
% (reviewer,)
+ else:
+ desc = desc + u'{{Panoramioreview}}\n'
+
+ desc = desc + u'\n'
+ if addCategory:
+ desc = desc + u'\n[[Category:%s]]\n' % (addCategory,)
+ # Still have to get a bunch of categories based on the location
+ desc = desc + u'{{subst:Unc}}\n'
+
+ return desc % photoInfo
+
+def processPhoto(photoInfo=None, panoramioreview=False, reviewer=u'',
+ override=u'', addCategory=u'', autonomous=False):
+ ''' Process a single Panoramio photo '''
+
+
+ if isAllowedLicense(photoInfo) or override:
+ #Should download the photo only once
+ photo = downloadPhoto(photoInfo.get(u'photo_file_url'))
+
+ #Don't upload duplicate images, should add override option
+ duplicates = findDuplicateImages(photo)
+ if duplicates:
+ pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
+ else:
+ filename = getFilename(photoInfo)
+ pywikibot.output(filename)
+ description = getDescription(photoInfo, panoramioreview,
+ reviewer, override, addCategory)
+
+ pywikibot.output(description)
+ if not autonomous:
+ (newDescription, newFilename, skip) = Tkdialog(
+ description, photo, filename).run()
+ else:
+ newDescription = description
+ newFilename = filename
+ skip = False
+ #pywikibot.output(newPhotoDescription)
+ #if (pywikibot.Page(title=u'File:'+ filename,
site=pywikibot.getSite()).exists()):
+ # I should probably check if the hash is the same and if not upload it under a
different name
+ #pywikibot.output(u'File:' + filename + u' already exists!')
+ #else:
+ #Do the actual upload
+ #Would be nice to check before I upload if the file is already at Commons
+ #Not that important for this program, but maybe for derived programs
+ if not skip:
+ bot = upload.UploadRobot(photoInfo.get(u'photo_file_url'),
+ description=newDescription,
+ useFilename=newFilename,
+ keepFilename=True,
+ verifyDescription=False)
+ bot.upload_image(debug=False)
+ return 1
+ return 0
+
+
+class Tkdialog:
+ ''' The user dialog. '''
+ def __init__(self, photoDescription, photo, filename):
+ self.root=Tk()
+ #"%dx%d%+d%+d" % (width, height, xoffset, yoffset)
+ self.root.geometry("%ix%i+10-10"%(config.tkhorsize,
config.tkvertsize))
+
+ self.root.title(filename)
+ self.photoDescription = photoDescription
+ self.filename = filename
+ self.photo = photo
+ self.skip=False
+ self.exit=False
+
+ ## Init of the widgets
+ # The image
+ self.image=self.getImage(self.photo, 800, 600)
+ self.imagePanel=Label(self.root, image=self.image)
+
+ self.imagePanel.image = self.image
+
+ # The filename
+ self.filenameLabel=Label(self.root,text=u"Suggested filename")
+ self.filenameField=Entry(self.root, width=100)
+ self.filenameField.insert(END, filename)
+
+ # The description
+ self.descriptionLabel=Label(self.root,text=u"Suggested description")
+ self.descriptionScrollbar=Scrollbar(self.root, orient=VERTICAL)
+ self.descriptionField=Text(self.root)
+ self.descriptionField.insert(END, photoDescription)
+ self.descriptionField.config(state=NORMAL, height=12, width=100, padx=0, pady=0,
wrap=WORD, yscrollcommand=self.descriptionScrollbar.set)
+ self.descriptionScrollbar.config(command=self.descriptionField.yview)
+
+ # The buttons
+ self.okButton=Button(self.root, text="OK", command=self.okFile)
+ self.skipButton=Button(self.root, text="Skip", command=self.skipFile)
+
+ ## Start grid
+
+ # The image
+ self.imagePanel.grid(row=0, column=0, rowspan=11, columnspan=4)
+
+ # The buttons
+ self.okButton.grid(row=11, column=1, rowspan=2)
+ self.skipButton.grid(row=11, column=2, rowspan=2)
+
+ # The filename
+ self.filenameLabel.grid(row=13, column=0)
+ self.filenameField.grid(row=13, column=1, columnspan=3)
+
+ # The description
+ self.descriptionLabel.grid(row=14, column=0)
+ self.descriptionField.grid(row=14, column=1, columnspan=3)
+ self.descriptionScrollbar.grid(row=14, column=5)
+
+ def getImage(self, photo, width, height):
+ ''' Take the StringIO object and build an imageTK thumbnail
'''
+ image = Image.open(photo)
+ image.thumbnail((width, height))
+ imageTk = ImageTk.PhotoImage(image)
+ return imageTk
+
+ def okFile(self):
+ ''' The user pressed the OK button. '''
+ self.filename=self.filenameField.get()
+ self.photoDescription=self.descriptionField.get(0.0, END)
+ self.root.destroy()
+
+ def skipFile(self):
+ ''' The user pressed the Skip button. '''
+ self.skip=True
+ self.root.destroy()
+
+ def run(self):
+ ''' Activate the dialog and return the new name and if the image is
+ skipped.
+
+ '''
+ self.root.mainloop()
+ return (self.photoDescription, self.filename, self.skip)
+
+
+def getPhotos(photoset=u'', start_id='', end_id='',
interval=100):
+ ''' Loop over a set of Panoramio photos. '''
+ i=0
+ has_more=True
+ url =
u'http://www.panoramio.com/map/get_panoramas.php?set=%s&from=%s&…
+ while has_more:
+ gotInfo = False
+ maxtries = 10
+ tries = 0
+ while(not gotInfo):
+ try:
+ if ( tries < maxtries ):
+ tries = tries + 1
+ panoramioApiPage = urllib.urlopen(url % (photoset, i, i+interval))
+ contents = panoramioApiPage.read().decode('utf-8')
+ gotInfo = True
+ i = i + interval
+ else:
+ break
+ except IOError:
+ pywikibot.output(u'Got an IOError, let\'s try again')
+ except socket.timeout:
+ pywikibot.output(u'Got a timeout, let\'s try again')
+
+ metadata = json.loads(contents)
+ count = metadata.get(u'count') # Useless?
+ photos = metadata.get(u'photos')
+ for photo in photos:
+ yield photo
+ has_more = metadata.get(u'has_more')
+
+ return
+
+def usage():
+ '''
+ Print usage information
+
+ TODO : Need more.
+ '''
+ pywikibot.output(
+ u"Panoramiopicker is a tool to transfer Panaramio photos to Wikimedia
Commons")
+ pywikibot.output(u"-set:<set_id>\n")
+ return
+
+def main():
+ site = pywikibot.getSite(u'commons', u'commons')
+ pywikibot.setSite(site)
+ #imagerecat.initLists()
+
+ photoset = u'' #public (popular photos), full (all photos), user ID number
+ size = u'original'
+ minx = u''
+ miny = u''
+ maxx = u''
+ maxy = u''
+ start_id = u''
+ end_id = u''
+ addCategory = u''
+ autonomous = False
+ totalPhotos = 0
+ uploadedPhotos = 0
+
+ # Do we mark the images as reviewed right away?
+ if config.panoramio ['review']:
+ panoramioreview = config.panoramio['review']
+ else:
+ panoramioreview = False
+
+ # Set the Panoramio reviewer
+ if config.panoramio['reviewer']:
+ reviewer = config.panoramio['reviewer']
+ elif 'commons' in config.sysopnames['commons']:
+ print config.sysopnames['commons']
+ reviewer = config.sysopnames['commons']['commons']
+ elif 'commons' in config.usernames['commons']:
+ reviewer = config.usernames['commons']['commons']
+ else:
+ reviewer = u''
+
+ # Should be renamed to overrideLicense or something like that
+ override = u''
+ for arg in pywikibot.handleArgs():
+ if arg.startswith('-set'):
+ if len(arg) == 4:
+ photoset = pywikibot.input(u'What is the set?')
+ else:
+ photoset = arg[5:]
+ elif arg.startswith('-start_id'):
+ if len(arg) == 9:
+ start_id = pywikibot.input(
+ u'What is the id of the photo you want to start at?')
+ else:
+ start_id = arg[10:]
+ elif arg.startswith('-end_id'):
+ if len(arg) == 7:
+ end_id = pywikibot.input(
+ u'What is the id of the photo you want to end at?')
+ else:
+ end_id = arg[8:]
+ elif arg.startswith('-tags'):
+ if len(arg) == 5:
+ tags = pywikibot.input(
+ u'What is the tag you want to filter out (currently only one
supported)?')
+ else:
+ tags = arg[6:]
+ elif arg == '-panoramioreview':
+ panoramioreview = True
+ elif arg.startswith('-reviewer'):
+ if len(arg) == 9:
+ reviewer = pywikibot.input(u'Who is the reviewer?')
+ else:
+ reviewer = arg[10:]
+ elif arg.startswith('-override'):
+ if len(arg) == 9:
+ override = pywikibot.input(u'What is the override text?')
+ else:
+ override = arg[10:]
+ elif arg.startswith('-addcategory'):
+ if len(arg) == 12:
+ addCategory = pywikibot.input(
+ u'What category do you want to add?')
+ else:
+ addCategory = arg[13:]
+ elif arg == '-autonomous':
+ autonomous = True
+
+ if photoset:
+ for photoInfo in getPhotos(photoset, start_id, end_id):
+ photoInfo = getLicense(photoInfo)
+ #time.sleep(10)
+ uploadedPhotos += processPhoto(photoInfo, panoramioreview,
+ reviewer, override, addCategory,
+ autonomous)
+ totalPhotos += 1
+ else:
+ usage()
+ pywikibot.output(u'Finished running')
+ pywikibot.output(u'Total photos: ' + str(totalPhotos))
+ pywikibot.output(u'Uploaded photos: ' + str(uploadedPhotos))
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ pywikibot.stopme()