[Gerrit] Port flickrripper.py from compat - change (pywikibot/core) - Pywikibot-commits

1 Oct 2013

jenkins-bot has submitted this change and it was merged.
Change subject: Port flickrripper.py from compat
......................................................................
Port flickrripper.py from compat
Change-Id: I0e4b71f90d4690861ea58ba3bf754c1d4f49f1c5
---
A scripts/flickrripper.py
1 file changed, 620 insertions(+), 0 deletions(-)
Approvals:
  Merlijn van Deen: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/scripts/flickrripper.py b/scripts/flickrripper.py
new file mode 100644
index 0000000..3e6f145
--- /dev/null
+++ b/scripts/flickrripper.py
@@ -0,0 +1,620 @@
+#!/usr/bin/python
+# -*- coding: utf-8  -*-
+'''
+Tool to copy a flickr stream to Commons
+
+Workflow:
+# Get a set to work on (start with just a username).
+#* Make it possible to delimit the set (from/to)
+# For each image
+#* Check the license
+#* Check if it isn't already on Commons
+#* Build suggested filename
+#** Check for name collision and maybe alter it
+#* Pull description from Flinfo
+#* Show image and description to user
+#** Add a nice hotcat lookalike for the adding of categories
+#** Filter the categories
+#* Upload the image
+
+Todo:
+* Check if the image is already uploaded (SHA hash)
+* Check and prevent filename collisions
+** Initial suggestion
+** User input
+* Filter the categories
+
+'''
+#
+# (C) Multichill, 2009
+# (C) Pywikipedia team, 2009-2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import sys, urllib, re,  StringIO, hashlib, base64, time
+import pywikibot
+from pywikibot import config
+from pywikibot.data import api
+import imagerecat, upload
+
+import flickrapi                  # see: http://stuvel.eu/projects/flickrapi
+import xml.etree.ElementTree
+from Tkinter import *
+from PIL import Image, ImageTk    # see: http://www.pythonware.com/products/pil/
+
+flickr_allowed_license = {
+    0 : False, # All Rights Reserved
+    1 : False, # Creative Commons Attribution-NonCommercial-ShareAlike License
+    2 : False, # Creative Commons Attribution-NonCommercial License
+    3 : False, # Creative Commons Attribution-NonCommercial-NoDerivs License
+    4 : True,  # Creative Commons Attribution License
+    5 : True,  # Creative Commons Attribution-ShareAlike License
+    6 : False, # Creative Commons Attribution-NoDerivs License
+    7 : True,  # No known copyright restrictions
+    8 : True,  # United States Government Work
+}
+
+def getPhoto(flickr = None, photo_id = ''):
+    '''
+    Get the photo info and the photo sizes so we can use these later on
+
+    TODO: Add exception handling
+
+    '''
+    gotPhoto = False
+    while not gotPhoto:
+        try:
+            photoInfo = flickr.photos_getInfo(photo_id=photo_id)
+            #xml.etree.ElementTree.dump(photoInfo)
+            photoSizes = flickr.photos_getSizes(photo_id=photo_id)
+            #xml.etree.ElementTree.dump(photoSizes)
+            gotPhoto = True
+        except flickrapi.exceptions.FlickrError:
+            gotPhotos = False
+            pywikibot.output(u'Flickr api problem, sleeping')
+            time.sleep(30)
+    return (photoInfo, photoSizes)
+
+def isAllowedLicense(photoInfo = None):
+    '''
+    Check if the image contains the right license
+
+    TODO: Maybe add more licenses
+    '''
+
+    license = photoInfo.find('photo').attrib['license']
+    if flickr_allowed_license[int(license)]:
+        return True
+    else:
+        return False
+
+def getPhotoUrl(photoSizes = None):
+    '''
+    Get the url of the jpg file with the highest resolution
+    '''
+    url = ''
+    # The assumption is that the largest image is last
+    for size in photoSizes.find('sizes').findall('size'):
+        url = size.attrib['source']
+    return url
+
+def downloadPhoto(photoUrl = ''):
+    '''
+    Download the photo and store it in a StrinIO.StringIO object.
+
+    TODO: Add exception handling
+
+    '''
+    imageFile=urllib.urlopen(photoUrl).read()
+    return StringIO.StringIO(imageFile)
+
+def findDuplicateImages(photo=None,
+                        site=pywikibot.getSite(u'commons', u'commons')):
+    ''' Takes the photo, calculates the SHA1 hash and asks the mediawiki api
+    for a list of duplicates.
+
+    TODO: Add exception handling, fix site thing
+
+    '''
+    hashObject = hashlib.sha1()
+    hashObject.update(photo.getvalue())
+    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
+
+def getTags(photoInfo = None):
+    ''' Get all the tags on a photo '''
+    result = []
+    for tag in photoInfo.find('photo').find('tags').findall('tag'):
+        result.append(tag.text.lower())
+
+    return result
+
+def getFlinfoDescription(photo_id = 0):
+    '''
+    Get the description from http://wikipedia.ramselehof.de/flinfo.php
+
+    TODO: Add exception handling, try a couple of times
+    '''
+    parameters = urllib.urlencode({'id' : photo_id, 'raw' : 'on'})
+
+    rawDescription = urllib.urlopen(
+        "http://wikipedia.ramselehof.de/flinfo.php?%s" % parameters).read()
+
+    return rawDescription.decode('utf-8')
+
+def getFilename(photoInfo=None, site=None, project=u'Flickr'):
+    """ Build a good filename for the upload based on the username and the
+    title. Prevents naming collisions.
+
+    """
+    if not site:
+        site = pywikibot.Site(u'commons', u'commons')
+    username = photoInfo.find('photo').find('owner').attrib['username']
+    title = photoInfo.find('photo').find('title').text
+    if title:
+        title = cleanUpTitle(title)
+
+    if not title:
+        #find the max length for a mw title
+        maxBytes = 240 - len(project.encode('utf-8')) \
+                       - len(username.encode('utf-8'))
+        description = photoInfo.find('photo').find('description').text
+        if description:
+            descBytes = len(description.encode('utf-8'))
+            if descBytes > maxBytes:
+                # maybe we cut more than needed, anyway we do it
+                items = max(min(len(description), maxBytes / 4),
+                            len(description) - descBytes + maxBytes)
+                description = description[:items]
+            title = cleanUpTitle(description)
+        else:
+            title = u''
+            # Should probably have the id of the photo as last resort.
+
+    if pywikibot.Page(site, u'File:%s - %s - %s.jpg'
+                      % (title, project, username)).exists():
+        i = 1
+        while True:
+            if (pywikibot.Page(site, u'File:%s - %s - %s (%d).jpg'
+                               % (title, project, username, i)).exists()):
+                i += 1
+            else:
+                return u'%s - %s - %s (%d).jpg' % (title, project, username, i)
+    else:
+        return u'%s - %s - %s.jpg' % (title, project, username)
+
+def cleanUpTitle(title):
+    ''' Clean up the title of a potential mediawiki page. Otherwise the title of
+    the page might not be allowed by the software.
+
+    '''
+    title = title.strip()
+    title = re.sub(u"[<{\[]", u"(", title)
+    title = re.sub(u"[>}\]]", u")", title)
+    title = re.sub(u"[ _]?\(!\)", u"", title)
+    title = re.sub(u",:[ _]", u", ", title)
+    title = re.sub(u"[;:][ _]", u", ", title)
+    title = re.sub(u"[\t\n ]+", u" ", title)
+    title = re.sub(u"[\r\n ]+", u" ", title)
+    title = re.sub(u"[\n]+", u"", title)
+    title = re.sub(u"[?!]([."]|$)", u"\1", title)
+    title = re.sub(u"[&#%?!]", u"^", title)
+    title = re.sub(u"[;]", u",", title)
+    title = re.sub(u"[/+\\:]", u"-", title)
+    title = re.sub(u"--+", u"-", title)
+    title = re.sub(u",,+", u",", title)
+    title = re.sub(u"[-,^]([.]|$)", u"\1", title)
+    title = title.replace(u" ", u"_")
+    title = title.strip(u"_")
+    return title
+
+
+def buildDescription(flinfoDescription=u'', flickrreview=False, reviewer=u'',
+                     override=u'', addCategory=u'', removeCategories=False):
+    ''' Build the final description for the image. The description is based on
+    the info from flickrinfo and improved.
+
+    '''
+    description = u'== {{int:filedesc}} ==\n%s' % flinfoDescription
+    if removeCategories:
+        description = pywikibot.removeCategoryLinks(description,
+                                                    pywikibot.Site(
+                                                        'commons', 'commons'))
+    if override:
+        description = description.replace(u'{{cc-by-sa-2.0}}\n', u'')
+        description = description.replace(u'{{cc-by-2.0}}\n', u'')
+        description = description.replace(u'{{flickrreview}}\n', u'')
+        description = description.replace(
+            u'{{copyvio|Flickr, licensed as "All Rights Reserved" which is not a free license --~~~~}}\n',
+            u'')
+        description = description.replace(u'=={{int:license}}==',
+                                          u'=={{int:license}}==\n' + override)
+    elif flickrreview:
+        if reviewer:
+            description = description.replace(u'{{flickrreview}}',
+                                              u'{{flickrreview|' + reviewer +
+                                              '|{{subst:CURRENTYEAR}}-{{subst:CURRENTMONTH}}-{{subst:CURRENTDAY2}}}}')
+    if addCategory:
+        description = description.replace(u'{{subst:unc}}\n', u'')
+        description = description + u'\n[[Category:' + addCategory + ']]\n'
+    description = description.replace(u'\r\n', u'\n')
+    return description
+
+def processPhoto(flickr=None, photo_id=u'', flickrreview=False, reviewer=u'',
+                 override=u'', addCategory=u'', removeCategories=False,
+                 autonomous=False):
+    ''' Process a single Flickr photo '''
+    if photo_id:
+        print photo_id
+        (photoInfo, photoSizes) = getPhoto(flickr, photo_id)
+    if  isAllowedLicense(photoInfo) or override:
+        #Get the url of the largest photo
+        photoUrl = getPhotoUrl(photoSizes)
+        #Should download the photo only once
+        photo = downloadPhoto(photoUrl)
+
+        #Don't upload duplicate images, should add override option
+        duplicates = findDuplicateImages(photo)
+        if duplicates:
+            pywikibot.output(u'Found duplicate image at %s' % duplicates.pop())
+        else:
+            filename = getFilename(photoInfo)
+            flinfoDescription = getFlinfoDescription(photo_id)
+            photoDescription = buildDescription(flinfoDescription,
+                                                flickrreview, reviewer,
+                                                override, addCategory,
+                                                removeCategories)
+            #pywikibot.output(photoDescription)
+            if not autonomous:
+                (newPhotoDescription, newFilename, skip) = Tkdialog(
+                    photoDescription, photo, filename).run()
+            else:
+                newPhotoDescription = photoDescription
+                newFilename = filename
+                skip = False
+        #pywikibot.output(newPhotoDescription)
+        #if (pywikibot.Page(title=u'File:'+ filename, site=pywikibot.getSite()).exists()):
+        # I should probably check if the hash is the same and if not upload it under a different name
+        #pywikibot.output(u'File:' + filename + u' already exists!')
+        #else:
+            #Do the actual upload
+            #Would be nice to check before I upload if the file is already at Commons
+            #Not that important for this program, but maybe for derived programs
+            if not skip:
+                bot = upload.UploadRobot(photoUrl,
+                                         description=newPhotoDescription,
+                                         useFilename=newFilename,
+                                         keepFilename=True,
+                                         verifyDescription=False)
+                bot.upload_image(debug=False)
+                return 1
+    else:
+        pywikibot.output(u'Invalid license')
+    return 0
+
+
+class Tkdialog:
+    ''' The user dialog. '''
+    def __init__(self, photoDescription, photo, filename):
+        self.root=Tk()
+        #"%dx%d%+d%+d" % (width, height, xoffset, yoffset)
+        self.root.geometry("%ix%i+10-10"%(config.tkhorsize, config.tkvertsize))
+
+        self.root.title(filename)
+        self.photoDescription = photoDescription
+        self.filename = filename
+        self.photo = photo
+        self.skip=False
+        self.exit=False
+
+        ## Init of the widgets
+        # The image
+        self.image=self.getImage(self.photo, 800, 600)
+        self.imagePanel=Label(self.root, image=self.image)
+
+        self.imagePanel.image = self.image
+
+        # The filename
+        self.filenameLabel=Label(self.root,text=u"Suggested filename")
+        self.filenameField=Entry(self.root, width=100)
+        self.filenameField.insert(END, filename)
+
+        # The description
+        self.descriptionLabel=Label(self.root,text=u"Suggested description")
+        self.descriptionScrollbar=Scrollbar(self.root, orient=VERTICAL)
+        self.descriptionField=Text(self.root)
+        self.descriptionField.insert(END, photoDescription)
+        self.descriptionField.config(state=NORMAL, height=12, width=100, padx=0, pady=0, wrap=WORD, yscrollcommand=self.descriptionScrollbar.set)
+        self.descriptionScrollbar.config(command=self.descriptionField.yview)
+
+        # The buttons
+        self.okButton=Button(self.root, text="OK", command=self.okFile)
+        self.skipButton=Button(self.root, text="Skip", command=self.skipFile)
+
+        ## Start grid
+
+        # The image
+        self.imagePanel.grid(row=0, column=0, rowspan=11, columnspan=4)
+
+        # The buttons
+        self.okButton.grid(row=11, column=1, rowspan=2)
+        self.skipButton.grid(row=11, column=2, rowspan=2)
+
+        # The filename
+        self.filenameLabel.grid(row=13, column=0)
+        self.filenameField.grid(row=13, column=1, columnspan=3)
+
+        # The description
+        self.descriptionLabel.grid(row=14, column=0)
+        self.descriptionField.grid(row=14, column=1, columnspan=3)
+        self.descriptionScrollbar.grid(row=14, column=5)
+
+    def getImage(self, photo, width, height):
+        ''' Take the StringIO object and build an imageTK thumbnail '''
+        image = Image.open(photo)
+        image.thumbnail((width, height))
+        imageTk = ImageTk.PhotoImage(image)
+        return imageTk
+
+    def okFile(self):
+        ''' The user pressed the OK button. '''
+        self.filename=self.filenameField.get()
+        self.photoDescription=self.descriptionField.get(0.0, END)
+        self.root.destroy()
+
+    def skipFile(self):
+        ''' The user pressed the Skip button. '''
+        self.skip=True
+        self.root.destroy()
+
+    def run(self):
+        ''' Activate the dialog and return the new name and if the image is
+        skipped.
+
+        '''
+        self.root.mainloop()
+        return (self.photoDescription, self.filename, self.skip)
+
+
+def getPhotos(flickr=None, user_id=u'', group_id=u'', photoset_id=u'',
+              start_id='', end_id='', tags=u''):
+    ''' Loop over a set of Flickr photos. '''
+    result = []
+    retry = False
+    if not start_id:
+        found_start_id=True
+    else:
+        found_start_id=False
+
+    # http://www.flickr.com/services/api/flickr.groups.pools.getPhotos.html
+    # Get the photos in a group
+    if group_id:
+        #First get the total number of photo's in the group
+        photos = flickr.groups_pools_getPhotos(group_id=group_id,
+                                               user_id=user_id, tags=tags,
+                                               per_page='100', page='1')
+        pages = photos.find('photos').attrib['pages']
+
+        for i in range(1, int(pages) + 1):
+            gotPhotos = False
+            while not gotPhotos:
+                try:
+                    for photo in flickr.groups_pools_getPhotos(
+                        group_id=group_id, user_id=user_id, tags=tags,
+                        per_page='100', page=i
+                        ).find('photos').getchildren():
+                        gotPhotos = True
+                        if photo.attrib['id']==start_id:
+                            found_start_id=True
+                        if found_start_id:
+                            if photo.attrib['id']==end_id:
+                                pywikibot.output('Found end_id')
+                                return
+                            else:
+                                yield photo.attrib['id']
+
+                except flickrapi.exceptions.FlickrError:
+                    gotPhotos = False
+                    pywikibot.output(u'Flickr api problem, sleeping')
+                    time.sleep(30)
+
+    # http://www.flickr.com/services/api/flickr.photosets.getPhotos.html
+    # Get the photos in a photoset
+    elif photoset_id:
+        photos = flickr.photosets_getPhotos(photoset_id=photoset_id,
+                                            per_page='100', page='1')
+        pages = photos.find('photoset').attrib['pages']
+
+        for i in range(1, int(pages)+1):
+            gotPhotos = False
+            while not gotPhotos:
+                try:
+                    for photo in flickr.photosets_getPhotos(
+                        photoset_id=photoset_id, per_page='100', page=i
+                        ).find('photoset').getchildren():
+                        gotPhotos = True
+                        if photo.attrib['id']==start_id:
+                            found_start_id=True
+                        if found_start_id:
+                            if photo.attrib['id']==end_id:
+                                pywikibot.output('Found end_id')
+                                return
+                            else:
+                                yield photo.attrib['id']
+
+                except flickrapi.exceptions.FlickrError:
+                    gotPhotos = False
+                    pywikibot.output(u'Flickr api problem, sleeping')
+                    time.sleep(30)
+
+    # http://www.flickr.com/services/api/flickr.people.getPublicPhotos.html
+    # Get the (public) photos uploaded by a user
+    elif user_id:
+        photos = flickr.people_getPublicPhotos(user_id=user_id,
+                                               per_page='100', page='1')
+        pages = photos.find('photos').attrib['pages']
+        #flickrapi.exceptions.FlickrError
+        for i in range(1, int(pages)+1):
+            gotPhotos = False
+            while not gotPhotos:
+                try:
+                    for photo in flickr.people_getPublicPhotos(
+                        user_id=user_id, per_page='100', page=i
+                        ).find('photos').getchildren():
+                        gotPhotos = True
+                        if photo.attrib['id'] == start_id:
+                            found_start_id=True
+                        if found_start_id:
+                            if photo.attrib['id'] == end_id:
+                                pywikibot.output('Found end_id')
+                                return
+                            else:
+                                yield photo.attrib['id']
+
+                except flickrapi.exceptions.FlickrError:
+                    gotPhotos = False
+                    pywikibot.output(u'Flickr api problem, sleeping')
+                    time.sleep(30)
+
+    return
+
+def usage():
+    '''
+    Print usage information
+
+    TODO : Need more.
+    '''
+    pywikibot.output(
+        u"Flickrripper is a tool to transfer flickr photos to Wikimedia Commons")
+    pywikibot.output(u"-group_id:<group_id>\n")
+    pywikibot.output(u"-photoset_id:<photoset_id>\n")
+    pywikibot.output(u"-user_id:<user_id>\n")
+    pywikibot.output(u"-tags:<tag>\n")
+    return
+
+def main():
+    site = pywikibot.getSite(u'commons', u'commons')
+    #imagerecat.initLists()
+
+    #Get the api key
+    if not config.flickr['api_key']:
+        pywikibot.output('Flickr api key not found! Get yourself an api key')
+        pywikibot.output(
+            'Any flickr user can get a key at http://www.flickr.com/services/api/keys/apply/')
+        return
+
+    if 'api_secret' in config.flickr and config.flickr['api_secret']:
+        flickr = flickrapi.FlickrAPI(config.flickr['api_key'], config.flickr['api_secret'])
+        (token, frob) = flickr.get_token_part_one(perms='read')
+        if not token: # The user still hasn't authorised this app yet, get_token_part_one() will have spawn a browser window
+            pywikibot.input("Press ENTER after you authorized this program")
+        flickr.get_token_part_two((token, frob))
+    else:
+        print 'Accessing public content only'
+        flickr = flickrapi.FlickrAPI(config.flickr['api_key'])
+
+
+    group_id = u''
+    photoset_id = u''
+    user_id = u''
+    start_id= u''
+    end_id=u''
+    tags = u''
+    addCategory = u''
+    removeCategories = False
+    autonomous = False
+    totalPhotos = 0
+    uploadedPhotos = 0
+
+    # Do we mark the images as reviewed right away?
+    if config.flickr['review']:
+        flickrreview = config.flickr['review']
+    else:
+        flickrreview = False
+
+    # Set the Flickr reviewer
+    if config.flickr['reviewer']:
+        reviewer = config.flickr['reviewer']
+    elif 'commons' in config.sysopnames['commons']:
+        print config.sysopnames['commons']
+        reviewer = config.sysopnames['commons']['commons']
+    elif 'commons' in config.usernames['commons']:
+        reviewer = config.usernames['commons']['commons']
+    else:
+        reviewer = u''
+
+    # Should be renamed to overrideLicense or something like that
+    override = u''
+    for arg in pywikibot.handleArgs():
+        if arg.startswith('-group_id'):
+            if len(arg) == 9:
+                group_id = pywikibot.input(u'What is the group_id of the pool?')
+            else:
+                group_id = arg[10:]
+        elif arg.startswith('-photoset_id'):
+            if len(arg) == 12:
+                photoset_id = pywikibot.input(u'What is the photoset_id?')
+            else:
+                photoset_id = arg[13:]
+        elif arg.startswith('-user_id'):
+            if len(arg) == 8:
+                user_id = pywikibot.input(
+                    u'What is the user_id of the flickr user?')
+            else:
+                user_id = arg[9:]
+        elif arg.startswith('-start_id'):
+            if len(arg) == 9:
+                start_id = pywikibot.input(
+                    u'What is the id of the photo you want to start at?')
+            else:
+                start_id = arg[10:]
+        elif arg.startswith('-end_id'):
+            if len(arg) == 7:
+                end_id = pywikibot.input(
+                    u'What is the id of the photo you want to end at?')
+            else:
+                end_id = arg[8:]
+        elif arg.startswith('-tags'):
+            if len(arg) == 5:
+                tags = pywikibot.input(
+                    u'What is the tag you want to filter out (currently only one supported)?')
+            else:
+                tags = arg[6:]
+        elif arg == '-flickrreview':
+            flickrreview = True
+        elif arg.startswith('-reviewer'):
+            if len(arg) == 9:
+                reviewer = pywikibot.input(u'Who is the reviewer?')
+            else:
+                reviewer = arg[10:]
+        elif arg.startswith('-override'):
+            if len(arg) == 9:
+                override = pywikibot.input(u'What is the override text?')
+            else:
+                override = arg[10:]
+        elif arg.startswith('-addcategory'):
+            if len(arg) == 12:
+                addCategory = pywikibot.input(
+                    u'What category do you want to add?')
+            else:
+                addCategory = arg[13:]
+        elif arg == '-removecategories':
+            removeCategories = True
+        elif arg == '-autonomous':
+            autonomous = True
+
+    if user_id or group_id or photoset_id:
+        for photo_id in getPhotos(flickr, user_id, group_id, photoset_id,
+                                  start_id, end_id, tags):
+            uploadedPhotos += processPhoto(flickr, photo_id, flickrreview,
+                                           reviewer, override, addCategory,
+                                           removeCategories, autonomous)
+            totalPhotos += 1
+    else:
+        usage()
+    pywikibot.output(u'Finished running')
+    pywikibot.output(u'Total photos: ' + str(totalPhotos))
+    pywikibot.output(u'Uploaded photos: ' + str(uploadedPhotos))
+
+if __name__ == "__main__":
+    main()
-- 
To view, visit https://gerrit.wikimedia.org/r/86625
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I0e4b71f90d4690861ea58ba3bf754c1d4f49f1c5
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Legoktm legoktm.wikipedia@gmail.com
Gerrit-Reviewer: Ladsgroup ladsgroup@gmail.com
Gerrit-Reviewer: Merlijn van Deen valhallasw@arctus.nl
Gerrit-Reviewer: jenkins-bot