http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10688
Revision: 10688
Author: valhallasw
Date: 2012-11-10 20:07:33 +0000 (Sat, 10 Nov 2012)

Log Message:
-----------
Improved data_ingestion implementation; basic implementation of a CSV-based uploader bot + added tests.
Modified Paths:
--------------
    trunk/pywikipedia/data_ingestion.py
Added Paths:
-----------
    trunk/pywikipedia/tests/data/MP_sounds.png
    trunk/pywikipedia/tests/data/MP_sounds_AUTHORS
    trunk/pywikipedia/tests/data/anefo.json
    trunk/pywikipedia/tests/test_data_ingestion.py
Modified: trunk/pywikipedia/data_ingestion.py =================================================================== --- trunk/pywikipedia/data_ingestion.py 2012-11-10 17:25:08 UTC (rev 10687) +++ trunk/pywikipedia/data_ingestion.py 2012-11-10 20:07:33 UTC (rev 10688) @@ -9,10 +9,123 @@ import wikipedia as pywikibot import config, query, upload import csv, urllib -import flickrripper import pagegenerators +import urlparse, posixpath
+class Photo(object): + ''' + Represents a Photo (or other file), with metadata, to upload to Commons. + + The constructor takes two parameters: URL (string) and metadata (dict with str:str key:value pairs) + that can be referred to from the title & template generation. + + + ''' + def __init__(self, URL, metadata): + self.URL = URL + self.metadata = metadata + self.metadata["_url"] = URL + self.metadata["_filename"] = filename = posixpath.split(urlparse.urlparse(URL)[2])[1] + self.metadata["_ext"] = ext = filename.split(".")[-1] + if ext == filename: + self.metadata["_ext"] = ext = None + self.contents = None + + def downloadPhoto(self): + ''' + Download the photo and store it in a StringIO.StringIO object. + + TODO: Add exception handling + ''' + if not self.contents: + imageFile=urllib.urlopen(self.URL).read() + self.contents = StringIO.StringIO(imageFile) + return self.contents + + def findDuplicateImages(self, site = pywikibot.getSite(u'commons', u'commons')): + ''' + Takes the photo, calculates the SHA1 hash and asks the mediawiki api for a list of duplicates. 
+ + TODO: Add exception handling, fix site thing + ''' + hashObject = hashlib.sha1() + hashObject.update(self.downloadPhoto().getvalue()) + return site.getFilesFromAnHash(base64.b16encode(hashObject.digest())) + + def getTitle(self, fmt): + """ + Given a format string with %(name)s entries, returns the string formatted with metadata + """ + return fmt % self.metadata + + def getDescription(self, template, extraparams={}): + ''' + Generate a description for a file + ''' + + params = {} + params.update(self.metadata) + params.update(extraparams) + description = u'{{%s\n' % template + for key in sorted(params.keys()): + value = params[key] + if not key.startswith("_"): + description = description + (u'|%s=%s' % (key, self._safeTemplateValue(value))) + "\n" + description = description + u'}}' + + return description + + def _safeTemplateValue(self, value): + return value.replace("|", "{{!}}") + +def CSVReader(fileobj, urlcolumn, *args, **kwargs): + reader = csv.DictReader(fileobj, *args, **kwargs) + + for line in reader: + yield Photo(line[urlcolumn], line) + class DataIngestionBot: + def __init__(self, reader, titlefmt, pagefmt, site=pywikibot.getSite(u'commons', u'commons')): + self.reader = reader + self.titlefmt = titlefmt + self.pagefmt = pagefmt + self.site = site + + def _doUpload(self, photo): + duplicates = photo.findDuplicateImages(self.site) + if duplicates: + pywikibot.output(u"Skipping duplicate of %r" % (duplicates, )) + return duplicates[0] + + title = photo.getTitle(self.titlefmt) + description = photo.getDescription(self.pagefmt) + + bot = upload.UploadRobot(url = photo.URL, + description = description, + useFilename = title, + keepFilename = True, + verifyDescription = False, + targetSite = self.site) + bot._contents = photo.downloadPhoto().getvalue() + bot._retrieved = True + bot.run() + + return title + + def doSingle(self): + return self._doUpload(self.reader.next()) + + def run(self): + for photo in self.reader: + self._doUpload(photo) + +if 
__name__=="__main__": + reader = CSVReader(open('tests/data/csv_ingestion.csv'), 'url') + bot = DataIngestionBot(reader, "%(name)s - %(set)s.%(_ext)s", ":user:valhallasw/test_template", pywikibot.getSite('test', 'test')) + bot.run() + +""" +class DataIngestionBot: def __init__(self, configurationPage): '''
@@ -63,18 +176,6 @@ hashObject.update(photo.getvalue()) return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
- def getDescription(self, metadata): - ''' - Generate a description for a file - ''' - - description = u'{{%(configurationTemplate)s}}\n' % self.configuration - for key, value in metadata.iteritems(): - description = description + u'|' + key + u'=%(' + key + u')s\n' - description = description + u'}}\n' - - return description % metadata - def getTitle(self, metadata): ''' Build a title. @@ -180,3 +281,4 @@ main(sys.argv[1:]) finally: print "All done!" +"""
Added: trunk/pywikipedia/tests/data/MP_sounds.png =================================================================== (Binary files differ)
Property changes on: trunk/pywikipedia/tests/data/MP_sounds.png ___________________________________________________________________ Added: svn:mime-type + application/octet-stream
Added: trunk/pywikipedia/tests/data/MP_sounds_AUTHORS
===================================================================
--- trunk/pywikipedia/tests/data/MP_sounds_AUTHORS (rev 0)
+++ trunk/pywikipedia/tests/data/MP_sounds_AUTHORS 2012-11-10 20:07:33 UTC (rev 10688)
@@ -0,0 +1,8 @@
+Description
+English: "Sounds" icon
+Date
+Source File:Sound-icon.svg
+Author KDE artists / Silsor
+Permission
+(Reusing this file)
+ This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. See version 2.1 and version 3 of the GNU Lesser General Public License for more details.
Added: trunk/pywikipedia/tests/data/anefo.json =================================================================== --- trunk/pywikipedia/tests/data/anefo.json (rev 0) +++ trunk/pywikipedia/tests/data/anefo.json 2012-11-10 20:07:33 UTC (rev 10688) @@ -0,0 +1 @@ +{"response":{"numFound":1,"start":0,"docs":[{"id":"ad33b886-d0b4-102d-bcf8-003048976d84","mrx_collection_id":"2f7593fe-b9a1-11df-ba8e-03c82bd9ba46","Inhoud":"Amsterdam, Prinsenhof conferentie over discriminatie en racisme; publieke belangstelling bij de zaalmicrofoon","Nummer_toegang":"2.24.01.05","hdl":"hdl://10648/ad33b886-d0b4-102d-bcf8-003048976d84","auteursrechten_auteursrechthebbende":"Nationaal Archief, CC-BY-SA","oldid":"na:col1:dat317440","Titel":"Amsterdam, Prinsenhof conferentie over discriminatie en racisme; publieke belangstelling bij de zaalmicrofoon","Inhoudsdatering":"1984-01-26T00:00:00Z","Datering":"26 januari 1984","Webwinkel":"Ja","auteursrechten_voorwaarde_CC_BY_SA":true,"auteursrechten_voorwaarde_webwinkel":true,"auteursrechten_voorwaarde_webdisplay":true,"auteursrechten_voorwaarde_bestelbaar":true,"timestamp":"2012-11-04T08:35:02.131Z","PhotoName":["NL-HaNA_2.24.01.05_0_932-8455.tjp"],"Geografisch_trefwoord_autocomplete":["Amsterdam","Noord-Holland"],"searchfield":["NL-HaNA_2.24.01.05_0_932-8455.tjp","Prinsenhof","317440","[ onbekend ]","Amsterdam, Prinsenhof conferentie over discriminatie en racisme; publieke belangstelling bij de zaalmicrofoon","932-8455","2.24.01.05","Nationaal Archief, CC-BY-SA","DISCRIMINATIE","conferenties","Amsterdam","Noord-Holland","Anefo / Croes, R.C.","[onbekend]"],"Serie_Collectie_search":["317440"],"nummer":["2b60a15f-1755-eea7-06d5-c528b09613e1","NL-HaNA_2.24.01.05_0_932-8455.tjp","932-8455"],"Vervaardiger_autocomplete":["Anefo / Croes, R.C.","[onbekend]"],"Serie_Collectie":["317440"],"Geografisch_trefwoord_search":["Amsterdam","Noord-Holland"],"searchfield_with_comment_fuzzy":["NL-HaNA_2.24.01.05_0_932-8455.tjp","Prinsenhof","317440","[ onbekend 
]","Amsterdam, Prinsenhof conferentie over discriminatie en racisme; publieke belangstelling bij de zaalmicrofoon","932-8455","2.24.01.05","Nationaal Archief, CC-BY-SA","DISCRIMINATIE","conferenties","Amsterdam","Noord-Holland","Anefo / Croes, R.C.","[onbekend]"],"searchfield_with_comment":["NL-HaNA_2.24.01.05_0_932-8455.tjp","Prinsenhof","317440","[ onbekend ]","Amsterdam, Prinsenhof conferentie over discriminatie en racisme; publieke belangstelling bij de zaalmicrofoon","932-8455","2.24.01.05","Nationaal Archief, CC-BY-SA","DISCRIMINATIE","conferenties","Amsterdam","Noord-Holland","Anefo / Croes, R.C.","[onbekend]"],"Vervaardiger_search":["Anefo / Croes, R.C.","[onbekend]"],"Materiaal_Zoekveld":["Foto"],"Reportage_Serienaam":["[ onbekend ]"],"Vervaardiger":["Anefo / Croes, R.C.","[onbekend]"],"Geografisch_trefwoord":["Amsterdam","Noord-Holland"],"Instellingsnaam":["Prinsenhof"],"searchfield_fuzzy":["Prinsenhof","317440","[ onbekend ]","Amsterdam, Prinsenhof conferentie over discriminatie en racisme; publieke belangstelling bij de zaalmicrofoon","932-8455","2.24.01.05","Nationaal Archief, CC-BY-SA","DISCRIMINATIE","conferenties","Amsterdam","Noord-Holland","Anefo / Croes, R.C.","[onbekend]"],"Materiaalsoort":["negatief, 
zwart/wit"],"Trefwoorden":["DISCRIMINATIE","conferenties"],"Bestanddeelnummer":["932-8455"],"images":{"2b60a15f-1755-eea7-06d5-c528b09613e1":[{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/800x600/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":800,"height":600},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/40x40/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":40,"height":40},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/88x88/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":88,"height":88},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/150x150/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":150,"height":150},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/160x160/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":160,"height":160},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/188x188/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":188,"height":188},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/300x300/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":300,"height":300},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/460x460/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":460,"height":460},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/500x500/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":500,"height":500},{"url":"http://afbeeldingen.gahetna.nl/naa/thumb/1280x1280/2b60a15f-1755-eea7-06d5-c528b09613e1.jpg","width":1280,"height":1280}]}}]}} \ No newline at end of file
Added: trunk/pywikipedia/tests/test_data_ingestion.py =================================================================== --- trunk/pywikipedia/tests/test_data_ingestion.py (rev 0) +++ trunk/pywikipedia/tests/test_data_ingestion.py 2012-11-10 20:07:33 UTC (rev 10688) @@ -0,0 +1,76 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +"""Unit tests for data_ingestion.py""" +__version__ = '$Id: test_userlib.py 9043 2011-03-13 10:25:08Z xqt $' + +import os +import unittest +import test_utils + +import userlib +import wikipedia + +import data_ingestion + +class TestPhoto(unittest.TestCase): + def setUp(self): + self.obj = data_ingestion.Photo(URL='http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png', + metadata={'description.en': '"Sounds" icon', + 'source': 'http://commons.wikimedia.org/wiki/File:Sound-icon.svg', + 'author': 'KDE artists | Silstor', + 'license': 'LGPL', + 'set': 'Crystal SVG icon set', + 'name': 'Sound icon'} + ) + + def test_downloadPhoto(self): + f = open(os.path.join(os.path.split(__file__)[0], 'data', 'MP_sounds.png')) + self.assertEqual(f.read(), self.obj.downloadPhoto().read()) + + def test_findDuplicateImages(self): + duplicates = self.obj.findDuplicateImages() + self.assertIn('MP_sounds.png', duplicates) + + def test_getTitle(self): + self.assertEqual(self.obj.getTitle("%(name)s - %(set)s.%(_ext)s"), "Sound icon - Crystal SVG icon set.png") + + def test_getDescription(self): + self.assertEqual(self.obj.getDescription('CrystalTemplate'), +"""{{CrystalTemplate +|author=KDE artists {{!}} Silstor +|description.en="Sounds" icon +|license=LGPL +|name=Sound icon +|set=Crystal SVG icon set +|source=http://commons.wikimedia.org/wiki/File:Sound-icon.svg +}}""") + + +class TestCSVReader(unittest.TestCase): + def setUp(self): + fileobj = open(os.path.join(os.path.split(__file__)[0], 'data', 'csv_ingestion.csv')) + self.iterator = data_ingestion.CSVReader(fileobj, 'url') + self.obj = self.iterator.next() + + def test_PhotoURL(self): + 
self.assertEqual(self.obj.URL, 'http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png') + + def test_getTitle(self): + self.assertEqual(self.obj.getTitle("%(name)s - %(set)s.%(_ext)s"), "Sound icon - Crystal SVG icon set.png") + + def test_getDescription(self): + self.assertEqual(self.obj.getDescription('CrystalTemplate'), +"""{{CrystalTemplate +|author=KDE artists {{!}} Silstor +|description.en="Sounds" icon +|license=LGPL +|name=Sound icon +|set=Crystal SVG icon set +|source=http://commons.wikimedia.org/wiki/File:Sound-icon.svg +|url=http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png +}}""") + + +if __name__ == "__main__": + unittest.main()