http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10717
Revision: 10717 Author: valhallasw Date: 2012-11-13 21:35:06 +0000 (Tue, 13 Nov 2012) Log Message: ----------- data_ingestion.py now for both rewrite /and/ trunk!
Modified Paths: -------------- branches/rewrite/pywikibot/site.py trunk/pywikipedia/data_ingestion.py trunk/pywikipedia/tests/test_data_ingestion.py
Added Paths: ----------- branches/rewrite/scripts/data_ingestion.py branches/rewrite/scripts/tests/ branches/rewrite/scripts/tests/data/ branches/rewrite/scripts/tests/data/MP_sounds.png branches/rewrite/scripts/tests/data/csv_ingestion.csv branches/rewrite/scripts/tests/test_data_ingestion.py branches/rewrite/scripts/tests/test_utils.py trunk/pywikipedia/tests/data/csv_ingestion.csv
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2012-11-13 21:25:31 UTC (rev 10716) +++ branches/rewrite/pywikibot/site.py 2012-11-13 21:35:06 UTC (rev 10717) @@ -2830,7 +2830,7 @@ return self.logevents(logtype="upload", total=number, start=lestart, end=leend, user=leuser, page=letitle)
- def getImagesFromAnHash(self, hash_found=None): + def getFilesFromAnHash(self, hash_found=None): """Return all images that have the same hash.
Useful to find duplicates or nowcommons. @@ -2846,6 +2846,11 @@ return [image.title(withNamespace=False) for image in self.allimages(sha1=hash_found)]
@deprecated('Site().getFilesFromAnHash')
def getImagesFromAnHash(self, hash_found=None):
    """Deprecated alias of getFilesFromAnHash().

    Kept so that callers using the old method name keep working.

    @param hash_found: hash to look up; forwarded unchanged.
    @return: whatever getFilesFromAnHash() returns for hash_found.
    """
    # Call through the bound method only. Passing ``self`` explicitly as
    # well would supply three positional arguments to a two-parameter
    # method and raise TypeError on every call.
    return self.getFilesFromAnHash(hash_found)


# -- unchanged diff context; the definition continues outside this hunk --
# def upload(self, imagepage, source_filename=None, source_url=None,
#            comment=None, watch=False, ignore_warnings=False):
#     """Upload a file to the wiki.
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
A generic bot to do data ingestion (batch uploading) to Commons

'''
# File: branches/rewrite/scripts/data_ingestion.py (added in r10717).
import base64
import contextlib
import csv
import hashlib
import posixpath
import StringIO
import urllib
import urlparse

import pywikibot
# DataIngestionBot._doUpload uses upload.UploadRobot; without this import
# the first upload attempt fails with a NameError.
import upload


def _default_site():
    '''
    Return the Commons site object used when the caller passes no site.

    Resolved on demand rather than in a default argument, so that merely
    importing this module does not call pywikibot.getSite().
    '''
    return pywikibot.getSite(u'commons', u'commons')


class Photo(object):
    '''
    Represents a Photo (or other file), with metadata, to upload to Commons.

    The constructor takes two parameters: URL (string) and metadata (dict
    with str:str key:value pairs) that can be referred to from the title &
    template generation.

    The metadata dict is stored by reference and extended in place with
    three derived keys:
      _url      -- the URL itself
      _filename -- last path component of the URL
      _ext      -- text after the last "." of _filename, or None when the
                   filename contains no "."
    Keys starting with "_" are available to getTitle() but are left out of
    the template produced by getDescription().
    '''
    def __init__(self, URL, metadata):
        self.URL = URL
        self.metadata = metadata
        self.metadata["_url"] = URL

        filename = posixpath.basename(urlparse.urlparse(URL)[2])
        self.metadata["_filename"] = filename

        # rpartition yields an empty separator when there is no "." at all,
        # which is the "no extension" case.
        _, dot, ext = filename.rpartition(".")
        self.metadata["_ext"] = ext if dot else None

        # Filled lazily by downloadPhoto().
        self.contents = None

    def downloadPhoto(self):
        '''
        Download the photo and store it in a StringIO.StringIO object.

        The download happens only once; later calls return the same
        buffer. Errors raised by urllib.urlopen() are not caught here.

        TODO: Add exception handling
        '''
        if self.contents is None:
            # closing() releases the connection even if read() fails.
            with contextlib.closing(urllib.urlopen(self.URL)) as response:
                self.contents = StringIO.StringIO(response.read())
        return self.contents

    def findDuplicateImages(self, site=None):
        '''
        Takes the photo, calculates the SHA1 hash and asks the mediawiki api
        for a list of duplicates.

        site defaults to Commons when omitted or None. Returns whatever
        site.getFilesFromAnHash() returns for the hash.

        TODO: Add exception handling
        '''
        if site is None:
            site = _default_site()
        sha1 = hashlib.sha1(self.downloadPhoto().getvalue())
        # The hash is passed as upper-case hexadecimal (base16).
        return site.getFilesFromAnHash(base64.b16encode(sha1.digest()))

    def getTitle(self, fmt):
        """
        Given a format string with %(name)s entries, returns the string
        formatted with metadata
        """
        return fmt % self.metadata

    def getDescription(self, template, extraparams=None):
        '''
        Generate a description for a file

        The result is a call of the given template with one "|key=value"
        line per parameter, sorted by key. Parameters are the metadata
        overridden by extraparams; keys starting with "_" are skipped.
        '''
        params = dict(self.metadata)
        params.update(extraparams or {})

        lines = [u'{{%s' % template]
        for key in sorted(params):
            if key.startswith("_"):
                continue
            lines.append(u'|%s=%s'
                         % (key, self._safeTemplateValue(params[key])))
        lines.append(u'}}')
        return u'\n'.join(lines)

    def _safeTemplateValue(self, value):
        '''
        Escape a value so a literal "|" cannot end the template parameter.

        None (which csv.DictReader uses for missing trailing columns) is
        rendered as an empty value instead of raising AttributeError.
        '''
        if value is None:
            return u''
        return value.replace("|", "{{!}}")


def CSVReader(fileobj, urlcolumn, *args, **kwargs):
    '''
    Yield one Photo per row of a CSV file.

    fileobj is handed to csv.DictReader together with any extra
    positional/keyword arguments; urlcolumn names the column holding the
    file URL. Each row dict becomes the Photo's metadata.
    '''
    reader = csv.DictReader(fileobj, *args, **kwargs)

    for line in reader:
        yield Photo(line[urlcolumn], line)


class DataIngestionBot(object):
    '''
    Upload every Photo produced by a reader to a wiki.

    reader   -- iterator yielding Photo objects (e.g. CSVReader(...))
    titlefmt -- %-format string for the target file name (Photo.getTitle)
    pagefmt  -- template name for the description (Photo.getDescription)
    site     -- target site; Commons when omitted or None
    '''
    def __init__(self, reader, titlefmt, pagefmt, site=None):
        self.reader = reader
        self.titlefmt = titlefmt
        self.pagefmt = pagefmt
        self.site = _default_site() if site is None else site

    def _doUpload(self, photo):
        '''
        Upload one photo unless the site already has a file with that hash.

        Returns the first existing duplicate when there is one, otherwise
        the title the photo was uploaded under.
        '''
        duplicates = photo.findDuplicateImages(self.site)
        if duplicates:
            pywikibot.output(u"Skipping duplicate of %r" % (duplicates, ))
            return duplicates[0]

        title = photo.getTitle(self.titlefmt)
        description = photo.getDescription(self.pagefmt)

        bot = upload.UploadRobot(url=photo.URL,
                                 description=description,
                                 useFilename=title,
                                 keepFilename=True,
                                 verifyDescription=False,
                                 targetSite=self.site)
        # Hand over the bytes fetched for the duplicate check and mark them
        # as retrieved; presumably this keeps UploadRobot from downloading
        # the URL a second time -- confirm against upload.py.
        bot._contents = photo.downloadPhoto().getvalue()
        bot._retrieved = True
        bot.run()

        return title

    def doSingle(self):
        '''Process only the next photo from the reader; see _doUpload().'''
        return self._doUpload(next(self.reader))

    def run(self):
        '''Process every remaining photo from the reader.'''
        for photo in self.reader:
            self._doUpload(photo)


if __name__ == "__main__":
    # Python 2's csv module expects files opened in binary mode.
    with open('tests/data/csv_ingestion.csv', 'rb') as csvfile:
        reader = CSVReader(csvfile, 'url')
        bot = DataIngestionBot(reader, "%(name)s - %(set)s.%(_ext)s",
                               ":user:valhallasw/test_template",
                               pywikibot.getSite('test', 'test'))
        bot.run()

# NOTE: an earlier, configuration-page-driven draft of DataIngestionBot used
# to be kept below inside a module-level string literal. It was never
# executed and has been removed from this listing.
Added: branches/rewrite/scripts/tests/data/MP_sounds.png =================================================================== (Binary files differ)
Property changes on: branches/rewrite/scripts/tests/data/MP_sounds.png ___________________________________________________________________ Added: svn:mime-type + application/octet-stream
Added: branches/rewrite/scripts/tests/data/csv_ingestion.csv =================================================================== --- branches/rewrite/scripts/tests/data/csv_ingestion.csv (rev 0) +++ branches/rewrite/scripts/tests/data/csv_ingestion.csv 2012-11-13 21:35:06 UTC (rev 10717) @@ -0,0 +1,2 @@ +description.en,source,author,license,set,name,url +"""Sounds"" icon",http://commons.wikimedia.org/wiki/File:Sound-icon.svg,KDE artists | Silstor,LGPL,Crystal SVG icon set,Sound icon,http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png
Property changes on: branches/rewrite/scripts/tests/data/csv_ingestion.csv ___________________________________________________________________ Added: svn:executable + *
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""Unit tests for data_ingestion.py"""
# File: branches/rewrite/scripts/tests/test_data_ingestion.py (added in r10717).
__version__ = '$Id: test_userlib.py 9043 2011-03-13 10:25:08Z xqt $'

import os
import unittest
# Imported for its side effect: it extends sys.path so that the
# data_ingestion import below can be resolved.
import test_utils

import pywikibot

import data_ingestion

# Fixture directory next to this file, independent of the working directory.
DATA_DIR = os.path.join(os.path.split(__file__)[0], 'data')


class TestPhoto(unittest.TestCase):
    """Tests for data_ingestion.Photo built from explicit metadata.

    test_downloadPhoto and test_findDuplicateImages fetch the photo URL,
    so they need network access.
    """

    def setUp(self):
        self.obj = data_ingestion.Photo(
            URL='http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png',
            metadata={'description.en': '"Sounds" icon',
                      'source': 'http://commons.wikimedia.org/wiki/File:Sound-icon.svg',
                      'author': 'KDE artists | Silstor',
                      'license': 'LGPL',
                      'set': 'Crystal SVG icon set',
                      'name': 'Sound icon'}
        )

    def test_downloadPhoto(self):
        # The fixture is a PNG: read it as bytes and close the handle.
        with open(os.path.join(DATA_DIR, 'MP_sounds.png'), 'rb') as f:
            expected = f.read()
        # getvalue() returns the whole buffer regardless of read position.
        self.assertEqual(expected, self.obj.downloadPhoto().getvalue())

    def test_findDuplicateImages(self):
        duplicates = self.obj.findDuplicateImages()
        # Titles may come back with either "_" or " "; normalise first.
        self.assertIn('MP sounds.png',
                      [dup.replace("_", " ") for dup in duplicates])

    def test_getTitle(self):
        self.assertEqual(self.obj.getTitle("%(name)s - %(set)s.%(_ext)s"),
                         "Sound icon - Crystal SVG icon set.png")

    def test_getDescription(self):
        self.assertEqual(self.obj.getDescription('CrystalTemplate'),
"""{{CrystalTemplate
|author=KDE artists {{!}} Silstor
|description.en="Sounds" icon
|license=LGPL
|name=Sound icon
|set=Crystal SVG icon set
|source=http://commons.wikimedia.org/wiki/File:Sound-icon.svg
}}""")


class TestCSVReader(unittest.TestCase):
    """Tests for the first Photo yielded by data_ingestion.CSVReader."""

    def setUp(self):
        # Python 2's csv module expects files opened in binary mode.
        fileobj = open(os.path.join(DATA_DIR, 'csv_ingestion.csv'), 'rb')
        # CSVReader reads lazily, so the file must stay open for the whole
        # test; close it afterwards instead of leaking the handle.
        self.addCleanup(fileobj.close)
        self.iterator = data_ingestion.CSVReader(fileobj, 'url')
        self.obj = next(self.iterator)

    def test_PhotoURL(self):
        self.assertEqual(self.obj.URL,
                         'http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png')

    def test_getTitle(self):
        self.assertEqual(self.obj.getTitle("%(name)s - %(set)s.%(_ext)s"),
                         "Sound icon - Crystal SVG icon set.png")

    def test_getDescription(self):
        # Unlike TestPhoto, the CSV row carries a "url" column, so the
        # template gains a |url= parameter.
        self.assertEqual(self.obj.getDescription('CrystalTemplate'),
"""{{CrystalTemplate
|author=KDE artists {{!}} Silstor
|description.en="Sounds" icon
|license=LGPL
|name=Sound icon
|set=Crystal SVG icon set
|source=http://commons.wikimedia.org/wiki/File:Sound-icon.svg
|url=http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png
}}""")


if __name__ == "__main__":
    unittest.main()
"""
Support module for PyWikipediaBot regression tests.
"""
# File: branches/rewrite/scripts/tests/test_utils.py (added in r10717).
__version__ = '$Id: test_utils.py 9197 2011-04-25 08:57:30Z xqt $'

import sys

# Prepend the current directory, then its parent, to the module search
# path in a single step; the resulting order is '.', '..', <previous path>.
sys.path[:0] = ['.', '..']

# Importing this module is only meant to adjust sys.path; do not leave the
# ``sys`` name behind in its namespace.
del sys
Modified: trunk/pywikipedia/data_ingestion.py =================================================================== --- trunk/pywikipedia/data_ingestion.py 2012-11-13 21:25:31 UTC (rev 10716) +++ trunk/pywikipedia/data_ingestion.py 2012-11-13 21:35:06 UTC (rev 10717) @@ -4,13 +4,11 @@ A generic bot to do data ingestion (batch uploading) to Commons
''' -import sys, os.path, glob, re, hashlib, base64, StringIO -import xml.etree.ElementTree -import wikipedia as pywikibot -import config, query, upload -import csv, urllib -import pagegenerators -import urlparse, posixpath +import pywikibot +import posixpath, urlparse +import urllib +import hashlib, base64 +import StringIO
class Photo(object): ''' @@ -79,6 +77,7 @@ return value.replace("|", "{{!}}")
def CSVReader(fileobj, urlcolumn, *args, **kwargs): + import csv reader = csv.DictReader(fileobj, *args, **kwargs)
for line in reader:
Added: trunk/pywikipedia/tests/data/csv_ingestion.csv =================================================================== --- trunk/pywikipedia/tests/data/csv_ingestion.csv (rev 0) +++ trunk/pywikipedia/tests/data/csv_ingestion.csv 2012-11-13 21:35:06 UTC (rev 10717) @@ -0,0 +1,2 @@ +description.en,source,author,license,set,name,url +"""Sounds"" icon",http://commons.wikimedia.org/wiki/File:Sound-icon.svg,KDE artists | Silstor,LGPL,Crystal SVG icon set,Sound icon,http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png
Property changes on: trunk/pywikipedia/tests/data/csv_ingestion.csv ___________________________________________________________________ Added: svn:executable + *
Modified: trunk/pywikipedia/tests/test_data_ingestion.py =================================================================== --- trunk/pywikipedia/tests/test_data_ingestion.py 2012-11-13 21:25:31 UTC (rev 10716) +++ trunk/pywikipedia/tests/test_data_ingestion.py 2012-11-13 21:35:06 UTC (rev 10717) @@ -8,8 +8,7 @@ import unittest import test_utils
-import userlib -import wikipedia +import pywikibot
import data_ingestion
@@ -30,7 +29,7 @@
def test_findDuplicateImages(self): duplicates = self.obj.findDuplicateImages() - self.assertIn('MP_sounds.png', duplicates) + self.assertIn('MP sounds.png', [dup.replace("_", " ") for dup in duplicates])
def test_getTitle(self): self.assertEqual(self.obj.getTitle("%(name)s - %(set)s.%(_ext)s"), "Sound icon - Crystal SVG icon set.png")
pywikipedia-svn@lists.wikimedia.org