jenkins-bot has submitted this change and it was merged.
Change subject: Normalise data_ingestion script ......................................................................
Normalise data_ingestion script
Photo subclasses FilePage; DataIngestionBot subclasses Bot.
Previously commented-out parts of data_ingestion are now integrated into the script.
Bug: T70611 Bug: T75624 Change-Id: I69bf929cf92bc5cb89c801c9a6da83640595626b --- M scripts/data_ingestion.py M setup.py M tests/data_ingestion_tests.py M tests/script_tests.py M tox.ini 5 files changed, 153 insertions(+), 182 deletions(-)
Approvals: XZise: Looks good to me, approved jenkins-bot: Verified
diff --git a/scripts/data_ingestion.py b/scripts/data_ingestion.py index 72e22f5..1e44ece 100755 --- a/scripts/data_ingestion.py +++ b/scripts/data_ingestion.py @@ -1,6 +1,10 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -"""A generic bot to do data ingestion (batch uploading) to Commons.""" +""" +A generic bot to do data ingestion (batch uploading). + +usage: data_ingestion.py -csvdir:local_dir/ -page:config_page +""" # # (C) Pywikibot team, 2013 # @@ -9,16 +13,25 @@ __version__ = '$Id$' #
-import posixpath -import hashlib import base64 -import sys +import codecs +import hashlib import io +import os +import sys + +import posixpath + +if sys.version_info[0] > 2: + import csv +else: + import unicodecsv as csv
import pywikibot -# TODO: nosetests3 fails on 'import <other_script>', which is used by many -# of our scripts, but only data_ingestion is directly imported (not via pwb). -# https://github.com/nose-devs/nose/issues/839 + +from pywikibot import pagegenerators +from pywikibot.tools import deprecated, deprecated_args + from scripts import upload
if sys.version_info[0] > 2: @@ -29,20 +42,23 @@ from urllib import urlopen
-class Photo(object): +class Photo(pywikibot.FilePage):
- """ - Represents a Photo (or other file), with metadata, to upload to Commons. + """Represents a Photo (or other file), with metadata, to be uploaded."""
- The constructor takes two parameters: URL (string) and metadata (dict with - str:str key:value pairs) that can be referred to from the title & template - generation. + def __init__(self, URL, metadata, site=None): + """ + Constructor.
+ @param URL: URL of photo + @type URL: str + @param metadata: metadata about the photo that can be referred to + from the title & template + @type metadata: dict + @param site: target site + @type site: APISite
- """ - - def __init__(self, URL, metadata): - """Constructor.""" + """ self.URL = URL self.metadata = metadata self.metadata["_url"] = URL @@ -52,6 +68,13 @@ if ext == filename: self.metadata["_ext"] = ext = None self.contents = None + + if not site: + site = pywikibot.Site(u'commons', u'commons') + + # default title + super(Photo, self).__init__(site, + self.getTitle('%(_filename)s.%(_ext)s'))
def downloadPhoto(self): """ @@ -64,8 +87,8 @@ self.contents = io.BytesIO(imageFile) return self.contents
- def findDuplicateImages(self, - site=pywikibot.Site(u'commons', u'commons')): + @deprecated_args(site=None) + def findDuplicateImages(self): """ Find duplicates of the photo.
@@ -76,17 +99,23 @@ """ hashObject = hashlib.sha1() hashObject.update(self.downloadPhoto().getvalue()) - return site.getFilesFromAnHash(base64.b16encode(hashObject.digest())) + return list( + page.title(withNamespace=False) for page in + self.site.allimages(sha1=base64.b16encode(hashObject.digest())))
def getTitle(self, fmt): """ Populate format string with %(name)s entries using metadata. + + Note: this does not clean the title, so it may be unusable as + a MediaWiki page title, and cause an API exception when used.
@param fmt: format string @type fmt: unicode @return: formatted string @rtype: unicode """ + # FIXME: normalise the title so it is usable as a MediaWiki title. return fmt % self.metadata
def getDescription(self, template, extraparams={}): @@ -105,31 +134,35 @@ return description
def _safeTemplateValue(self, value): + """Replace pipe (|) with {{!}}.""" return value.replace("|", "{{!}}")
-def CSVReader(fileobj, urlcolumn, *args, **kwargs): +def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs): """CSV reader.""" - import csv reader = csv.DictReader(fileobj, *args, **kwargs) - for line in reader: - yield Photo(line[urlcolumn], line) + yield Photo(line[urlcolumn], line, site=site)
-class DataIngestionBot: +class DataIngestionBot(pywikibot.Bot):
"""Data ingestion bot."""
def __init__(self, reader, titlefmt, pagefmt, site=pywikibot.Site(u'commons', u'commons')): + """Constructor.""" + super(DataIngestionBot, self).__init__(generator=reader) self.reader = reader self.titlefmt = titlefmt self.pagefmt = pagefmt - self.site = site
- def _doUpload(self, photo): - duplicates = photo.findDuplicateImages(self.site) + if site: + self.site = site + + def treat(self, photo): + """Process each page.""" + duplicates = photo.findDuplicateImages() if duplicates: pywikibot.output(u"Skipping duplicate of %r" % duplicates) return duplicates[0] @@ -149,178 +182,98 @@
return title
+ @deprecated("treat()") def doSingle(self): - return self._doUpload(next(self.reader)) + """Process one page.""" + return self.treat(next(self.reader))
- def run(self): - for photo in self.reader: - self._doUpload(photo) - -if __name__ == "__main__": - reader = CSVReader(open('tests/data/csv_ingestion.csv'), 'url') - bot = DataIngestionBot( - reader, - "%(name)s - %(set)s.%(_ext)s", ":user:valhallasw/test_template", - pywikibot.Site('test', 'test')) - bot.run() - -''' -class DataIngestionBot: - def __init__(self, configurationPage): + @classmethod + def parseConfigurationPage(cls, configurationPage): """ + Parse a Page which contains the configuration.
+ @param configurationPage: page with configuration + @type configurationPage: L{pywikibot.Page} """ - self.site = configurationPage.site() - self.configuration = self.parseConfigurationPage(configurationPage) - - def parseConfigurationPage(self, configurationPage): - """ - Expects a pywikibot.page object "configurationPage" which contains the configuration - """ - configuration = {} + configuration = {} # Set a bunch of defaults - configuration['csvDialect']=u'excel' - configuration['csvDelimiter']=';' - configuration['csvEncoding']=u'Windows-1252' #FIXME: Encoding hell + configuration['csvDialect'] = u'excel' + configuration['csvDelimiter'] = ';' + configuration['csvEncoding'] = u'Windows-1252' # FIXME: Encoding hell
templates = configurationPage.templatesWithParams() for (template, params) in templates: - if template == u'Data ingestion': + if template.title(withNamespace=False) == u'Data ingestion': for param in params: (field, sep, value) = param.partition(u'=')
# Remove leading or trailing spaces field = field.strip() value = value.strip() + if not value: + value = None configuration[field] = value - print(configuration) + return configuration
- def downloadPhoto(self, photoUrl=''): - """ - Download the photo and store it in a io.BytesIO object. +def main(*args): + """ + Process command line arguments and invoke bot.
- TODO: Add exception handling - """ - imageFile = urlopen(photoUrl).read() - return io.BytesIO(imageFile) + If args is an empty list, sys.argv is used.
- def findDuplicateImages(self, photo=None, site=pywikibot.Site(u'commons', u'commons')): - """ - Takes the photo, calculates the SHA1 hash and asks the MediaWiki api for a list of duplicates. - - TODO: Add exception handling, fix site thing - """ - hashObject = hashlib.sha1() - hashObject.update(photo.getvalue()) - return site.getFilesFromAnHash(base64.b16encode(hashObject.digest())) - - def getTitle(self, metadata): - """ - Build a title. - Have titleFormat to indicate how the title would look. - We need to be able to strip off stuff if it's too long. configuration.get('maxTitleLength') - """ - - #FIXME: Make this configurable. - title = self.configuration.get('titleFormat') % metadata - - description = metadata.get(u'dc:title') - identifier = metadata.get(u'dc:identifier') - - if len(description) > 120: - description = description[0 : 120] - - title = u'%s - %s.jpg' % (description, identifier) - - return flickrripper.cleanUpTitle(title) - - def cleanDate(self, field): - """ - A function to do date clean up. - """ - # Empty, make it really empty - if field == u'-': - return u'' - # TODO: Circa - # TODO: Period - - return field - - def cleanEmptyField(self, field): - return field - - def procesFile(self, metadata): - # FIXME: Do some metadata enrichment - #metadata = getEuropeanaMetadata(metadata) - - fileLocation = metadata.get(self.configuration.get('sourceFileField')) - - photo = self.downloadPhoto(fileLocation) - duplicates = self.findDuplicateImages(photo) - - # We don't want to upload dupes - if duplicates: - pywikibot.output(u'Found duplicate image at %s' % duplicates.pop()) - # The file is at Commons so return True - return True - - # FIXME: Do some checking to see if the title already exists - - title = self.getTitle(metadata) - description = self.getDescription(metadata) - - - pywikibot.output(u'Preparing upload for %s.' 
% title) - pywikibot.output(description) - - bot = upload.UploadRobot(url=fileLocation, description=description, useFilename=title, keepFilename=True, verifyDescription=False, targetSite = self.site) - bot.run() - - def processCSV(self): - database = {} - - reader = csv.DictReader(open(self.configuration.get('csvFile'), "rb"), dialect=self.configuration.get('csvDialect'), delimiter=self.configuration.csvDelimiter) - # FIXME : Encoding problems https://docs.python.org/2/library/csv.html#csv-examples - for row in reader: - self.metadataCSV(row) - self.processFile(metadata) - - def run(self): - """ - Do crap - """ - if not self.configuration.get('sourceFormat'): - pywikibot.output(u'The field "sourceFormat" is not set') - return False - - if self.configuration.get('sourceFormat') == u'csv': - self.processCSV() - else: - pywikibot.output(u'%s is not a supported source format') - -def main(): - generator = None; - + @param args: command line arguments + @type args: list of unicode + """ # Process global args and prepare generator args parser - local_args = pywikibot.handleArgs() + local_args = pywikibot.handle_args(args) genFactory = pagegenerators.GeneratorFactory() + csv_dir = None
for arg in local_args: - genFactory.handleArg(arg) + if arg.startswith('-csvdir:'): + csv_dir = arg[8:] + else: + genFactory.handleArg(arg)
- generator = genFactory.getCombinedGenerator() - if not generator: - return False + config_generator = genFactory.getCombinedGenerator()
- for page in generator: - bot = DataIngestionBot(page) - bot.run() + if not config_generator or not csv_dir: + pywikibot.showHelp() + return + + for config_page in config_generator: + try: + config_page.get() + except pywikibot.NoPage: + pywikibot.error('%s does not exist' % config_page) + continue + + configuration = DataIngestionBot.parseConfigurationPage(config_page) + + filename = os.path.join(csv_dir, configuration['csvFile']) + try: + + f = codecs.open(filename, 'r', configuration['csvEncoding']) + except (IOError, OSError) as e: + pywikibot.error('%s could not be opened: %s' % (filename, e)) + continue + + try: + files = CSVReader(f, urlcolumn='url', + site=config_page.site, + dialect=configuration['csvDialect'], + delimiter=str(configuration['csvDelimiter'])) + + bot = DataIngestionBot(files, + configuration['titleFormat'], + configuration['formattingTemplate'], + site=None) + + bot.run() + finally: + f.close()
if __name__ == "__main__": - try: - main() - finally: - print("All done!") -''' + main() diff --git a/setup.py b/setup.py index 408c054..649d0b8 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,8 @@ test_deps.extend(extra_deps['rcstream'])
if sys.version_info[0] == 2: - extra_deps['wikistats-csv'] = ['unicodecsv'] + # csv is used by wikistats and script data_ingestion + extra_deps['csv'] = ['unicodecsv']
script_deps = { 'script_wui.py': ['irc', 'lunatic-python', 'crontab'], diff --git a/tests/data_ingestion_tests.py b/tests/data_ingestion_tests.py index b62a999..156343e 100644 --- a/tests/data_ingestion_tests.py +++ b/tests/data_ingestion_tests.py @@ -7,7 +7,7 @@ import os from tests import _data_dir from tests import _images_dir -from tests.aspects import unittest, TestCase +from tests.aspects import unittest, TestCase, ScriptMainTestCase from scripts import data_ingestion
@@ -33,8 +33,8 @@ 'author': 'KDE artists | Silstor', 'license': 'LGPL', 'set': 'Crystal SVG icon set', - 'name': 'Sound icon'} - ) + 'name': 'Sound icon'}, + site=self.get_site('commons'))
def test_downloadPhoto(self): """Test download from http://upload.wikimedia.org/.""" @@ -66,12 +66,14 @@
"""Test CSVReader class."""
- net = False + family = 'commons' + code = 'commons'
def setUp(self): super(TestCSVReader, self).setUp() with open(os.path.join(_data_dir, 'csv_ingestion.csv')) as fileobj: - self.iterator = data_ingestion.CSVReader(fileobj, 'url') + self.iterator = data_ingestion.CSVReader(fileobj, 'url', + site=self.get_site()) self.obj = next(self.iterator)
def test_PhotoURL(self): @@ -93,5 +95,19 @@ }}""") # noqa
+class TestDataIngestionBot(ScriptMainTestCase): + + """Test TestDataIngestionBot class.""" + + family = 'commons' + code = 'commons' + + def test_existing_file(self): + """Test uploading a file that already exists.""" + data_ingestion.main( + '-family:test', '-lang:test', '-csvdir:tests/data', + '-page:User:John_Vandenberg/data_ingestion_test_template') + + if __name__ == "__main__": unittest.main() diff --git a/tests/script_tests.py b/tests/script_tests.py index 9a9c88f..ce04236 100644 --- a/tests/script_tests.py +++ b/tests/script_tests.py @@ -28,6 +28,7 @@ 'script_wui': ['crontab', 'lua'], # Note: package 'lunatic-python' provides module 'lua'
+ 'data_ingestion': ['unicodecsv'], 'flickrripper': ['flickrapi'], 'match_images': ['PIL.ImageTk'], 'states_redirect': ['pycountry'], @@ -302,7 +303,6 @@ test_name = 'test_' + script_name + '_help' dct[test_name] = test_execution(script_name, ['-help']) if script_name in ['version', - 'data_ingestion', # bug 68611 'script_wui', # Failing on travis-ci ] + failed_dep_script_list: dct[test_name] = unittest.expectedFailure(dct[test_name]) @@ -325,7 +325,6 @@ no_args_expected_results) if script_name in ['catall', # stdout user interaction 'checkimages', # bug 68613 - 'data_ingestion', # bug 68611 'flickrripper', # Requires a flickr api key 'lonelypages', # uses exit code 1 'script_wui', # Error on any user except DrTrigonBot diff --git a/tox.ini b/tox.ini index ae60d99..ee980f4 100644 --- a/tox.ini +++ b/tox.ini @@ -68,6 +68,7 @@ scripts/clean_sandbox.py \ scripts/commonscat.py \ scripts/coordinate_import.py \ + scripts/data_ingestion.py \ scripts/delete.py \ scripts/flickrripper.py \ scripts/harvest_template.py \ @@ -115,6 +116,7 @@ deps = nose doctest-ignore-unicode + unicodecsv
[testenv:nose34] basepython = python3
pywikibot-commits@lists.wikimedia.org