jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/757889 )
Change subject: [scripts] recover data_ingestion.py
......................................................................
[scripts] recover data_ingestion.py
NOTE:
There might be an unsolved issue as this script was ported from compat:
neither bot._contents nor bot._retrieved is used and probably
Photo.download_photo isn't used for the upload and the UploadRobot
uses its own read_file_content() method instead of this.
Change-Id: I46cd52d389757053007257a01f6e726e1479b3c1
---
M docs/scripts/scripts.rst
M scripts/README.rst
R scripts/data_ingestion.py
M tests/__init__.py
R tests/data_ingestion_tests.py
5 files changed, 82 insertions(+), 76 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/docs/scripts/scripts.rst b/docs/scripts/scripts.rst
index 1787469..ecde314 100644
--- a/docs/scripts/scripts.rst
+++ b/docs/scripts/scripts.rst
@@ -83,6 +83,11 @@
.. automodule:: scripts.cosmetic_changes
+data\_ingestion script
+----------------------
+
+.. automodule:: scripts.data_ingestion
+
delete script
-------------
diff --git a/scripts/README.rst b/scripts/README.rst
index 8c417a8..2e6e489 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -54,6 +54,8 @@
| cosmetic_changes.py | Can do slight modifications to a wiki page source code |
| | such that the code looks cleaner. |
+------------------------+---------------------------------------------------------+
+ | data_ingestion.py | A generic bot to do batch uploading to Commons. |
+ +------------------------+---------------------------------------------------------+
| delete.py | This script can be used to delete pages en masse. |
+------------------------+---------------------------------------------------------+
| djvutext.py | Extracts OCR text from djvu files and uploads onto |
@@ -168,8 +170,6 @@
+------------------------+---------------------------------------------------------+
| create_categories.py | Program to batch create categories. |
+------------------------+---------------------------------------------------------+
- | data_ingestion.py | A generic bot to do batch uploading to Commons. |
- +------------------------+---------------------------------------------------------+
| disambredir.py | Changing redirect names in disambiguation pages. |
+------------------------+---------------------------------------------------------+
| editarticle.py | Edit a Wikipedia article with your favourite editor. |
diff --git a/scripts/archive/data_ingestion.py b/scripts/data_ingestion.py
similarity index 75%
rename from scripts/archive/data_ingestion.py
rename to scripts/data_ingestion.py
index 2516881..40109a0 100755
--- a/scripts/archive/data_ingestion.py
+++ b/scripts/data_ingestion.py
@@ -93,7 +93,7 @@
"""
#
-# (C) Pywikibot team, 2012-2021
+# (C) Pywikibot team, 2012-2022
#
# Distributed under the terms of the MIT license.
#
@@ -104,10 +104,13 @@
import io
import os
import posixpath
+
+from typing import Any, BinaryIO, Optional
from urllib.parse import urlparse
import pywikibot
from pywikibot import pagegenerators
+from pywikibot.backports import Dict, List
from pywikibot.comms.http import fetch
from pywikibot.exceptions import NoPageError
from pywikibot.specialbots import UploadRobot
@@ -117,45 +120,43 @@
"""Represents a Photo (or other file), with metadata, to be
uploaded."""
- def __init__(self, URL: str, metadata: dict, site=None):
+ def __init__(self, url: str, metadata: Dict[str, Any],
+ site: Optional[pywikibot.site.APISite] = None):
"""
Initializer.
- :param URL: URL of photo
+ :param url: URL of photo
:param metadata: metadata about the photo that can be referred to
from the title & template
:param site: target site
- :type site: pywikibot.site.APISite
-
"""
- self.URL = URL
+ self.URL = url
self.metadata = metadata
- self.metadata['_url'] = URL
+ self.metadata['_url'] = url
self.metadata['_filename'] = filename = posixpath.split(
- urlparse(URL)[2])[1]
- self.metadata['_ext'] = ext = filename.split('.')[-1]
- if ext == filename:
- self.metadata['_ext'] = None
+ urlparse(url)[2])[1]
+ ext = filename.split('.')[-1]
+ self.metadata['_ext'] = None if ext == filename else ext
self.contents = None
if not site:
- site = pywikibot.Site('commons', 'commons')
+ site = pywikibot.Site('commons:commons')
# default title
- super().__init__(site, self.getTitle('%(_filename)s.%(_ext)s'))
+ super().__init__(site, self.get_title('%(_filename)s.%(_ext)s'))
- def downloadPhoto(self):
+ def download_photo(self) -> BinaryIO:
"""
Download the photo and store it in an io.BytesIO object.
TODO: Add exception handling
"""
if not self.contents:
- imageFile = fetch(self.URL).content
- self.contents = io.BytesIO(imageFile)
+ image_file = fetch(self.URL).content
+ self.contents = io.BytesIO(image_file)
return self.contents
- def findDuplicateImages(self):
+ def find_duplicate_images(self) -> List[str]:
"""
Find duplicates of the photo.
@@ -164,13 +165,13 @@
TODO: Add exception handling, fix site thing
"""
- hashObject = hashlib.sha1()
- hashObject.update(self.downloadPhoto().getvalue())
- return [page.title(with_ns=False) for page in
- self.site.allimages(
- sha1=base64.b16encode(hashObject.digest()))]
+ hash_object = hashlib.sha1()
+ hash_object.update(self.download_photo().getvalue())
+ return [page.title(with_ns=False)
+ for page in self.site.allimages(
+ sha1=base64.b16encode(hash_object.digest()))]
- def getTitle(self, fmt: str) -> str:
+ def get_title(self, fmt: str) -> str:
"""
Populate format string with %(name)s entries using metadata.
@@ -183,7 +184,8 @@
# FIXME: normalise the title so it is usable as a MediaWiki title.
return fmt % self.metadata
- def getDescription(self, template, extraparams=None):
+ def get_description(self, template,
+ extraparams: Optional[Dict[str, str]] = None) -> str:
"""Generate a description for a file."""
params = {}
params.update(self.metadata)
@@ -192,18 +194,18 @@
for key in sorted(params.keys()):
value = params[key]
if not key.startswith('_'):
- description += ('|{}={}\n'.format(
- key, self._safeTemplateValue(value)))
+ description += '|{}={}\n'.format(
+ key, self._safe_template_value(value))
description += '}}'
return description
- def _safeTemplateValue(self, value):
+ def _safe_template_value(self, value: str) -> str:
"""Replace pipe (|) with {{!}}."""
return value.replace('|', '{{!}}')
-def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs):
+def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs): # noqa: N802
"""Yield Photo objects for each row of a CSV file."""
reader = csv.DictReader(fileobj, *args, **kwargs)
for line in reader:
@@ -218,52 +220,45 @@
"""
Initializer.
- :param reader: Generator of Photos to process.
- :type reader: Photo page generator
:param titlefmt: Title format
:param pagefmt: Page format
"""
super().__init__(**kwargs)
-
self.titlefmt = titlefmt
self.pagefmt = pagefmt
- def treat(self, photo):
- """
- Process each page.
+ def treat(self, page):
+ """Process each page.
1. Check for existing duplicates on the wiki specified in self.site.
2. If duplicates are found, then skip uploading.
3. Download the file from photo.URL and upload the file to self.site.
"""
- duplicates = photo.findDuplicateImages()
+ duplicates = page.find_duplicate_images()
if duplicates:
- pywikibot.output('Skipping duplicate of {!r}'
- .format(duplicates))
- return duplicates[0]
+ pywikibot.output('Skipping duplicate of {!r}'.format(duplicates))
+ return
- title = photo.getTitle(self.titlefmt)
- description = photo.getDescription(self.pagefmt)
+ title = page.get_title(self.titlefmt)
+ description = page.get_description(self.pagefmt)
- bot = UploadRobot(url=photo.URL,
+ bot = UploadRobot(url=page.URL,
description=description,
use_filename=title,
keep_filename=True,
verify_description=False,
target_site=self.site)
- bot._contents = photo.downloadPhoto().getvalue()
+ bot._contents = page.download_photo().getvalue()
bot._retrieved = True
bot.run()
- return title
-
@classmethod
- def parseConfigurationPage(cls, configurationPage):
+ def parse_configuration_page(cls, configuration_page) -> Dict[str, str]:
"""
Parse a Page which contains the configuration.
- :param configurationPage: page with configuration
- :type configurationPage: :py:obj:`pywikibot.Page`
+ :param configuration_page: page with configuration
+ :type configuration_page: :py:obj:`pywikibot.Page`
"""
configuration = {}
# Set a bunch of defaults
@@ -271,7 +266,7 @@
configuration['csvDelimiter'] = ';'
configuration['csvEncoding'] = 'Windows-1252' # FIXME: Encoding
hell
- templates = configurationPage.templatesWithParams()
+ templates = configuration_page.templatesWithParams()
for (template, params) in templates:
if template.title(with_ns=False) == 'Data ingestion':
for param in params:
@@ -295,26 +290,30 @@
:param args: command line arguments
"""
- # Process global args and prepare generator args parser
- local_args = pywikibot.handle_args(args)
+ csv_dir = None
+ unknown = []
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
- genFactory = pagegenerators.GeneratorFactory()
- csv_dir = None
+ gen_factory = pagegenerators.GeneratorFactory()
+ # Process global args and prepare generator args parser
+ local_args = pywikibot.handle_args(args)
+ local_args = gen_factory.handle_args(local_args)
for arg in local_args:
- if arg.startswith('-csvdir:'):
- csv_dir = arg[8:]
+ opt, _, value = arg.partition(':')
+ if opt == '-csvdir:':
+ csv_dir = value
else:
- genFactory.handle_arg(arg)
+ unknown.append(arg)
- config_generator = genFactory.getCombinedGenerator()
+ config_generator = gen_factory.getCombinedGenerator()
if pywikibot.bot.suggest_help(
- missing_parameters=[] if csv_dir else ['-csvdir'],
- missing_generator=not config_generator):
+ missing_parameters=None if csv_dir else ['-csvdir'],
+ missing_generator=not config_generator,
+ unknown_parameters=unknown):
return
for config_page in config_generator:
@@ -324,7 +323,7 @@
pywikibot.error('{} does not exist'.format(config_page))
continue
- configuration = DataIngestionBot.parseConfigurationPage(config_page)
+ configuration = DataIngestionBot.parse_configuration_page(config_page)
filename = os.path.join(csv_dir, configuration['csvFile'])
try:
diff --git a/tests/__init__.py b/tests/__init__.py
index a29fdff..b54e2c2 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -149,6 +149,7 @@
'cache',
'category_bot',
'checkimages',
+ 'data_ingestion',
'deletionbot',
'fixing_redirects',
'generate_family_file',
diff --git a/tests/archive/data_ingestion_tests.py b/tests/data_ingestion_tests.py
similarity index 82%
rename from tests/archive/data_ingestion_tests.py
rename to tests/data_ingestion_tests.py
index 7693619..ee19246 100644
--- a/tests/archive/data_ingestion_tests.py
+++ b/tests/data_ingestion_tests.py
@@ -1,13 +1,14 @@
#!/usr/bin/python3
"""Unit tests for data_ingestion.py script."""
#
-# (C) Pywikibot team, 2012-2021
+# (C) Pywikibot team, 2012-2022
#
# Distributed under the terms of the MIT license.
#
import unittest
from scripts import data_ingestion
+
from tests import join_data_path, join_images_path
from tests.aspects import ScriptMainTestCase, TestCase
from tests.utils import empty_sites
@@ -35,7 +36,7 @@
meta_url = 'http://commons.wikimedia.org/wiki/File:Sound-icon.svg'
self.obj = data_ingestion.Photo(
- URL=url,
+ url=url,
metadata={'description.en': '"Sounds" icon',
'source': meta_url,
'author': 'KDE artists | Silstor',
@@ -44,25 +45,25 @@
'name': 'Sound icon'},
site=self.get_site('commons'))
- def test_downloadPhoto(self):
+ def test_download_photo(self):
"""Test download from
http://upload.wikimedia.org/."""
with open(join_images_path('MP_sounds.png'), 'rb') as f:
- self.assertEqual(f.read(), self.obj.downloadPhoto().read())
+ self.assertEqual(f.read(), self.obj.download_photo().read())
- def test_findDuplicateImages(self):
+ def test_find_duplicate_images(self):
"""Test finding duplicates on Wikimedia
Commons."""
- duplicates = self.obj.findDuplicateImages()
+ duplicates = self.obj.find_duplicate_images()
self.assertIn('MP sounds.png',
[dup.replace('_', ' ') for dup in duplicates])
- def test_getTitle(self):
+ def test_get_title(self):
"""Test getTitle()."""
- self.assertEqual(self.obj.getTitle('%(name)s - %(set)s.%(_ext)s'),
+ self.assertEqual(self.obj.get_title('%(name)s - %(set)s.%(_ext)s'),
'Sound icon - Crystal SVG icon set.png')
- def test_getDescription(self):
+ def test_get_description(self):
"""Test getDescription()."""
- self.assertEqual(self.obj.getDescription('CrystalTemplate'),
+ self.assertEqual(self.obj.get_description('CrystalTemplate'),
str("""{{CrystalTemplate
|author=KDE artists {{!}} Silstor
|description.en="Sounds" icon
@@ -88,20 +89,20 @@
site=self.get_site())
self.obj = next(self.iterator)
- def test_PhotoURL(self):
+ def test_photo_url(self):
"""Test PhotoURL()."""
self.assertEqual(
self.obj.URL,
'http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png')
- def test_getTitle(self):
+ def test_get_title(self):
"""Test getTitle()."""
- self.assertEqual(self.obj.getTitle('%(name)s - %(set)s.%(_ext)s'),
+ self.assertEqual(self.obj.get_title('%(name)s - %(set)s.%(_ext)s'),
'Sound icon - Crystal SVG icon set.png')
- def test_getDescription(self):
+ def test_get_description(self):
"""Test getDescription()."""
- self.assertEqual(self.obj.getDescription('CrystalTemplate'),
+ self.assertEqual(self.obj.get_description('CrystalTemplate'),
str("""{{CrystalTemplate
|author=KDE artists {{!}} Silstor
|description.en="Sounds" icon
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/757889
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I46cd52d389757053007257a01f6e726e1479b3c1
Gerrit-Change-Number: 757889
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged