jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/757889 )
Change subject: [scripts] recover data_ingestion.py ......................................................................
[scripts] recover data_ingestion.py
NOTE: There might be an unsolved issue as this script was ported from compat: neither bot._contents nor bot._retrieved is used, and probably Photo.download_photo isn't used for the upload; the UploadRobot uses its own read_file_content() method instead of this.
Change-Id: I46cd52d389757053007257a01f6e726e1479b3c1 --- M docs/scripts/scripts.rst M scripts/README.rst R scripts/data_ingestion.py M tests/__init__.py R tests/data_ingestion_tests.py 5 files changed, 82 insertions(+), 76 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/docs/scripts/scripts.rst b/docs/scripts/scripts.rst index 1787469..ecde314 100644 --- a/docs/scripts/scripts.rst +++ b/docs/scripts/scripts.rst @@ -83,6 +83,11 @@
.. automodule:: scripts.cosmetic_changes
+data_ingestion script +---------------------- + +.. automodule:: scripts.data_ingestion + delete script -------------
diff --git a/scripts/README.rst b/scripts/README.rst index 8c417a8..2e6e489 100644 --- a/scripts/README.rst +++ b/scripts/README.rst @@ -54,6 +54,8 @@ | cosmetic_changes.py | Can do slight modifications to a wiki page source code | | | such that the code looks cleaner. | +------------------------+---------------------------------------------------------+ + | data_ingestion.py | A generic bot to do batch uploading to Commons. | + +------------------------+---------------------------------------------------------+ | delete.py | This script can be used to delete pages en masse. | +------------------------+---------------------------------------------------------+ | djvutext.py | Extracts OCR text from djvu files and uploads onto | @@ -168,8 +170,6 @@ +------------------------+---------------------------------------------------------+ | create_categories.py | Program to batch create categories. | +------------------------+---------------------------------------------------------+ - | data_ingestion.py | A generic bot to do batch uploading to Commons. | - +------------------------+---------------------------------------------------------+ | disambredir.py | Changing redirect names in disambiguation pages. | +------------------------+---------------------------------------------------------+ | editarticle.py | Edit a Wikipedia article with your favourite editor. | diff --git a/scripts/archive/data_ingestion.py b/scripts/data_ingestion.py similarity index 75% rename from scripts/archive/data_ingestion.py rename to scripts/data_ingestion.py index 2516881..40109a0 100755 --- a/scripts/archive/data_ingestion.py +++ b/scripts/data_ingestion.py @@ -93,7 +93,7 @@
""" # -# (C) Pywikibot team, 2012-2021 +# (C) Pywikibot team, 2012-2022 # # Distributed under the terms of the MIT license. # @@ -104,10 +104,13 @@ import io import os import posixpath + +from typing import Any, BinaryIO, Optional from urllib.parse import urlparse
import pywikibot from pywikibot import pagegenerators +from pywikibot.backports import Dict, List from pywikibot.comms.http import fetch from pywikibot.exceptions import NoPageError from pywikibot.specialbots import UploadRobot @@ -117,45 +120,43 @@
"""Represents a Photo (or other file), with metadata, to be uploaded."""
- def __init__(self, URL: str, metadata: dict, site=None): + def __init__(self, url: str, metadata: Dict[str, Any], + site: Optional[pywikibot.site.APISite] = None): """ Initializer.
- :param URL: URL of photo + :param url: URL of photo :param metadata: metadata about the photo that can be referred to from the title & template :param site: target site - :type site: pywikibot.site.APISite - """ - self.URL = URL + self.URL = url self.metadata = metadata - self.metadata['_url'] = URL + self.metadata['_url'] = url self.metadata['_filename'] = filename = posixpath.split( - urlparse(URL)[2])[1] - self.metadata['_ext'] = ext = filename.split('.')[-1] - if ext == filename: - self.metadata['_ext'] = None + urlparse(url)[2])[1] + ext = filename.split('.')[-1] + self.metadata['_ext'] = None if ext == filename else ext self.contents = None
if not site: - site = pywikibot.Site('commons', 'commons') + site = pywikibot.Site('commons:commons')
# default title - super().__init__(site, self.getTitle('%(_filename)s.%(_ext)s')) + super().__init__(site, self.get_title('%(_filename)s.%(_ext)s'))
- def downloadPhoto(self): + def download_photo(self) -> BinaryIO: """ Download the photo and store it in an io.BytesIO object.
TODO: Add exception handling """ if not self.contents: - imageFile = fetch(self.URL).content - self.contents = io.BytesIO(imageFile) + image_file = fetch(self.URL).content + self.contents = io.BytesIO(image_file) return self.contents
- def findDuplicateImages(self): + def find_duplicate_images(self) -> List[str]: """ Find duplicates of the photo.
@@ -164,13 +165,13 @@
TODO: Add exception handling, fix site thing """ - hashObject = hashlib.sha1() - hashObject.update(self.downloadPhoto().getvalue()) - return [page.title(with_ns=False) for page in - self.site.allimages( - sha1=base64.b16encode(hashObject.digest()))] + hash_object = hashlib.sha1() + hash_object.update(self.download_photo().getvalue()) + return [page.title(with_ns=False) + for page in self.site.allimages( + sha1=base64.b16encode(hash_object.digest()))]
- def getTitle(self, fmt: str) -> str: + def get_title(self, fmt: str) -> str: """ Populate format string with %(name)s entries using metadata.
@@ -183,7 +184,8 @@ # FIXME: normalise the title so it is usable as a MediaWiki title. return fmt % self.metadata
- def getDescription(self, template, extraparams=None): + def get_description(self, template, + extraparams: Optional[Dict[str, str]] = None) -> str: """Generate a description for a file.""" params = {} params.update(self.metadata) @@ -192,18 +194,18 @@ for key in sorted(params.keys()): value = params[key] if not key.startswith('_'): - description += ('|{}={}\n'.format( - key, self._safeTemplateValue(value))) + description += '|{}={}\n'.format( + key, self._safe_template_value(value)) description += '}}'
return description
- def _safeTemplateValue(self, value): + def _safe_template_value(self, value: str) -> str: """Replace pipe (|) with {{!}}.""" return value.replace('|', '{{!}}')
-def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs): +def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs): # noqa: N802 """Yield Photo objects for each row of a CSV file.""" reader = csv.DictReader(fileobj, *args, **kwargs) for line in reader: @@ -218,52 +220,45 @@ """ Initializer.
- :param reader: Generator of Photos to process. - :type reader: Photo page generator :param titlefmt: Title format :param pagefmt: Page format """ super().__init__(**kwargs) - self.titlefmt = titlefmt self.pagefmt = pagefmt
- def treat(self, photo): - """ - Process each page. + def treat(self, page): + """Process each page.
1. Check for existing duplicates on the wiki specified in self.site. 2. If duplicates are found, then skip uploading. 3. Download the file from photo.URL and upload the file to self.site. """ - duplicates = photo.findDuplicateImages() + duplicates = page.find_duplicate_images() if duplicates: - pywikibot.output('Skipping duplicate of {!r}' - .format(duplicates)) - return duplicates[0] + pywikibot.output('Skipping duplicate of {!r}'.format(duplicates)) + return
- title = photo.getTitle(self.titlefmt) - description = photo.getDescription(self.pagefmt) + title = page.get_title(self.titlefmt) + description = page.get_description(self.pagefmt)
- bot = UploadRobot(url=photo.URL, + bot = UploadRobot(url=page.URL, description=description, use_filename=title, keep_filename=True, verify_description=False, target_site=self.site) - bot._contents = photo.downloadPhoto().getvalue() + bot._contents = page.download_photo().getvalue() bot._retrieved = True bot.run()
- return title - @classmethod - def parseConfigurationPage(cls, configurationPage): + def parse_configuration_page(cls, configuration_page) -> Dict[str, str]: """ Parse a Page which contains the configuration.
- :param configurationPage: page with configuration - :type configurationPage: :py:obj:`pywikibot.Page` + :param configuration_page: page with configuration + :type configuration_page: :py:obj:`pywikibot.Page` """ configuration = {} # Set a bunch of defaults @@ -271,7 +266,7 @@ configuration['csvDelimiter'] = ';' configuration['csvEncoding'] = 'Windows-1252' # FIXME: Encoding hell
- templates = configurationPage.templatesWithParams() + templates = configuration_page.templatesWithParams() for (template, params) in templates: if template.title(with_ns=False) == 'Data ingestion': for param in params: @@ -295,26 +290,30 @@
:param args: command line arguments """ - # Process global args and prepare generator args parser - local_args = pywikibot.handle_args(args) + csv_dir = None + unknown = []
# This factory is responsible for processing command line arguments # that are also used by other scripts and that determine on which pages # to work on. - genFactory = pagegenerators.GeneratorFactory() - csv_dir = None + gen_factory = pagegenerators.GeneratorFactory()
+ # Process global args and prepare generator args parser + local_args = pywikibot.handle_args(args) + local_args = gen_factory.handle_args(local_args) for arg in local_args: - if arg.startswith('-csvdir:'): - csv_dir = arg[8:] + opt, _, value = arg.partition(':') + if opt == '-csvdir:': + csv_dir = value else: - genFactory.handle_arg(arg) + unknown.append(arg)
- config_generator = genFactory.getCombinedGenerator() + config_generator = gen_factory.getCombinedGenerator()
if pywikibot.bot.suggest_help( - missing_parameters=[] if csv_dir else ['-csvdir'], - missing_generator=not config_generator): + missing_parameters=None if csv_dir else ['-csvdir'], + missing_generator=not config_generator, + unknown_parameters=unknown): return
for config_page in config_generator: @@ -324,7 +323,7 @@ pywikibot.error('{} does not exist'.format(config_page)) continue
- configuration = DataIngestionBot.parseConfigurationPage(config_page) + configuration = DataIngestionBot.parse_configuration_page(config_page)
filename = os.path.join(csv_dir, configuration['csvFile']) try: diff --git a/tests/__init__.py b/tests/__init__.py index a29fdff..b54e2c2 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -149,6 +149,7 @@ 'cache', 'category_bot', 'checkimages', + 'data_ingestion', 'deletionbot', 'fixing_redirects', 'generate_family_file', diff --git a/tests/archive/data_ingestion_tests.py b/tests/data_ingestion_tests.py similarity index 82% rename from tests/archive/data_ingestion_tests.py rename to tests/data_ingestion_tests.py index 7693619..ee19246 100644 --- a/tests/archive/data_ingestion_tests.py +++ b/tests/data_ingestion_tests.py @@ -1,13 +1,14 @@ #!/usr/bin/python3 """Unit tests for data_ingestion.py script.""" # -# (C) Pywikibot team, 2012-2021 +# (C) Pywikibot team, 2012-2022 # # Distributed under the terms of the MIT license. # import unittest
from scripts import data_ingestion + from tests import join_data_path, join_images_path from tests.aspects import ScriptMainTestCase, TestCase from tests.utils import empty_sites @@ -35,7 +36,7 @@
meta_url = 'http://commons.wikimedia.org/wiki/File:Sound-icon.svg' self.obj = data_ingestion.Photo( - URL=url, + url=url, metadata={'description.en': '"Sounds" icon', 'source': meta_url, 'author': 'KDE artists | Silstor', @@ -44,25 +45,25 @@ 'name': 'Sound icon'}, site=self.get_site('commons'))
- def test_downloadPhoto(self): + def test_download_photo(self): """Test download from http://upload.wikimedia.org/.""" with open(join_images_path('MP_sounds.png'), 'rb') as f: - self.assertEqual(f.read(), self.obj.downloadPhoto().read()) + self.assertEqual(f.read(), self.obj.download_photo().read())
- def test_findDuplicateImages(self): + def test_find_duplicate_images(self): """Test finding duplicates on Wikimedia Commons.""" - duplicates = self.obj.findDuplicateImages() + duplicates = self.obj.find_duplicate_images() self.assertIn('MP sounds.png', [dup.replace('_', ' ') for dup in duplicates])
- def test_getTitle(self): + def test_get_title(self): """Test getTitle().""" - self.assertEqual(self.obj.getTitle('%(name)s - %(set)s.%(_ext)s'), + self.assertEqual(self.obj.get_title('%(name)s - %(set)s.%(_ext)s'), 'Sound icon - Crystal SVG icon set.png')
- def test_getDescription(self): + def test_get_description(self): """Test getDescription().""" - self.assertEqual(self.obj.getDescription('CrystalTemplate'), + self.assertEqual(self.obj.get_description('CrystalTemplate'), str("""{{CrystalTemplate |author=KDE artists {{!}} Silstor |description.en="Sounds" icon @@ -88,20 +89,20 @@ site=self.get_site()) self.obj = next(self.iterator)
- def test_PhotoURL(self): + def test_photo_url(self): """Test PhotoURL().""" self.assertEqual( self.obj.URL, 'http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png')
- def test_getTitle(self): + def test_get_title(self): """Test getTitle().""" - self.assertEqual(self.obj.getTitle('%(name)s - %(set)s.%(_ext)s'), + self.assertEqual(self.obj.get_title('%(name)s - %(set)s.%(_ext)s'), 'Sound icon - Crystal SVG icon set.png')
- def test_getDescription(self): + def test_get_description(self): """Test getDescription().""" - self.assertEqual(self.obj.getDescription('CrystalTemplate'), + self.assertEqual(self.obj.get_description('CrystalTemplate'), str("""{{CrystalTemplate |author=KDE artists {{!}} Silstor |description.en="Sounds" icon
pywikibot-commits@lists.wikimedia.org