jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/757889 )
Change subject: [scripts] recover data_ingestion.py
......................................................................
[scripts] recover data_ingestion.py
NOTE:
There might be an unsolved issue as this script was ported from compat:
neither bot._contents nor bot._retrieved is used and probably
Photo.download_photo isn't used for the upload and the UploadRobot
uses its own read_file_content() method instead of this.
Change-Id: I46cd52d389757053007257a01f6e726e1479b3c1
---
M docs/scripts/scripts.rst
M scripts/README.rst
R scripts/data_ingestion.py
M tests/__init__.py
R tests/data_ingestion_tests.py
5 files changed, 82 insertions(+), 76 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/docs/scripts/scripts.rst b/docs/scripts/scripts.rst
index 1787469..ecde314 100644
--- a/docs/scripts/scripts.rst
+++ b/docs/scripts/scripts.rst
@@ -83,6 +83,11 @@
.. automodule:: scripts.cosmetic_changes
+data\_ingestion script
+----------------------
+
+.. automodule:: scripts.data_ingestion
+
delete script
-------------
diff --git a/scripts/README.rst b/scripts/README.rst
index 8c417a8..2e6e489 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -54,6 +54,8 @@
| cosmetic_changes.py | Can do slight modifications to a wiki page source code |
| | such that the code looks cleaner. |
+------------------------+---------------------------------------------------------+
+ | data_ingestion.py | A generic bot to do batch uploading to Commons. |
+ +------------------------+---------------------------------------------------------+
| delete.py | This script can be used to delete pages en masse. |
+------------------------+---------------------------------------------------------+
| djvutext.py | Extracts OCR text from djvu files and uploads onto |
@@ -168,8 +170,6 @@
+------------------------+---------------------------------------------------------+
| create_categories.py | Program to batch create categories. |
+------------------------+---------------------------------------------------------+
- | data_ingestion.py | A generic bot to do batch uploading to Commons. |
- +------------------------+---------------------------------------------------------+
| disambredir.py | Changing redirect names in disambiguation pages. |
+------------------------+---------------------------------------------------------+
| editarticle.py | Edit a Wikipedia article with your favourite editor. |
diff --git a/scripts/archive/data_ingestion.py b/scripts/data_ingestion.py
similarity index 75%
rename from scripts/archive/data_ingestion.py
rename to scripts/data_ingestion.py
index 2516881..40109a0 100755
--- a/scripts/archive/data_ingestion.py
+++ b/scripts/data_ingestion.py
@@ -93,7 +93,7 @@
"""
#
-# (C) Pywikibot team, 2012-2021
+# (C) Pywikibot team, 2012-2022
#
# Distributed under the terms of the MIT license.
#
@@ -104,10 +104,13 @@
import io
import os
import posixpath
+
+from typing import Any, BinaryIO, Optional
from urllib.parse import urlparse
import pywikibot
from pywikibot import pagegenerators
+from pywikibot.backports import Dict, List
from pywikibot.comms.http import fetch
from pywikibot.exceptions import NoPageError
from pywikibot.specialbots import UploadRobot
@@ -117,45 +120,43 @@
"""Represents a Photo (or other file), with metadata, to be
uploaded."""
- def __init__(self, URL: str, metadata: dict, site=None):
+ def __init__(self, url: str, metadata: Dict[str, Any],
+ site: Optional[pywikibot.site.APISite] = None):
"""
Initializer.
- :param URL: URL of photo
+ :param url: URL of photo
:param metadata: metadata about the photo that can be referred to
from the title & template
:param site: target site
- :type site: pywikibot.site.APISite
-
"""
- self.URL = URL
+ self.URL = url
self.metadata = metadata
- self.metadata['_url'] = URL
+ self.metadata['_url'] = url
self.metadata['_filename'] = filename = posixpath.split(
- urlparse(URL)[2])[1]
- self.metadata['_ext'] = ext = filename.split('.')[-1]
- if ext == filename:
- self.metadata['_ext'] = None
+ urlparse(url)[2])[1]
+ ext = filename.split('.')[-1]
+ self.metadata['_ext'] = None if ext == filename else ext
self.contents = None
if not site:
- site = pywikibot.Site('commons', 'commons')
+ site = pywikibot.Site('commons:commons')
# default title
- super().__init__(site, self.getTitle('%(_filename)s.%(_ext)s'))
+ super().__init__(site, self.get_title('%(_filename)s.%(_ext)s'))
- def downloadPhoto(self):
+ def download_photo(self) -> BinaryIO:
"""
Download the photo and store it in an io.BytesIO object.
TODO: Add exception handling
"""
if not self.contents:
- imageFile = fetch(self.URL).content
- self.contents = io.BytesIO(imageFile)
+ image_file = fetch(self.URL).content
+ self.contents = io.BytesIO(image_file)
return self.contents
- def findDuplicateImages(self):
+ def find_duplicate_images(self) -> List[str]:
"""
Find duplicates of the photo.
@@ -164,13 +165,13 @@
TODO: Add exception handling, fix site thing
"""
- hashObject = hashlib.sha1()
- hashObject.update(self.downloadPhoto().getvalue())
- return [page.title(with_ns=False) for page in
- self.site.allimages(
- sha1=base64.b16encode(hashObject.digest()))]
+ hash_object = hashlib.sha1()
+ hash_object.update(self.download_photo().getvalue())
+ return [page.title(with_ns=False)
+ for page in self.site.allimages(
+ sha1=base64.b16encode(hash_object.digest()))]
- def getTitle(self, fmt: str) -> str:
+ def get_title(self, fmt: str) -> str:
"""
Populate format string with %(name)s entries using metadata.
@@ -183,7 +184,8 @@
# FIXME: normalise the title so it is usable as a MediaWiki title.
return fmt % self.metadata
- def getDescription(self, template, extraparams=None):
+ def get_description(self, template,
+ extraparams: Optional[Dict[str, str]] = None) -> str:
"""Generate a description for a file."""
params = {}
params.update(self.metadata)
@@ -192,18 +194,18 @@
for key in sorted(params.keys()):
value = params[key]
if not key.startswith('_'):
- description += ('|{}={}\n'.format(
- key, self._safeTemplateValue(value)))
+ description += '|{}={}\n'.format(
+ key, self._safe_template_value(value))
description += '}}'
return description
- def _safeTemplateValue(self, value):
+ def _safe_template_value(self, value: str) -> str:
"""Replace pipe (|) with {{!}}."""
return value.replace('|', '{{!}}')
-def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs):
+def CSVReader(fileobj, urlcolumn, site=None, *args, **kwargs): # noqa: N802
"""Yield Photo objects for each row of a CSV file."""
reader = csv.DictReader(fileobj, *args, **kwargs)
for line in reader:
@@ -218,52 +220,45 @@
"""
Initializer.
- :param reader: Generator of Photos to process.
- :type reader: Photo page generator
:param titlefmt: Title format
:param pagefmt: Page format
"""
super().__init__(**kwargs)
-
self.titlefmt = titlefmt
self.pagefmt = pagefmt
- def treat(self, photo):
- """
- Process each page.
+ def treat(self, page):
+ """Process each page.
1. Check for existing duplicates on the wiki specified in self.site.
2. If duplicates are found, then skip uploading.
3. Download the file from photo.URL and upload the file to self.site.
"""
- duplicates = photo.findDuplicateImages()
+ duplicates = page.find_duplicate_images()
if duplicates:
- pywikibot.output('Skipping duplicate of {!r}'
- .format(duplicates))
- return duplicates[0]
+ pywikibot.output('Skipping duplicate of {!r}'.format(duplicates))
+ return
- title = photo.getTitle(self.titlefmt)
- description = photo.getDescription(self.pagefmt)
+ title = page.get_title(self.titlefmt)
+ description = page.get_description(self.pagefmt)
- bot = UploadRobot(url=photo.URL,
+ bot = UploadRobot(url=page.URL,
description=description,
use_filename=title,
keep_filename=True,
verify_description=False,
target_site=self.site)
- bot._contents = photo.downloadPhoto().getvalue()
+ bot._contents = page.download_photo().getvalue()
bot._retrieved = True
bot.run()
- return title
-
@classmethod
- def parseConfigurationPage(cls, configurationPage):
+ def parse_configuration_page(cls, configuration_page) -> Dict[str, str]:
"""
Parse a Page which contains the configuration.
- :param configurationPage: page with configuration
- :type configurationPage: :py:obj:`pywikibot.Page`
+ :param configuration_page: page with configuration
+ :type configuration_page: :py:obj:`pywikibot.Page`
"""
configuration = {}
# Set a bunch of defaults
@@ -271,7 +266,7 @@
configuration['csvDelimiter'] = ';'
configuration['csvEncoding'] = 'Windows-1252' # FIXME: Encoding
hell
- templates = configurationPage.templatesWithParams()
+ templates = configuration_page.templatesWithParams()
for (template, params) in templates:
if template.title(with_ns=False) == 'Data ingestion':
for param in params:
@@ -295,26 +290,30 @@
:param args: command line arguments
"""
- # Process global args and prepare generator args parser
- local_args = pywikibot.handle_args(args)
+ csv_dir = None
+ unknown = []
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
- genFactory = pagegenerators.GeneratorFactory()
- csv_dir = None
+ gen_factory = pagegenerators.GeneratorFactory()
+ # Process global args and prepare generator args parser
+ local_args = pywikibot.handle_args(args)
+ local_args = gen_factory.handle_args(local_args)
for arg in local_args:
- if arg.startswith('-csvdir:'):
- csv_dir = arg[8:]
+ opt, _, value = arg.partition(':')
+ if opt == '-csvdir:':
+ csv_dir = value
else:
- genFactory.handle_arg(arg)
+ unknown.append(arg)
- config_generator = genFactory.getCombinedGenerator()
+ config_generator = gen_factory.getCombinedGenerator()
if pywikibot.bot.suggest_help(
- missing_parameters=[] if csv_dir else ['-csvdir'],
- missing_generator=not config_generator):
+ missing_parameters=None if csv_dir else ['-csvdir'],
+ missing_generator=not config_generator,
+ unknown_parameters=unknown):
return
for config_page in config_generator:
@@ -324,7 +323,7 @@
pywikibot.error('{} does not exist'.format(config_page))
continue
- configuration = DataIngestionBot.parseConfigurationPage(config_page)
+ configuration = DataIngestionBot.parse_configuration_page(config_page)
filename = os.path.join(csv_dir, configuration['csvFile'])
try:
diff --git a/tests/__init__.py b/tests/__init__.py
index a29fdff..b54e2c2 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -149,6 +149,7 @@
'cache',
'category_bot',
'checkimages',
+ 'data_ingestion',
'deletionbot',
'fixing_redirects',
'generate_family_file',
diff --git a/tests/archive/data_ingestion_tests.py b/tests/data_ingestion_tests.py
similarity index 82%
rename from tests/archive/data_ingestion_tests.py
rename to tests/data_ingestion_tests.py
index 7693619..ee19246 100644
--- a/tests/archive/data_ingestion_tests.py
+++ b/tests/data_ingestion_tests.py
@@ -1,13 +1,14 @@
#!/usr/bin/python3
"""Unit tests for data_ingestion.py script."""
#
-# (C) Pywikibot team, 2012-2021
+# (C) Pywikibot team, 2012-2022
#
# Distributed under the terms of the MIT license.
#
import unittest
from scripts import data_ingestion
+
from tests import join_data_path, join_images_path
from tests.aspects import ScriptMainTestCase, TestCase
from tests.utils import empty_sites
@@ -35,7 +36,7 @@
meta_url = 'http://commons.wikimedia.org/wiki/File:Sound-icon.svg'
self.obj = data_ingestion.Photo(
- URL=url,
+ url=url,
metadata={'description.en': '"Sounds" icon',
'source': meta_url,
'author': 'KDE artists | Silstor',
@@ -44,25 +45,25 @@
'name': 'Sound icon'},
site=self.get_site('commons'))
- def test_downloadPhoto(self):
+ def test_download_photo(self):
"""Test download from
http://upload.wikimedia.org/."""
with open(join_images_path('MP_sounds.png'), 'rb') as f:
- self.assertEqual(f.read(), self.obj.downloadPhoto().read())
+ self.assertEqual(f.read(), self.obj.download_photo().read())
- def test_findDuplicateImages(self):
+ def test_find_duplicate_images(self):
"""Test finding duplicates on Wikimedia
Commons."""
- duplicates = self.obj.findDuplicateImages()
+ duplicates = self.obj.find_duplicate_images()
self.assertIn('MP sounds.png',
[dup.replace('_', ' ') for dup in duplicates])
- def test_getTitle(self):
+ def test_get_title(self):
"""Test getTitle()."""
- self.assertEqual(self.obj.getTitle('%(name)s - %(set)s.%(_ext)s'),
+ self.assertEqual(self.obj.get_title('%(name)s - %(set)s.%(_ext)s'),
'Sound icon - Crystal SVG icon set.png')
- def test_getDescription(self):
+ def test_get_description(self):
"""Test getDescription()."""
- self.assertEqual(self.obj.getDescription('CrystalTemplate'),
+ self.assertEqual(self.obj.get_description('CrystalTemplate'),
str("""{{CrystalTemplate
|author=KDE artists {{!}} Silstor
|description.en="Sounds" icon
@@ -88,20 +89,20 @@
site=self.get_site())
self.obj = next(self.iterator)
- def test_PhotoURL(self):
+ def test_photo_url(self):
"""Test PhotoURL()."""
self.assertEqual(
self.obj.URL,
'http://upload.wikimedia.org/wikipedia/commons/f/fc/MP_sounds.png')
- def test_getTitle(self):
+ def test_get_title(self):
"""Test getTitle()."""
- self.assertEqual(self.obj.getTitle('%(name)s - %(set)s.%(_ext)s'),
+ self.assertEqual(self.obj.get_title('%(name)s - %(set)s.%(_ext)s'),
'Sound icon - Crystal SVG icon set.png')
- def test_getDescription(self):
+ def test_get_description(self):
"""Test getDescription()."""
- self.assertEqual(self.obj.getDescription('CrystalTemplate'),
+ self.assertEqual(self.obj.get_description('CrystalTemplate'),
str("""{{CrystalTemplate
|author=KDE artists {{!}} Silstor
|description.en="Sounds" icon
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/757889
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I46cd52d389757053007257a01f6e726e1479b3c1
Gerrit-Change-Number: 757889
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged