[Gerrit] ...core[master]: [cleanup] phetools is no longer available upstream - Pywikibot-commits

30 May 2024

jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036688?usp=email )
Change subject: [cleanup] phetools is no longer available upstream
......................................................................
[cleanup] phetools is no longer available upstream
- remove pheetools support
- check whether ocr_tool is in _OCR_CMDS dict instead of _OCR_METHODS
  list
- new default of ocr_tool in ProofreadPage.ocr is wmfOCR now
- Do not show any deprecation warning because phetools fails anyway;
  just give some hints in the documentation of the ocr method. Do not
  care about private methods.
- Update tests; remove the checks for the error condition because the
  test is skipped anyway in this case.
Bug: T366036
Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 52 insertions(+), 142 deletions(-)
Approvals:
  jenkins-bot: Verified
  Xqt: Looks good to me, approved

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 84d4e60..90e2a03 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -1,24 +1,13 @@
-"""
-Objects used with ProofreadPage Extension.
-
-This module includes objects:
-
-* ProofreadPage(Page)
-* FullHeader
-* IndexPage(Page)
-
+"""Objects used with ProofreadPage Extension.
OCR support of page scans via:
-- https://phetools.toolforge.org/hocr_cgi.py
-- https://phetools.toolforge.org/ocr.php
-- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
-- Wikimedia OCR
-- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+- Wikimedia OCR, see:
+  https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+- https://ocr.wmcloud.org/, inspired by
+  https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
-- https://ocr.wmcloud.org/
-- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
-- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
+.. seealso:: https://wikisource.org/wiki/Wikisource:Google_OCR
"""
 #
@@ -421,13 +410,6 @@
     p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
     p_close_no_div = re.compile('</noinclude>')  # V2 page format.
-    # phetools ocr utility
-    _HOCR_CMD = ('https://phetools.toolforge.org/hocr_cgi.py?'
-                 'cmd=hocr&book={book}&lang={lang}&user={user}')
-
-    _OCR_CMD = ('https://phetools.toolforge.org/ocr.php?'
-                'cmd=ocr&url={url_image}&lang={lang}&user={user}')
-
     # Wikimedia OCR utility
     _WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&'
                    'langs[]={lang}&image={url_image}&uselang={lang}')
@@ -438,14 +420,10 @@
_MULTI_PAGE_EXT = ['djvu', 'pdf']
-    _PHETOOLS = 'phetools'
     _WMFOCR = 'wmfOCR'
     _GOOGLE_OCR = 'googleOCR'
-    _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
-                 _WMFOCR: _WMFOCR_CMD,
-                 _GOOGLE_OCR: _GOCR_CMD,
-                 }
-    _OCR_METHODS = list(_OCR_CMDS.keys())
+    _OCR_CMDS = {_WMFOCR: _WMFOCR_CMD, _GOOGLE_OCR: _GOCR_CMD}
+    _OCR_METHODS = list(_OCR_CMDS)
def __init__(self, source: PageSourceType, title: str = '') -> None:
         """Instantiate a ProofreadPage object.
@@ -883,10 +861,12 @@
return self._url_image_ge_140()
-    def _ocr_callback(self, cmd_uri: str,
-                      parser_func: Callable[[str], str] | None = None,
-                      ocr_tool: str | None = None
-                      ) -> tuple[bool, str | Exception]:
+    def _ocr_callback(
+        self,
+        cmd_uri: str,
+        parser_func: Callable[[str], str] | None = None,
+        ocr_tool: str | None = None,
+    ) -> tuple[bool, str | Exception]:
         """OCR callback function.
:return: tuple (error, text [error description in case of error]).
@@ -903,7 +883,7 @@
         if not callable(parser_func):
             raise TypeError('Keyword parser_func must be callable.')
-        if ocr_tool not in self._OCR_METHODS:
+        if ocr_tool not in self._OCR_CMDS:
             raise TypeError(
                 f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")
@@ -931,54 +911,17 @@
data = response.json()
-        if ocr_tool == self._PHETOOLS:  # phetools
-            assert 'error' in data, f'Error from phetools: {data}'
-            assert data['error'] in [0, 1, 2, 3], \
-                f'Error from phetools: {data}'
-            error, _text = bool(data['error']), data['text']
-        else:  # googleOCR
-            if 'error' in data:
-                error, _text = True, data['error']
-            else:
-                error, _text = False, data['text']
+        if 'error' in data:
+            error, _text = True, data['error']
+        else:
+            error, _text = False, data['text']
if error:
             pywikibot.error(f'OCR query {cmd_uri}: {_text}')
             return error, _text
+
         return error, parser_func(_text)
-    def _do_hocr(self) -> tuple[bool, str | Exception]:
-        """Do hocr using https://phetools.toolforge.org/hocr_cgi.py?cmd=hocr.
-
-        This is the main method for 'phetools'.
-        Fallback method is ocr.
-
-        :raise ImportError: if bs4 is not installed, _bs4_soup() will raise
-        """
-        def parse_hocr_text(txt: str) -> str:
-            """Parse hocr text."""
-            soup = _bs4_soup(txt)  # type: ignore
-
-            res = []
-            for _ocr_page in soup.find_all(class_='ocr_page'):
-                for area in soup.find_all(class_='ocr_carea'):
-                    for par in area.find_all(class_='ocr_par'):
-                        for line in par.find_all(class_='ocr_line'):
-                            res.append(line.get_text())
-                        res.append('\n')
-            return ''.join(res)
-
-        params = {
-            'book': self.title(as_url=True, with_ns=False),
-            'lang': self.site.lang,
-            'user': self.site.user(),
-        }
-        cmd_uri = self._HOCR_CMD.format_map(params)
-
-        return self._ocr_callback(cmd_uri,
-                                  parser_func=parse_hocr_text,
-                                  ocr_tool=self._PHETOOLS)
-
     def _do_ocr(self, ocr_tool: str | None = None
                 ) -> tuple[bool, str | Exception]:
         """Do ocr using specified ocr_tool method."""
@@ -990,8 +933,8 @@
             return True, error_text
if ocr_tool is None:
-            msg = 'ocr_tool required, must be among {}'
-            raise TypeError(msg.format(self._OCR_METHODS))
+            raise TypeError(
+                f'ocr_tool required, must be among {self._OCR_METHODS}')
try:
             cmd_fmt = self._OCR_CMDS[ocr_tool]
@@ -1011,38 +954,34 @@
     def ocr(self, ocr_tool: str | None = None) -> str:
         """Do OCR of ProofreadPage scan.
-        The text returned by this function shall be assigned to self.body,
-        otherwise the ProofreadPage format will not be maintained.
+        The text returned by this function shall be assigned to
+        :attr:`body`, otherwise the ProofreadPage format will not be
+        maintained.
-        It is the user's responsibility to reset quality level accordingly.
+        .. warning:: It is the user's responsibility to reset quality
+           level accordingly.
-        :param ocr_tool: 'phetools', 'wmfOCR' or 'googleOCR';
-            default is 'phetools'
+        .. versionchanged:: 9.2
+           default for *ocr_tool* is `wmfOCR`.
+        .. versionremoved:: 9.2
+           `phetools` support is not available anymore.
+        :param ocr_tool: 'wmfOCR' or 'googleOCR'; default is 'wmfOCR'
         :return: OCR text for the page.
-
         :raise TypeError: wrong ocr_tool keyword arg.
         :raise ValueError: something went wrong with OCR process.
         """
         if ocr_tool is None:  # default value
-            ocr_tool = self._PHETOOLS
+            ocr_tool = self._WMFOCR
-        if ocr_tool not in self._OCR_METHODS:
+        if ocr_tool not in self._OCR_CMDS:
             raise TypeError(
-                f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")
-
-        # if _multi_page, try _do_hocr() first and fall back to _do_ocr()
-        if ocr_tool == self._PHETOOLS and self._multi_page:
-            error, text = self._do_hocr()
-            if not error and isinstance(text, str):
-                return text
-            pywikibot.warning(
-                f'{self}: phetools hocr failed, falling back to ocr.')
+                f'ocr_tool must be in {self._OCR_METHODS}, not {ocr_tool!r}.')
error, text = self._do_ocr(ocr_tool=ocr_tool)
-
         if not error and isinstance(text, str):
             return text
+
         raise ValueError(
             f'{self}: not possible to perform OCR. {text}')
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 0e2c525..ac576da 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -449,28 +449,21 @@
family = 'wikisource'
     code = 'en'
-
     cached = True
-    data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
-            'hocr': (False, 'ENTERED, according to Act of Congress, in the '
-                            'year 1872,\nBY D. APPLETON & CO.,\nIn the Ofﬁce '
-                            'of the Librarian of Congress, at '
-                            'Washington.\n\n'),
-            'ocr': (False, 'EsTEnen, according to Act of Congress, in the '
-                           'year 1872,\nBy D. APPLETON & CO.,\nIn the '
-                           'Office of the Librarian of Congress, at '
-                           'Washington.\n\u000c'),
-            'wmfOCR': (False, 'Estee, according to Act of Congress, in the '
-                              'year 1872,\n'
-                              'By D. APPLETON & CO.,\n'
-                              'In the Office of the Librarian of Congress, '
-                              'at Washington.'),
-            'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
-                                 'the year 1572,\nBY D. APPLETON & CO.\n'
-                                 'In the Office of the Librarian of '
-                                 'Congress, at Washington.\n4 334\n'),
-            }
+    data = {
+        'title':
+            'Page:Popular Science Monthly Volume 1.djvu/10',
+        'wmfOCR':
+            'Estee, according to Act of Congress, in the year 1872,\n'
+            'By D. APPLETON & CO.,\n'
+            'In the Office of the Librarian of Congress, at Washington.',
+        'googleOCR':
+            'ENTERED, according to Act of Congress, in the year 1572,\n'
+            'BY D. APPLETON & CO.\n'
+            'In the Office of the Librarian of Congress, at Washington.\n'
+            '4 334\n',
+    }
def setUp(self):
         """Test setUp."""
@@ -484,33 +477,12 @@
         with self.assertRaises(TypeError):
             self.page.ocr(ocr_tool='dummy')
-    def test_do_hocr(self):
-        """Test page._do_hocr()."""
-        error, text = self.page._do_hocr()
-        if error:
-            self.skipTest(text)
-        ref_error, ref_text = self.data['hocr']
-        self.assertEqual(error, ref_error)
-        s = difflib.SequenceMatcher(None, text, ref_text)
-        self.assertGreater(s.ratio(), 0.9)
-
-    def test_do_ocr_phetools(self):
-        """Test page._do_ocr(ocr_tool='phetools')."""
-        error, text = self.page._do_ocr(ocr_tool='phetools')
-        ref_error, ref_text = self.data['ocr']
-        if error:
-            self.skipTest(text)
-        self.assertEqual(error, ref_error)
-        s = difflib.SequenceMatcher(None, text, ref_text)
-        self.assertGreater(s.ratio(), 0.9)
-
     def test_do_ocr_wmfocr(self):
         """Test page._do_ocr(ocr_tool='wmfOCR')."""
         error, text = self.page._do_ocr(ocr_tool='wmfOCR')
         if error:
             self.skipTest(text)
-        ref_error, ref_text = self.data['wmfOCR']
-        self.assertEqual(error, ref_error)
+        ref_text = self.data['wmfOCR']
         s = difflib.SequenceMatcher(None, text, ref_text)
         self.assertGreater(s.ratio(), 0.9)
@@ -519,8 +491,7 @@
         error, text = self.page._do_ocr(ocr_tool='googleOCR')
         if error:
             self.skipTest(text)
-        ref_error, ref_text = self.data['googleOCR']
-        self.assertEqual(error, ref_error)
+        ref_text = self.data['googleOCR']
         s = difflib.SequenceMatcher(None, text, ref_text)
         self.assertGreater(s.ratio(), 0.9)
@@ -531,7 +502,7 @@
         except Exception as exc:
             self.assertIsInstance(exc, ValueError)
         else:
-            _error, ref_text = self.data['wmfOCR']
+            ref_text = self.data['wmfOCR']
             s = difflib.SequenceMatcher(None, text, ref_text)
             self.assertGreater(s.ratio(), 0.9)
-- 
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036688?usp=email
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8
Gerrit-Change-Number: 1036688
Gerrit-PatchSet: 5
Gerrit-Owner: Xqt info@gno.de
Gerrit-Reviewer: Sohom Datta sohomdatta1@gmail.com
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Mpaa mpaa.wiki@gmail.com
Gerrit-MessageType: merged