jenkins-bot submitted this change.

View Change

Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
[cleanup] phetools is no longer available upstream

- remove pheetools support
- check whether ocr_tool is in _OCR_CMDS dict instead of _OCR_METHODS
list
- new default of ocr_tool in ProofreadPage.ocr is wmfOCR now
- Do not show any deprecation warning, since phetools fails anyway;
just give some hints in the documentation of the ocr method. Do not care
about private methods.
- Update tests; remove the tests for the error condition because the test
is skipped anyway in this case.

Bug: T366036
Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 52 insertions(+), 142 deletions(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 84d4e60..90e2a03 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -1,24 +1,13 @@
-"""
-Objects used with ProofreadPage Extension.
-
-This module includes objects:
-
-* ProofreadPage(Page)
-* FullHeader
-* IndexPage(Page)
-
+"""Objects used with ProofreadPage Extension.

OCR support of page scans via:
-- https://phetools.toolforge.org/hocr_cgi.py
-- https://phetools.toolforge.org/ocr.php
-- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js

-- Wikimedia OCR
-- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+- Wikimedia OCR, see:
+ https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+- https://ocr.wmcloud.org/, inspired by
+ https://wikisource.org/wiki/MediaWiki:GoogleOCR.js

-- https://ocr.wmcloud.org/
-- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
-- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
+.. seealso:: https://wikisource.org/wiki/Wikisource:Google_OCR

"""
#
@@ -421,13 +410,6 @@
p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
p_close_no_div = re.compile('</noinclude>') # V2 page format.

- # phetools ocr utility
- _HOCR_CMD = ('https://phetools.toolforge.org/hocr_cgi.py?'
- 'cmd=hocr&book={book}&lang={lang}&user={user}')
-
- _OCR_CMD = ('https://phetools.toolforge.org/ocr.php?'
- 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
-
# Wikimedia OCR utility
_WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&'
'langs[]={lang}&image={url_image}&uselang={lang}')
@@ -438,14 +420,10 @@

_MULTI_PAGE_EXT = ['djvu', 'pdf']

- _PHETOOLS = 'phetools'
_WMFOCR = 'wmfOCR'
_GOOGLE_OCR = 'googleOCR'
- _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
- _WMFOCR: _WMFOCR_CMD,
- _GOOGLE_OCR: _GOCR_CMD,
- }
- _OCR_METHODS = list(_OCR_CMDS.keys())
+ _OCR_CMDS = {_WMFOCR: _WMFOCR_CMD, _GOOGLE_OCR: _GOCR_CMD}
+ _OCR_METHODS = list(_OCR_CMDS)

def __init__(self, source: PageSourceType, title: str = '') -> None:
"""Instantiate a ProofreadPage object.
@@ -883,10 +861,12 @@

return self._url_image_ge_140()

- def _ocr_callback(self, cmd_uri: str,
- parser_func: Callable[[str], str] | None = None,
- ocr_tool: str | None = None
- ) -> tuple[bool, str | Exception]:
+ def _ocr_callback(
+ self,
+ cmd_uri: str,
+ parser_func: Callable[[str], str] | None = None,
+ ocr_tool: str | None = None,
+ ) -> tuple[bool, str | Exception]:
"""OCR callback function.

:return: tuple (error, text [error description in case of error]).
@@ -903,7 +883,7 @@
if not callable(parser_func):
raise TypeError('Keyword parser_func must be callable.')

- if ocr_tool not in self._OCR_METHODS:
+ if ocr_tool not in self._OCR_CMDS:
raise TypeError(
f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")

@@ -931,54 +911,17 @@

data = response.json()

- if ocr_tool == self._PHETOOLS: # phetools
- assert 'error' in data, f'Error from phetools: {data}'
- assert data['error'] in [0, 1, 2, 3], \
- f'Error from phetools: {data}'
- error, _text = bool(data['error']), data['text']
- else: # googleOCR
- if 'error' in data:
- error, _text = True, data['error']
- else:
- error, _text = False, data['text']
+ if 'error' in data:
+ error, _text = True, data['error']
+ else:
+ error, _text = False, data['text']

if error:
pywikibot.error(f'OCR query {cmd_uri}: {_text}')
return error, _text
+
return error, parser_func(_text)

- def _do_hocr(self) -> tuple[bool, str | Exception]:
- """Do hocr using https://phetools.toolforge.org/hocr_cgi.py?cmd=hocr.
-
- This is the main method for 'phetools'.
- Fallback method is ocr.
-
- :raise ImportError: if bs4 is not installed, _bs4_soup() will raise
- """
- def parse_hocr_text(txt: str) -> str:
- """Parse hocr text."""
- soup = _bs4_soup(txt) # type: ignore
-
- res = []
- for _ocr_page in soup.find_all(class_='ocr_page'):
- for area in soup.find_all(class_='ocr_carea'):
- for par in area.find_all(class_='ocr_par'):
- for line in par.find_all(class_='ocr_line'):
- res.append(line.get_text())
- res.append('\n')
- return ''.join(res)
-
- params = {
- 'book': self.title(as_url=True, with_ns=False),
- 'lang': self.site.lang,
- 'user': self.site.user(),
- }
- cmd_uri = self._HOCR_CMD.format_map(params)
-
- return self._ocr_callback(cmd_uri,
- parser_func=parse_hocr_text,
- ocr_tool=self._PHETOOLS)
-
def _do_ocr(self, ocr_tool: str | None = None
) -> tuple[bool, str | Exception]:
"""Do ocr using specified ocr_tool method."""
@@ -990,8 +933,8 @@
return True, error_text

if ocr_tool is None:
- msg = 'ocr_tool required, must be among {}'
- raise TypeError(msg.format(self._OCR_METHODS))
+ raise TypeError(
+ f'ocr_tool required, must be among {self._OCR_METHODS}')

try:
cmd_fmt = self._OCR_CMDS[ocr_tool]
@@ -1011,38 +954,34 @@
def ocr(self, ocr_tool: str | None = None) -> str:
"""Do OCR of ProofreadPage scan.

- The text returned by this function shall be assigned to self.body,
- otherwise the ProofreadPage format will not be maintained.
+ The text returned by this function shall be assigned to
+ :attr:`body`, otherwise the ProofreadPage format will not be
+ maintained.

- It is the user's responsibility to reset quality level accordingly.
+ .. warning:: It is the user's responsibility to reset quality
+ level accordingly.

- :param ocr_tool: 'phetools', 'wmfOCR' or 'googleOCR';
- default is 'phetools'
+ .. versionchanged:: 9.2
+ default for *ocr_tool* is `wmfOCR`.
+ .. versionremoved:: 9.2
+ `phetools` support is not available anymore.

+ :param ocr_tool: 'wmfOCR' or 'googleOCR'; default is 'wmfOCR'
:return: OCR text for the page.
-
:raise TypeError: wrong ocr_tool keyword arg.
:raise ValueError: something went wrong with OCR process.
"""
if ocr_tool is None: # default value
- ocr_tool = self._PHETOOLS
+ ocr_tool = self._WMFOCR

- if ocr_tool not in self._OCR_METHODS:
+ if ocr_tool not in self._OCR_CMDS:
raise TypeError(
- f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")
-
- # if _multi_page, try _do_hocr() first and fall back to _do_ocr()
- if ocr_tool == self._PHETOOLS and self._multi_page:
- error, text = self._do_hocr()
- if not error and isinstance(text, str):
- return text
- pywikibot.warning(
- f'{self}: phetools hocr failed, falling back to ocr.')
+ f'ocr_tool must be in {self._OCR_METHODS}, not {ocr_tool!r}.')

error, text = self._do_ocr(ocr_tool=ocr_tool)
-
if not error and isinstance(text, str):
return text
+
raise ValueError(
f'{self}: not possible to perform OCR. {text}')

diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 0e2c525..ac576da 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -449,28 +449,21 @@

family = 'wikisource'
code = 'en'
-
cached = True

- data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
- 'hocr': (False, 'ENTERED, according to Act of Congress, in the '
- 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
- 'of the Librarian of Congress, at '
- 'Washington.\n\n'),
- 'ocr': (False, 'EsTEnen, according to Act of Congress, in the '
- 'year 1872,\nBy D. APPLETON & CO.,\nIn the '
- 'Office of the Librarian of Congress, at '
- 'Washington.\n\u000c'),
- 'wmfOCR': (False, 'Estee, according to Act of Congress, in the '
- 'year 1872,\n'
- 'By D. APPLETON & CO.,\n'
- 'In the Office of the Librarian of Congress, '
- 'at Washington.'),
- 'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
- 'the year 1572,\nBY D. APPLETON & CO.\n'
- 'In the Office of the Librarian of '
- 'Congress, at Washington.\n4 334\n'),
- }
+ data = {
+ 'title':
+ 'Page:Popular Science Monthly Volume 1.djvu/10',
+ 'wmfOCR':
+ 'Estee, according to Act of Congress, in the year 1872,\n'
+ 'By D. APPLETON & CO.,\n'
+ 'In the Office of the Librarian of Congress, at Washington.',
+ 'googleOCR':
+ 'ENTERED, according to Act of Congress, in the year 1572,\n'
+ 'BY D. APPLETON & CO.\n'
+ 'In the Office of the Librarian of Congress, at Washington.\n'
+ '4 334\n',
+ }

def setUp(self):
"""Test setUp."""
@@ -484,33 +477,12 @@
with self.assertRaises(TypeError):
self.page.ocr(ocr_tool='dummy')

- def test_do_hocr(self):
- """Test page._do_hocr()."""
- error, text = self.page._do_hocr()
- if error:
- self.skipTest(text)
- ref_error, ref_text = self.data['hocr']
- self.assertEqual(error, ref_error)
- s = difflib.SequenceMatcher(None, text, ref_text)
- self.assertGreater(s.ratio(), 0.9)
-
- def test_do_ocr_phetools(self):
- """Test page._do_ocr(ocr_tool='phetools')."""
- error, text = self.page._do_ocr(ocr_tool='phetools')
- ref_error, ref_text = self.data['ocr']
- if error:
- self.skipTest(text)
- self.assertEqual(error, ref_error)
- s = difflib.SequenceMatcher(None, text, ref_text)
- self.assertGreater(s.ratio(), 0.9)
-
def test_do_ocr_wmfocr(self):
"""Test page._do_ocr(ocr_tool='wmfOCR')."""
error, text = self.page._do_ocr(ocr_tool='wmfOCR')
if error:
self.skipTest(text)
- ref_error, ref_text = self.data['wmfOCR']
- self.assertEqual(error, ref_error)
+ ref_text = self.data['wmfOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)

@@ -519,8 +491,7 @@
error, text = self.page._do_ocr(ocr_tool='googleOCR')
if error:
self.skipTest(text)
- ref_error, ref_text = self.data['googleOCR']
- self.assertEqual(error, ref_error)
+ ref_text = self.data['googleOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)

@@ -531,7 +502,7 @@
except Exception as exc:
self.assertIsInstance(exc, ValueError)
else:
- _error, ref_text = self.data['wmfOCR']
+ ref_text = self.data['wmfOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)


To view, visit change 1036688. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8
Gerrit-Change-Number: 1036688
Gerrit-PatchSet: 5
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Sohom Datta <sohomdatta1@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Mpaa <mpaa.wiki@gmail.com>
Gerrit-MessageType: merged