jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036688?usp=email )
Change subject: [cleanup] phetools is no longer available upstream ......................................................................
[cleanup] phetools is no longer available upstream
- remove phetools support - check whether ocr_tool is in the _OCR_CMDS dict instead of the _OCR_METHODS list - the new default of ocr_tool in ProofreadPage.ocr is wmfOCR now - Do not show any deprecation warning because phetools fails anyway; just give some hints in the documentation of the ocr method. Do not care about private methods. - Update tests; remove tests for the error condition because the test is skipped anyway in this case.
Bug: T366036 Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8 --- M pywikibot/proofreadpage.py M tests/proofreadpage_tests.py 2 files changed, 52 insertions(+), 142 deletions(-)
Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index 84d4e60..90e2a03 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -1,24 +1,13 @@ -""" -Objects used with ProofreadPage Extension. - -This module includes objects: - -* ProofreadPage(Page) -* FullHeader -* IndexPage(Page) - +"""Objects used with ProofreadPage Extension.
OCR support of page scans via: -- https://phetools.toolforge.org/hocr_cgi.py -- https://phetools.toolforge.org/ocr.php -- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
-- Wikimedia OCR -- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR +- Wikimedia OCR, see: + https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR +- https://ocr.wmcloud.org/, inspired by + https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
-- https://ocr.wmcloud.org/ -- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js -- see also: https://wikisource.org/wiki/Wikisource:Google_OCR +.. seealso:: https://wikisource.org/wiki/Wikisource:Google_OCR
""" # @@ -421,13 +410,6 @@ p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>') p_close_no_div = re.compile('</noinclude>') # V2 page format.
- # phetools ocr utility - _HOCR_CMD = ('https://phetools.toolforge.org/hocr_cgi.py?' - 'cmd=hocr&book={book}&lang={lang}&user={user}') - - _OCR_CMD = ('https://phetools.toolforge.org/ocr.php?' - 'cmd=ocr&url={url_image}&lang={lang}&user={user}') - # Wikimedia OCR utility _WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&' 'langs[]={lang}&image={url_image}&uselang={lang}') @@ -438,14 +420,10 @@
_MULTI_PAGE_EXT = ['djvu', 'pdf']
- _PHETOOLS = 'phetools' _WMFOCR = 'wmfOCR' _GOOGLE_OCR = 'googleOCR' - _OCR_CMDS = {_PHETOOLS: _OCR_CMD, - _WMFOCR: _WMFOCR_CMD, - _GOOGLE_OCR: _GOCR_CMD, - } - _OCR_METHODS = list(_OCR_CMDS.keys()) + _OCR_CMDS = {_WMFOCR: _WMFOCR_CMD, _GOOGLE_OCR: _GOCR_CMD} + _OCR_METHODS = list(_OCR_CMDS)
def __init__(self, source: PageSourceType, title: str = '') -> None: """Instantiate a ProofreadPage object. @@ -883,10 +861,12 @@
return self._url_image_ge_140()
- def _ocr_callback(self, cmd_uri: str, - parser_func: Callable[[str], str] | None = None, - ocr_tool: str | None = None - ) -> tuple[bool, str | Exception]: + def _ocr_callback( + self, + cmd_uri: str, + parser_func: Callable[[str], str] | None = None, + ocr_tool: str | None = None, + ) -> tuple[bool, str | Exception]: """OCR callback function.
:return: tuple (error, text [error description in case of error]). @@ -903,7 +883,7 @@ if not callable(parser_func): raise TypeError('Keyword parser_func must be callable.')
- if ocr_tool not in self._OCR_METHODS: + if ocr_tool not in self._OCR_CMDS: raise TypeError( f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")
@@ -931,54 +911,17 @@
data = response.json()
- if ocr_tool == self._PHETOOLS: # phetools - assert 'error' in data, f'Error from phetools: {data}' - assert data['error'] in [0, 1, 2, 3], \ - f'Error from phetools: {data}' - error, _text = bool(data['error']), data['text'] - else: # googleOCR - if 'error' in data: - error, _text = True, data['error'] - else: - error, _text = False, data['text'] + if 'error' in data: + error, _text = True, data['error'] + else: + error, _text = False, data['text']
if error: pywikibot.error(f'OCR query {cmd_uri}: {_text}') return error, _text + return error, parser_func(_text)
- def _do_hocr(self) -> tuple[bool, str | Exception]: - """Do hocr using https://phetools.toolforge.org/hocr_cgi.py?cmd=hocr. - - This is the main method for 'phetools'. - Fallback method is ocr. - - :raise ImportError: if bs4 is not installed, _bs4_soup() will raise - """ - def parse_hocr_text(txt: str) -> str: - """Parse hocr text.""" - soup = _bs4_soup(txt) # type: ignore - - res = [] - for _ocr_page in soup.find_all(class_='ocr_page'): - for area in soup.find_all(class_='ocr_carea'): - for par in area.find_all(class_='ocr_par'): - for line in par.find_all(class_='ocr_line'): - res.append(line.get_text()) - res.append('\n') - return ''.join(res) - - params = { - 'book': self.title(as_url=True, with_ns=False), - 'lang': self.site.lang, - 'user': self.site.user(), - } - cmd_uri = self._HOCR_CMD.format_map(params) - - return self._ocr_callback(cmd_uri, - parser_func=parse_hocr_text, - ocr_tool=self._PHETOOLS) - def _do_ocr(self, ocr_tool: str | None = None ) -> tuple[bool, str | Exception]: """Do ocr using specified ocr_tool method.""" @@ -990,8 +933,8 @@ return True, error_text
if ocr_tool is None: - msg = 'ocr_tool required, must be among {}' - raise TypeError(msg.format(self._OCR_METHODS)) + raise TypeError( + f'ocr_tool required, must be among {self._OCR_METHODS}')
try: cmd_fmt = self._OCR_CMDS[ocr_tool] @@ -1011,38 +954,34 @@ def ocr(self, ocr_tool: str | None = None) -> str: """Do OCR of ProofreadPage scan.
- The text returned by this function shall be assigned to self.body, - otherwise the ProofreadPage format will not be maintained. + The text returned by this function shall be assigned to + :attr:`body`, otherwise the ProofreadPage format will not be + maintained.
- It is the user's responsibility to reset quality level accordingly. + .. warning:: It is the user's responsibility to reset quality + level accordingly.
- :param ocr_tool: 'phetools', 'wmfOCR' or 'googleOCR'; - default is 'phetools' + .. versionchanged:: 9.2 + default for *ocr_tool* is `wmfOCR`. + .. versionremoved:: 9.2 + `phetools` support is not available anymore.
+ :param ocr_tool: 'wmfOCR' or 'googleOCR'; default is 'wmfOCR' :return: OCR text for the page. - :raise TypeError: wrong ocr_tool keyword arg. :raise ValueError: something went wrong with OCR process. """ if ocr_tool is None: # default value - ocr_tool = self._PHETOOLS + ocr_tool = self._WMFOCR
- if ocr_tool not in self._OCR_METHODS: + if ocr_tool not in self._OCR_CMDS: raise TypeError( - f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.") - - # if _multi_page, try _do_hocr() first and fall back to _do_ocr() - if ocr_tool == self._PHETOOLS and self._multi_page: - error, text = self._do_hocr() - if not error and isinstance(text, str): - return text - pywikibot.warning( - f'{self}: phetools hocr failed, falling back to ocr.') + f'ocr_tool must be in {self._OCR_METHODS}, not {ocr_tool!r}.')
error, text = self._do_ocr(ocr_tool=ocr_tool) - if not error and isinstance(text, str): return text + raise ValueError( f'{self}: not possible to perform OCR. {text}')
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index 0e2c525..ac576da 100755 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -449,28 +449,21 @@
family = 'wikisource' code = 'en' - cached = True
- data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10', - 'hocr': (False, 'ENTERED, according to Act of Congress, in the ' - 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office ' - 'of the Librarian of Congress, at ' - 'Washington.\n\n'), - 'ocr': (False, 'EsTEnen, according to Act of Congress, in the ' - 'year 1872,\nBy D. APPLETON & CO.,\nIn the ' - 'Office of the Librarian of Congress, at ' - 'Washington.\n\u000c'), - 'wmfOCR': (False, 'Estee, according to Act of Congress, in the ' - 'year 1872,\n' - 'By D. APPLETON & CO.,\n' - 'In the Office of the Librarian of Congress, ' - 'at Washington.'), - 'googleOCR': (False, 'ENTERED, according to Act of Congress, in ' - 'the year 1572,\nBY D. APPLETON & CO.\n' - 'In the Office of the Librarian of ' - 'Congress, at Washington.\n4 334\n'), - } + data = { + 'title': + 'Page:Popular Science Monthly Volume 1.djvu/10', + 'wmfOCR': + 'Estee, according to Act of Congress, in the year 1872,\n' + 'By D. APPLETON & CO.,\n' + 'In the Office of the Librarian of Congress, at Washington.', + 'googleOCR': + 'ENTERED, according to Act of Congress, in the year 1572,\n' + 'BY D. APPLETON & CO.\n' + 'In the Office of the Librarian of Congress, at Washington.\n' + '4 334\n', + }
def setUp(self): """Test setUp.""" @@ -484,33 +477,12 @@ with self.assertRaises(TypeError): self.page.ocr(ocr_tool='dummy')
- def test_do_hocr(self): - """Test page._do_hocr().""" - error, text = self.page._do_hocr() - if error: - self.skipTest(text) - ref_error, ref_text = self.data['hocr'] - self.assertEqual(error, ref_error) - s = difflib.SequenceMatcher(None, text, ref_text) - self.assertGreater(s.ratio(), 0.9) - - def test_do_ocr_phetools(self): - """Test page._do_ocr(ocr_tool='phetools').""" - error, text = self.page._do_ocr(ocr_tool='phetools') - ref_error, ref_text = self.data['ocr'] - if error: - self.skipTest(text) - self.assertEqual(error, ref_error) - s = difflib.SequenceMatcher(None, text, ref_text) - self.assertGreater(s.ratio(), 0.9) - def test_do_ocr_wmfocr(self): """Test page._do_ocr(ocr_tool='wmfOCR').""" error, text = self.page._do_ocr(ocr_tool='wmfOCR') if error: self.skipTest(text) - ref_error, ref_text = self.data['wmfOCR'] - self.assertEqual(error, ref_error) + ref_text = self.data['wmfOCR'] s = difflib.SequenceMatcher(None, text, ref_text) self.assertGreater(s.ratio(), 0.9)
@@ -519,8 +491,7 @@ error, text = self.page._do_ocr(ocr_tool='googleOCR') if error: self.skipTest(text) - ref_error, ref_text = self.data['googleOCR'] - self.assertEqual(error, ref_error) + ref_text = self.data['googleOCR'] s = difflib.SequenceMatcher(None, text, ref_text) self.assertGreater(s.ratio(), 0.9)
@@ -531,7 +502,7 @@ except Exception as exc: self.assertIsInstance(exc, ValueError) else: - _error, ref_text = self.data['wmfOCR'] + ref_text = self.data['wmfOCR'] s = difflib.SequenceMatcher(None, text, ref_text) self.assertGreater(s.ratio(), 0.9)
pywikibot-commits@lists.wikimedia.org