jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036688?usp=email )
Change subject: [cleanup] phetools is no longer available upstream
......................................................................
[cleanup] phetools is no longer available upstream
- remove phetools support
- check whether ocr_tool is in _OCR_CMDS dict instead of _OCR_METHODS
list
- new default of ocr_tool in ProofreadPage.ocr is wmfOCR now
- Do not show any deprecation warning because phetools fails anyway;
just give some hints in the documentation of the ocr method. Do not care
about private methods.
- Update tests; remove the tests for the error condition because the test is
skipped anyway in this case.
Bug: T366036
Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 52 insertions(+), 142 deletions(-)
Approvals:
jenkins-bot: Verified
Xqt: Looks good to me, approved
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 84d4e60..90e2a03 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -1,24 +1,13 @@
-"""
-Objects used with ProofreadPage Extension.
-
-This module includes objects:
-
-* ProofreadPage(Page)
-* FullHeader
-* IndexPage(Page)
-
+"""Objects used with ProofreadPage Extension.
OCR support of page scans via:
-- https://phetools.toolforge.org/hocr_cgi.py
-- https://phetools.toolforge.org/ocr.php
-- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
-- Wikimedia OCR
-- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+- Wikimedia OCR, see:
+ https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+- https://ocr.wmcloud.org/, inspired by
+ https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
-- https://ocr.wmcloud.org/
-- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
-- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
+.. seealso:: https://wikisource.org/wiki/Wikisource:Google_OCR
"""
#
@@ -421,13 +410,6 @@
p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
p_close_no_div = re.compile('</noinclude>') # V2 page format.
- # phetools ocr utility
- _HOCR_CMD = ('https://phetools.toolforge.org/hocr_cgi.py?'
- 'cmd=hocr&book={book}&lang={lang}&user={user}')
-
- _OCR_CMD = ('https://phetools.toolforge.org/ocr.php?'
- 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
-
# Wikimedia OCR utility
_WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&'
'langs[]={lang}&image={url_image}&uselang={lang}')
@@ -438,14 +420,10 @@
_MULTI_PAGE_EXT = ['djvu', 'pdf']
- _PHETOOLS = 'phetools'
_WMFOCR = 'wmfOCR'
_GOOGLE_OCR = 'googleOCR'
- _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
- _WMFOCR: _WMFOCR_CMD,
- _GOOGLE_OCR: _GOCR_CMD,
- }
- _OCR_METHODS = list(_OCR_CMDS.keys())
+ _OCR_CMDS = {_WMFOCR: _WMFOCR_CMD, _GOOGLE_OCR: _GOCR_CMD}
+ _OCR_METHODS = list(_OCR_CMDS)
def __init__(self, source: PageSourceType, title: str = '') -> None:
"""Instantiate a ProofreadPage object.
@@ -883,10 +861,12 @@
return self._url_image_ge_140()
- def _ocr_callback(self, cmd_uri: str,
- parser_func: Callable[[str], str] | None = None,
- ocr_tool: str | None = None
- ) -> tuple[bool, str | Exception]:
+ def _ocr_callback(
+ self,
+ cmd_uri: str,
+ parser_func: Callable[[str], str] | None = None,
+ ocr_tool: str | None = None,
+ ) -> tuple[bool, str | Exception]:
"""OCR callback function.
:return: tuple (error, text [error description in case of error]).
@@ -903,7 +883,7 @@
if not callable(parser_func):
raise TypeError('Keyword parser_func must be callable.')
- if ocr_tool not in self._OCR_METHODS:
+ if ocr_tool not in self._OCR_CMDS:
raise TypeError(
f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")
@@ -931,54 +911,17 @@
data = response.json()
- if ocr_tool == self._PHETOOLS: # phetools
- assert 'error' in data, f'Error from phetools: {data}'
- assert data['error'] in [0, 1, 2, 3], \
- f'Error from phetools: {data}'
- error, _text = bool(data['error']), data['text']
- else: # googleOCR
- if 'error' in data:
- error, _text = True, data['error']
- else:
- error, _text = False, data['text']
+ if 'error' in data:
+ error, _text = True, data['error']
+ else:
+ error, _text = False, data['text']
if error:
pywikibot.error(f'OCR query {cmd_uri}: {_text}')
return error, _text
+
return error, parser_func(_text)
- def _do_hocr(self) -> tuple[bool, str | Exception]:
- """Do hocr using https://phetools.toolforge.org/hocr_cgi.py?cmd=hocr.
-
- This is the main method for 'phetools'.
- Fallback method is ocr.
-
- :raise ImportError: if bs4 is not installed, _bs4_soup() will raise
- """
- def parse_hocr_text(txt: str) -> str:
- """Parse hocr text."""
- soup = _bs4_soup(txt) # type: ignore
-
- res = []
- for _ocr_page in soup.find_all(class_='ocr_page'):
- for area in soup.find_all(class_='ocr_carea'):
- for par in area.find_all(class_='ocr_par'):
- for line in par.find_all(class_='ocr_line'):
- res.append(line.get_text())
- res.append('\n')
- return ''.join(res)
-
- params = {
- 'book': self.title(as_url=True, with_ns=False),
- 'lang': self.site.lang,
- 'user': self.site.user(),
- }
- cmd_uri = self._HOCR_CMD.format_map(params)
-
- return self._ocr_callback(cmd_uri,
- parser_func=parse_hocr_text,
- ocr_tool=self._PHETOOLS)
-
def _do_ocr(self, ocr_tool: str | None = None
) -> tuple[bool, str | Exception]:
"""Do ocr using specified ocr_tool method."""
@@ -990,8 +933,8 @@
return True, error_text
if ocr_tool is None:
- msg = 'ocr_tool required, must be among {}'
- raise TypeError(msg.format(self._OCR_METHODS))
+ raise TypeError(
+ f'ocr_tool required, must be among {self._OCR_METHODS}')
try:
cmd_fmt = self._OCR_CMDS[ocr_tool]
@@ -1011,38 +954,34 @@
def ocr(self, ocr_tool: str | None = None) -> str:
"""Do OCR of ProofreadPage scan.
- The text returned by this function shall be assigned to self.body,
- otherwise the ProofreadPage format will not be maintained.
+ The text returned by this function shall be assigned to
+ :attr:`body`, otherwise the ProofreadPage format will not be
+ maintained.
- It is the user's responsibility to reset quality level accordingly.
+ .. warning:: It is the user's responsibility to reset quality
+ level accordingly.
- :param ocr_tool: 'phetools', 'wmfOCR' or 'googleOCR';
- default is 'phetools'
+ .. versionchanged:: 9.2
+ default for *ocr_tool* is `wmfOCR`.
+ .. versionremoved:: 9.2
+ `phetools` support is not available anymore.
+ :param ocr_tool: 'wmfOCR' or 'googleOCR'; default is 'wmfOCR'
:return: OCR text for the page.
-
:raise TypeError: wrong ocr_tool keyword arg.
:raise ValueError: something went wrong with OCR process.
"""
if ocr_tool is None: # default value
- ocr_tool = self._PHETOOLS
+ ocr_tool = self._WMFOCR
- if ocr_tool not in self._OCR_METHODS:
+ if ocr_tool not in self._OCR_CMDS:
raise TypeError(
- f"ocr_tool must be in {self._OCR_METHODS}, not '{ocr_tool}'.")
-
- # if _multi_page, try _do_hocr() first and fall back to _do_ocr()
- if ocr_tool == self._PHETOOLS and self._multi_page:
- error, text = self._do_hocr()
- if not error and isinstance(text, str):
- return text
- pywikibot.warning(
- f'{self}: phetools hocr failed, falling back to ocr.')
+ f'ocr_tool must be in {self._OCR_METHODS}, not {ocr_tool!r}.')
error, text = self._do_ocr(ocr_tool=ocr_tool)
-
if not error and isinstance(text, str):
return text
+
raise ValueError(
f'{self}: not possible to perform OCR. {text}')
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 0e2c525..ac576da 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -449,28 +449,21 @@
family = 'wikisource'
code = 'en'
-
cached = True
- data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
- 'hocr': (False, 'ENTERED, according to Act of Congress, in the '
- 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
- 'of the Librarian of Congress, at '
- 'Washington.\n\n'),
- 'ocr': (False, 'EsTEnen, according to Act of Congress, in the '
- 'year 1872,\nBy D. APPLETON & CO.,\nIn the '
- 'Office of the Librarian of Congress, at '
- 'Washington.\n\u000c'),
- 'wmfOCR': (False, 'Estee, according to Act of Congress, in the '
- 'year 1872,\n'
- 'By D. APPLETON & CO.,\n'
- 'In the Office of the Librarian of Congress, '
- 'at Washington.'),
- 'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
- 'the year 1572,\nBY D. APPLETON & CO.\n'
- 'In the Office of the Librarian of '
- 'Congress, at Washington.\n4 334\n'),
- }
+ data = {
+ 'title':
+ 'Page:Popular Science Monthly Volume 1.djvu/10',
+ 'wmfOCR':
+ 'Estee, according to Act of Congress, in the year 1872,\n'
+ 'By D. APPLETON & CO.,\n'
+ 'In the Office of the Librarian of Congress, at Washington.',
+ 'googleOCR':
+ 'ENTERED, according to Act of Congress, in the year 1572,\n'
+ 'BY D. APPLETON & CO.\n'
+ 'In the Office of the Librarian of Congress, at Washington.\n'
+ '4 334\n',
+ }
def setUp(self):
"""Test setUp."""
@@ -484,33 +477,12 @@
with self.assertRaises(TypeError):
self.page.ocr(ocr_tool='dummy')
- def test_do_hocr(self):
- """Test page._do_hocr()."""
- error, text = self.page._do_hocr()
- if error:
- self.skipTest(text)
- ref_error, ref_text = self.data['hocr']
- self.assertEqual(error, ref_error)
- s = difflib.SequenceMatcher(None, text, ref_text)
- self.assertGreater(s.ratio(), 0.9)
-
- def test_do_ocr_phetools(self):
- """Test page._do_ocr(ocr_tool='phetools')."""
- error, text = self.page._do_ocr(ocr_tool='phetools')
- ref_error, ref_text = self.data['ocr']
- if error:
- self.skipTest(text)
- self.assertEqual(error, ref_error)
- s = difflib.SequenceMatcher(None, text, ref_text)
- self.assertGreater(s.ratio(), 0.9)
-
def test_do_ocr_wmfocr(self):
"""Test page._do_ocr(ocr_tool='wmfOCR')."""
error, text = self.page._do_ocr(ocr_tool='wmfOCR')
if error:
self.skipTest(text)
- ref_error, ref_text = self.data['wmfOCR']
- self.assertEqual(error, ref_error)
+ ref_text = self.data['wmfOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)
@@ -519,8 +491,7 @@
error, text = self.page._do_ocr(ocr_tool='googleOCR')
if error:
self.skipTest(text)
- ref_error, ref_text = self.data['googleOCR']
- self.assertEqual(error, ref_error)
+ ref_text = self.data['googleOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)
@@ -531,7 +502,7 @@
except Exception as exc:
self.assertIsInstance(exc, ValueError)
else:
- _error, ref_text = self.data['wmfOCR']
+ ref_text = self.data['wmfOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036688?usp=email
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I4eecee8a034ef889857071c82d27fcb5ca5b7db8
Gerrit-Change-Number: 1036688
Gerrit-PatchSet: 5
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Sohom Datta <sohomdatta1(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036734?usp=email )
Change subject: [cleanup] deactivate wait cycle for Page.main_authors()
......................................................................
[cleanup] deactivate wait cycle for Page.main_authors()
Bug: T366100
Change-Id: I3bf4e6e28ebd4b75f53469754be1d3329b026e4f
---
M pywikibot/page/_toolforge.py
1 file changed, 11 insertions(+), 9 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page/_toolforge.py b/pywikibot/page/_toolforge.py
index 37df61b..ebdf5b2 100644
--- a/pywikibot/page/_toolforge.py
+++ b/pywikibot/page/_toolforge.py
@@ -3,7 +3,7 @@
.. versionadded:: 7.7
"""
#
-# (C) Pywikibot team, 2022-2023
+# (C) Pywikibot team, 2022-2024
#
# Distributed under the terms of the MIT license.
#
@@ -62,19 +62,20 @@
.. note:: Only implemented for main namespace pages.
.. note:: Only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
+ .. attention:: This method does not return new results due to
+ :phab:`366100`.
.. seealso::
- https://wikihistory.toolforge.org
- https://de.wikipedia.org/wiki/Wikipedia:Technik/Cloud/wikihistory
- :param onlynew: If False, use the cached values. If True,
- calculate the Counter data which can take some time; it may
- fail with TimeoutError after ``config.max_retries``. If None
- it calculates new data like for True but uses data from
- cache if new data cannot be calculated in meantime.
+ .. versionchanged:: 9.2
+ do not use any wait cycles due to :phab:`366100`.
+
+ :param onlynew: Currently meaningless
:return: Number of edits for each username
:raise NotImplementedError: unsupported site or unsupported namespace
:raise pywikibot.exceptions.NoPageError: The page does not exist
- :raise pywikibot.exceptions.TimeoutError: Maximum retries exceeded
+ :raise pywikibot.exceptions.TimeoutError: No cached results found
"""
baseurl = 'https://wikihistory.toolforge.org'
pattern = (r'><bdi>(?P<author>.+?)</bdi></a>\s'
@@ -97,10 +98,11 @@
{user: int(cnt)
for user, cnt in re.findall(pattern, r.text)})
+ break # T366100
+
delay = pywikibot.config.retry_wait * 2 ** current_retries
pywikibot.warning('WikiHistory timeout.\n'
- 'Waiting {:.1f} seconds before retrying.'
- .format(delay))
+ f'Waiting {delay:.1f} seconds before retrying.')
pywikibot.sleep(delay)
if onlynew is None and current_retries >= config.max_retries - 2:
url += '&onlynew=1'
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1036734?usp=email
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I3bf4e6e28ebd4b75f53469754be1d3329b026e4f
Gerrit-Change-Number: 1036734
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged