jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/731247 )
Change subject: [FEAT] Add support for Wikimedia OCR engine
......................................................................
[FEAT] Add support for Wikimedia OCR engine
Change-Id: If01635cc9b74198cf24348a92fe39a3a1e49636b
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 28 insertions(+), 4 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index 0ae2b2a..55b288c 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -13,6 +13,9 @@ - https://phetools.toolforge.org/ocr.php - inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+- Wikimedia OCR +- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR + - https://ws-google-ocr.toolforge.org/ - inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js - see also: https://wikisource.org/wiki/Wikisource:Google_OCR @@ -176,6 +179,10 @@ _OCR_CMD = ('https://phetools.toolforge.org/ocr.php?' 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
+ # Wikimedia OCR utility + _WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&' + 'langs[]={lang}&image={url_image}&uselang={lang}') + # googleOCR ocr utility _GOCR_CMD = ('https://ws-google-ocr.toolforge.org/api.php?' 'image={url_image}&lang={lang}') @@ -183,8 +190,10 @@ _MULTI_PAGE_EXT = ['djvu', 'pdf']
_PHETOOLS = 'phetools' + _WMFOCR = 'wmfOCR' _GOOGLE_OCR = 'googleOCR' _OCR_CMDS = {_PHETOOLS: _OCR_CMD, + _WMFOCR: _WMFOCR_CMD, _GOOGLE_OCR: _GOCR_CMD, } _OCR_METHODS = list(_OCR_CMDS.keys()) diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index 474f5f9..4c20284 100644 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -371,6 +371,11 @@ 'year 1872,\nBy D. APPLETON & CO.,\nIn the ' 'Office of the Librarian of Congress, at ' 'Washington.\n\u000c'), + 'wmfOCR': (False, 'Estee, according to Act of Congress, in the ' + 'year 1872,\n' + 'By D. APPLETON & CO.,\n' + 'In the Office of the Librarian of Congress, ' + 'at Washington.'), 'googleOCR': (False, 'ENTERED, according to Act of Congress, in ' 'the year 1572,\nBY D. APPLETON & CO.\n' 'In the Office of the Librarian of ' @@ -409,6 +414,16 @@ s = difflib.SequenceMatcher(None, text, ref_text) self.assertGreater(s.ratio(), 0.9)
+ def test_do_ocr_wmfocr(self): + """Test page._do_ocr(ocr_tool='wmfOCR').""" + error, text = self.page._do_ocr(ocr_tool='wmfOCR') + if error: + self.skipTest(text) + ref_error, ref_text = self.data['wmfOCR'] + self.assertEqual(error, ref_error) + s = difflib.SequenceMatcher(None, text, ref_text) + self.assertGreater(s.ratio(), 0.9) + def test_do_ocr_googleocr(self): """Test page._do_ocr(ocr_tool='googleOCR').""" error, text = self.page._do_ocr(ocr_tool='googleOCR') @@ -419,14 +434,14 @@ s = difflib.SequenceMatcher(None, text, ref_text) self.assertGreater(s.ratio(), 0.9)
- def test_ocr_googleocr(self): - """Test page.ocr(ocr_tool='googleOCR').""" + def test_ocr_wmfocr(self): + """Test page.ocr(ocr_tool='wmfOCR').""" try: - text = self.page.ocr(ocr_tool='googleOCR') + text = self.page.ocr(ocr_tool='wmfOCR') except Exception as exc: self.assertIsInstance(exc, ValueError) else: - ref_error, ref_text = self.data['googleOCR'] + ref_error, ref_text = self.data['wmfOCR'] s = difflib.SequenceMatcher(None, text, ref_text) self.assertGreater(s.ratio(), 0.9)
pywikibot-commits@lists.wikimedia.org