jenkins-bot submitted this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

[FEAT] Add support for Wikimedia OCR engine

Change-Id: If01635cc9b74198cf24348a92fe39a3a1e49636b
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 0ae2b2a..55b288c 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -13,6 +13,9 @@
- https://phetools.toolforge.org/ocr.php
- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js

+- Wikimedia OCR
+- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+
- https://ws-google-ocr.toolforge.org/
- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
@@ -176,6 +179,10 @@
_OCR_CMD = ('https://phetools.toolforge.org/ocr.php?'
'cmd=ocr&url={url_image}&lang={lang}&user={user}')

+ # Wikimedia OCR utility
+ _WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&'
+ 'langs[]={lang}&image={url_image}&uselang={lang}')
+
# googleOCR ocr utility
_GOCR_CMD = ('https://ws-google-ocr.toolforge.org/api.php?'
'image={url_image}&lang={lang}')
@@ -183,8 +190,10 @@
_MULTI_PAGE_EXT = ['djvu', 'pdf']

_PHETOOLS = 'phetools'
+ _WMFOCR = 'wmfOCR'
_GOOGLE_OCR = 'googleOCR'
_OCR_CMDS = {_PHETOOLS: _OCR_CMD,
+ _WMFOCR: _WMFOCR_CMD,
_GOOGLE_OCR: _GOCR_CMD,
}
_OCR_METHODS = list(_OCR_CMDS.keys())
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 474f5f9..4c20284 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -371,6 +371,11 @@
'year 1872,\nBy D. APPLETON & CO.,\nIn the '
'Office of the Librarian of Congress, at '
'Washington.\n\u000c'),
+ 'wmfOCR': (False, 'Estee, according to Act of Congress, in the '
+ 'year 1872,\n'
+ 'By D. APPLETON & CO.,\n'
+ 'In the Office of the Librarian of Congress, '
+ 'at Washington.'),
'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
'the year 1572,\nBY D. APPLETON & CO.\n'
'In the Office of the Librarian of '
@@ -409,6 +414,16 @@
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)

+ def test_do_ocr_wmfocr(self):
+ """Test page._do_ocr(ocr_tool='wmfOCR')."""
+ error, text = self.page._do_ocr(ocr_tool='wmfOCR')
+ if error:
+ self.skipTest(text)
+ ref_error, ref_text = self.data['wmfOCR']
+ self.assertEqual(error, ref_error)
+ s = difflib.SequenceMatcher(None, text, ref_text)
+ self.assertGreater(s.ratio(), 0.9)
+
def test_do_ocr_googleocr(self):
"""Test page._do_ocr(ocr_tool='googleOCR')."""
error, text = self.page._do_ocr(ocr_tool='googleOCR')
@@ -419,14 +434,14 @@
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)

- def test_ocr_googleocr(self):
- """Test page.ocr(ocr_tool='googleOCR')."""
+ def test_ocr_wmfocr(self):
+ """Test page.ocr(ocr_tool='wmfOCR')."""
try:
- text = self.page.ocr(ocr_tool='googleOCR')
+ text = self.page.ocr(ocr_tool='wmfOCR')
except Exception as exc:
self.assertIsInstance(exc, ValueError)
else:
- ref_error, ref_text = self.data['googleOCR']
+ ref_error, ref_text = self.data['wmfOCR']
s = difflib.SequenceMatcher(None, text, ref_text)
self.assertGreater(s.ratio(), 0.9)


To view, visit change 731247. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: If01635cc9b74198cf24348a92fe39a3a1e49636b
Gerrit-Change-Number: 731247
Gerrit-PatchSet: 4
Gerrit-Owner: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged