[Gerrit] ...core[master]: [FEAT] Add support for Wikimedia OCR engine - Pywikibot-commits

17 Oct 2021

jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/731247 )

Change subject: [FEAT] Add support for Wikimedia OCR engine
......................................................................

[FEAT] Add support for Wikimedia OCR engine

Change-Id: If01635cc9b74198cf24348a92fe39a3a1e49636b
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 28 insertions(+), 4 deletions(-)

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 0ae2b2a..55b288c 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -13,6 +13,9 @@
 - https://phetools.toolforge.org/ocr.php
 - inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
 
+- Wikimedia OCR
+- see: https://www.mediawiki.org/wiki/Help:Extension:Wikisource/Wikimedia_OCR
+
 - https://ws-google-ocr.toolforge.org/
 - inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
 - see also: https://wikisource.org/wiki/Wikisource:Google_OCR
@@ -176,6 +179,10 @@
     _OCR_CMD = ('https://phetools.toolforge.org/ocr.php?'
                 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
 
+    # Wikimedia OCR utility
+    _WMFOCR_CMD = ('https://ocr.wmcloud.org/api.php?engine=tesseract&'
+                   'langs[]={lang}&image={url_image}&uselang={lang}')
+
     # googleOCR ocr utility
     _GOCR_CMD = ('https://ws-google-ocr.toolforge.org/api.php?'
                  'image={url_image}&lang={lang}')
@@ -183,8 +190,10 @@
     _MULTI_PAGE_EXT = ['djvu', 'pdf']
 
     _PHETOOLS = 'phetools'
+    _WMFOCR = 'wmfOCR'
     _GOOGLE_OCR = 'googleOCR'
     _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
+                 _WMFOCR: _WMFOCR_CMD,
                  _GOOGLE_OCR: _GOCR_CMD,
                  }
     _OCR_METHODS = list(_OCR_CMDS.keys())
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 474f5f9..4c20284 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -371,6 +371,11 @@
                            'year 1872,\nBy D. APPLETON & CO.,\nIn the '
                            'Office of the Librarian of Congress, at '
                            'Washington.\n\u000c'),
+            'wmfOCR': (False, 'Estee, according to Act of Congress, in the '
+                              'year 1872,\n'
+                              'By D. APPLETON & CO.,\n'
+                              'In the Office of the Librarian of Congress, '
+                              'at Washington.'),
             'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
                                  'the year 1572,\nBY D. APPLETON & CO.\n'
                                  'In the Office of the Librarian of '
@@ -409,6 +414,16 @@
         s = difflib.SequenceMatcher(None, text, ref_text)
         self.assertGreater(s.ratio(), 0.9)
 
+    def test_do_ocr_wmfocr(self):
+        """Test page._do_ocr(ocr_tool='wmfOCR')."""
+        error, text = self.page._do_ocr(ocr_tool='wmfOCR')
+        if error:
+            self.skipTest(text)
+        ref_error, ref_text = self.data['wmfOCR']
+        self.assertEqual(error, ref_error)
+        s = difflib.SequenceMatcher(None, text, ref_text)
+        self.assertGreater(s.ratio(), 0.9)
+
     def test_do_ocr_googleocr(self):
         """Test page._do_ocr(ocr_tool='googleOCR')."""
         error, text = self.page._do_ocr(ocr_tool='googleOCR')
@@ -419,14 +434,14 @@
         s = difflib.SequenceMatcher(None, text, ref_text)
         self.assertGreater(s.ratio(), 0.9)
 
-    def test_ocr_googleocr(self):
-        """Test page.ocr(ocr_tool='googleOCR')."""
+    def test_ocr_wmfocr(self):
+        """Test page.ocr(ocr_tool='wmfOCR')."""
         try:
-            text = self.page.ocr(ocr_tool='googleOCR')
+            text = self.page.ocr(ocr_tool='wmfOCR')
         except Exception as exc:
             self.assertIsInstance(exc, ValueError)
         else:
-            ref_error, ref_text = self.data['googleOCR']
+            ref_error, ref_text = self.data['wmfOCR']
             s = difflib.SequenceMatcher(None, text, ref_text)
             self.assertGreater(s.ratio(), 0.9)
 

-- 
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/731247
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: If01635cc9b74198cf24348a92fe39a3a1e49636b
Gerrit-Change-Number: 731247
Gerrit-PatchSet: 4
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged