jenkins-bot merged this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

proofreadpage.py: insert support for googleOCR

In addition to 'phetools', add support for googleOCR at:
    https://tools.wmflabs.org/ws-google-ocr/api.php

Communication with toollabs is managed by a pool of threads
to increase performance.

Tests added.

wikisourcetext.py has been modified accordingly to support this
additional option.

Change-Id: Ie56d0534f945b5e5a614a78e4095f1efa52001d0
---
M pywikibot/proofreadpage.py
M scripts/wikisourcetext.py
M tests/proofreadpage_tests.py
3 files changed, 288 insertions(+), 62 deletions(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index e8694c1..e22c2e6 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -14,7 +14,11 @@
 OCR support of page scans via:
 - https://tools.wmflabs.org/phetools/hocr_cgi.py
 - https://tools.wmflabs.org/phetools/ocr.php
-inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+
+- https://tools.wmflabs.org/ws-google-ocr/
+- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
+- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
 
 """
 #
@@ -27,6 +31,8 @@
 from functools import partial
 import json
 import re
+import requests
+import time
 
 try:
     from bs4 import BeautifulSoup, FeatureNotFound
@@ -111,14 +117,25 @@
     p_open = re.compile(r'<noinclude>')
     p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
 
-    # phe-tools ocr utility
-    HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
-                'cmd=hocr&book={book}&lang={lang}&user={user}')
+    # phetools ocr utility
+    _HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
+                 'cmd=hocr&book={book}&lang={lang}&user={user}')
 
-    OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
-               'cmd=ocr&url={url_image}&lang={lang}&user={user}')
+    _OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
+                'cmd=ocr&url={url_image}&lang={lang}&user={user}')
 
-    MULTI_PAGE_EXT = ['djvu', 'pdf']
+    # googleOCR ocr utility
+    _GOCR_CMD = ('https://tools.wmflabs.org/ws-google-ocr/api.php?'
+                 'image={url_image}&lang={lang}')
+
+    _MULTI_PAGE_EXT = ['djvu', 'pdf']
+
+    _PHETOOLS = 'phetools'
+    _googleOCR = 'googleOCR'
+    _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
+                 _googleOCR: _GOCR_CMD,
+                 }
+    _OCR_METHODS = list(_OCR_CMDS.keys())
 
     def __init__(self, source, title=''):
         """Instantiate a ProofreadPage object.
@@ -140,7 +157,7 @@
                                 self.PROOFREAD_LEVELS))
 
         self._base, self._base_ext, self._num = self._parse_title()
-        self._multi_page = self._base_ext in self.MULTI_PAGE_EXT
+        self._multi_page = self._base_ext in self._MULTI_PAGE_EXT
 
     @property
     def _fmt(self):
@@ -539,44 +556,74 @@
 
         return self._url_image
 
-    def _ocr_callback(self, cmd_uri, parser_func=None):
+    def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None):
         """OCR callback function.
 
         @return: tuple (error, text [error description in case of error]).
         """
-        def id(x):
+        def identity(x):
             return x
 
         if not cmd_uri:
             raise ValueError('Parameter cmd_uri is mandatory.')
 
         if parser_func is None:
-            parser_func = id
+            parser_func = identity
 
         if not callable(parser_func):
             raise TypeError('Keyword parser_func must be callable.')
 
+        if ocr_tool not in self._OCR_METHODS:
+            raise TypeError(
+                "ocr_tool must be in %s, not '%s'." %
+                (self._OCR_METHODS, ocr_tool))
+
         # wrong link fail with Exceptions
-        try:
-            response = http.fetch(cmd_uri, charset='utf-8')
-        except Exception as e:
-            pywikibot.error('Querying %s: %s' % (cmd_uri, e))
-            return (True, e)
+        retry = 0
+        while retry < 5:
+            pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri),
+                            _logger)
+            try:
+                response = http.fetch(cmd_uri)
+            except requests.exceptions.ReadTimeout as e:
+                retry += 1
+                pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e))
+                pywikibot.warning('retrying in %s seconds ...' % (retry * 5))
+                time.sleep(retry * 5)
+            except Exception as e:
+                pywikibot.error('"%s": %s' % (cmd_uri, e))
+                return (True, e)
+            else:
+                pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text),
+                                _logger)
+                break
 
         data = json.loads(response.text)
 
-        assert 'error' in data, 'Error from phe-tools: %s' % data
-        assert data['error'] in [0, 1], 'Error from phe-tools: %s' % data
+        if ocr_tool == self._PHETOOLS:  # phetools
+            assert 'error' in data, 'Error from phetools: %s' % data
+            assert data['error'] in [0, 1, 2, 3], (
+                'Error from phetools: %s' % data)
+            error, _text = bool(data['error']), data['text']
+        else:  # googleOCR
+            if 'error' in data:
+                error, _text = True, data['error']
+            else:
+                error, _text = False, data['text']
 
-        error = bool(data['error'])
         if error:
-            pywikibot.error('Querying %s: %s' % (cmd_uri, data['text']))
-            return (error, data['text'])
+            pywikibot.error('OCR query %s: %s' % (cmd_uri, _text))
+            return (error, _text)
         else:
-            return (error, parser_func(data['text']))
+            return (error, parser_func(_text))
 
     def _do_hocr(self):
-        """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr."""
+        """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr.
+
+        This is the main method for 'phetools'.
+        Fallback method is ocr.
+
+        """
         def parse_hocr_text(txt):
             """Parse hocr text."""
             soup = Soup(txt)
@@ -595,12 +642,14 @@
                   'user': self.site.user(),
                   }
 
-        cmd_uri = self.HOCR_CMD.format(**params)
+        cmd_uri = self._HOCR_CMD.format(**params)
 
-        return self._ocr_callback(cmd_uri, parser_func=parse_hocr_text)
+        return self._ocr_callback(cmd_uri,
+                                  parser_func=parse_hocr_text,
+                                  ocr_tool=self._PHETOOLS)
 
-    def _do_ocr(self):
-        """Do ocr using //tools.wmflabs.org/phetools/ocr.pmp?cmd=ocr."""
+    def _do_ocr(self, ocr_tool=None):
+        """Do ocr using specified ocr_tool method."""
         try:
             url_image = self.url_image
         except ValueError:
@@ -613,28 +662,56 @@
                   'user': self.site.user(),
                   }
 
-        cmd_uri = self.OCR_CMD.format(**params)
+        try:
+            cmd_fmt = self._OCR_CMDS[ocr_tool]
+        except KeyError:
+            raise TypeError(
+                "ocr_tool must be in %s, not '%s'." %
+                (self._OCR_METHODS, ocr_tool))
 
-        return self._ocr_callback(cmd_uri)
+        cmd_uri = cmd_fmt.format(**params)
 
-    def ocr(self):
-        """Do OCR of Proofreadpage scan.
+        return self._ocr_callback(cmd_uri, ocr_tool=ocr_tool)
 
-        The text returned by this function shalle be assign to self.body,
+    def ocr(self, ocr_tool=None):
+        """Do OCR of ProofreadPage scan.
+
+        The text returned by this function shall be assigned to self.body,
         otherwise the ProofreadPage format will not be maintained.
 
         It is the user's responsibility to reset quality level accordingly.
-        """
-        if self._multi_page:
-            error, text = self._do_hocr()
-            if not error:
-                return text
 
-        error, text = self._do_ocr()
+        @param ocr_tool: 'phetools' or 'googleOCR', default is 'phetools'
+        @type ocr_tool: basestring
+
+        @return: OCR text for the page.
+
+        @raise TypeError: wrong ocr_tool keyword arg.
+        @raise ValueError: something went wrong with OCR process.
+        """
+        if ocr_tool is None:  # default value
+            ocr_tool = self._PHETOOLS
+
+        if ocr_tool not in self._OCR_METHODS:
+            raise TypeError(
+                "ocr_tool must be in %s, not '%s'." %
+                (self._OCR_METHODS, ocr_tool))
+
+        if ocr_tool == self._PHETOOLS:
+            # if _multi_page, try _do_hocr() first and fall back to _do_ocr()
+            if self._multi_page:
+                error, text = self._do_hocr()
+                if not error:
+                    return text
+                pywikibot.warning('%s: phetools hocr failed, '
+                                  'falling back to ocr.' % self)
+
+        error, text = self._do_ocr(ocr_tool=ocr_tool)
+
         if not error:
             return text
         else:
-            raise ValueError('Not possible to perform HOCR/OCR on %s.' % self)
+            raise ValueError('%s: not possible to perform OCR.' % self)
 
 
 class PurgeRequest(Request):
@@ -862,7 +939,8 @@
             end = self.num_pages
 
         if not ((1 <= start <= self.num_pages)
-                and (1 <= end <= self.num_pages) and (start <= end)):
+                and (1 <= end <= self.num_pages)
+                and (start <= end)):
             raise ValueError('start=%s, end=%s are not in valid range (%s, %s)'
                              % (start, end, 1, self.num_pages))
 
diff --git a/scripts/wikisourcetext.py b/scripts/wikisourcetext.py
index 2daacfb..83734d4 100644
--- a/scripts/wikisourcetext.py
+++ b/scripts/wikisourcetext.py
@@ -18,32 +18,40 @@
 
 The following parameters are supported:
 
-    -index:...     name of the index page
+    -index:...  name of the index page.
 
     -pages:<start>-<end>,...<start>-<end>,<start>-<end>
-                   Page range to upload;
-                   optional, start=1, end=djvu file number of images.
-                   Page ranges can be specified as:
+                Page range to upload;
+                optional, start=1, end=djvu file number of images.
+                Page ranges can be specified as:
 
-                   | A-B -> pages A until B
-                   | A-  -> pages A until number of images
-                   | A   -> just page A
-                   | -B  -> pages 1 until B
+                | A-B -> pages A until B
+                | A-  -> pages A until number of images
+                | A   -> just page A
+                | -B  -> pages 1 until B
 
-    -showdiff:     show difference between curent text and new text when
-                   saving the page
+    -showdiff:  show difference between current text and new text when
+                saving the page.
 
-    -ocr:          use https://tools.wmflabs.org/phetools OCR tool to get text;
-                   default is False, i.e. only not-(yet)-existing pages in Page
-                   ns will be treated and text will be fetched via preload.
+    -ocr:       use OCR tools hosted on https://tools.wmflabs.org.
+                By default no OCR is done, i.e. only not-(yet)-existing
+                pages in Page ns will be treated and text will be fetched
+                via preload.
+                If -ocr is provided, default OCR method is:
+                 - https://tools.wmflabs.org/phetools
+                If ocr:googleOCR is given, OCR method is:
+                 - https://tools.wmflabs.org/ws-google-ocr
 
-    -force:        overwrite existing pages;
-                   default is False; valid only if '-ocr' is selected.
+    -threads:n  number of threads used to fetch OCR from OCR tools.
+                default is 5; valid only if '-ocr' is selected.
 
-    -summary:      custom edit summary.
-                   Use quotes if edit summary contains spaces.
+    -force:     overwrite existing pages;
+                default is False; valid only if '-ocr' is selected.
 
-    -always        don't bother asking to confirm any of the changes.
+    -summary:   custom edit summary.
+                Use quotes if edit summary contains spaces.
+
+    -always     don't bother asking to confirm any of the changes.
 """
 #
 # (C) Pywikibot team, 2016-2018
@@ -52,7 +60,11 @@
 #
 from __future__ import absolute_import, division, unicode_literals
 
+import collections
 import itertools
+import sys
+import threading
+import time
 
 import pywikibot
 
@@ -61,6 +73,11 @@
 from pywikibot.bot import SingleSiteBot
 from pywikibot.proofreadpage import IndexPage, ProofreadPage
 
+if sys.version_info[0] > 2:
+    import queue
+else:
+    import Queue as queue
+
 
 class UploadTextBot(SingleSiteBot):
 
@@ -77,6 +94,13 @@
         """
         Initializer.
 
+        If OCR is requested, spawns worker threads, and, if no "force" option
+        is set, filter for existing pages.
+
+        Queues are used for communication to/from threads.
+        A PriorityQueue is used to process pages in the same order as
+        they are generated.
+
         @param generator: page generator
         @type generator: generator
         """
@@ -84,7 +108,8 @@
             'showdiff': False,
             'force': False,
             'ocr': False,
-            'summary': 'Bot: uploading text'
+            'summary': 'Bot: uploading text',
+            'threads': 5
         })
         super(UploadTextBot, self).__init__(**kwargs)
         self.generator = generator
@@ -95,6 +120,59 @@
             self.options['summary'] = i18n.twtranslate(
                 self.site, 'djvutext-creating')
 
+        if self.getOption('ocr'):
+            self._num_threads = self.getOption('threads')
+            self._queue_in = queue.Queue()
+            self._queue_out = queue.PriorityQueue()
+
+            # If not "-force", no reason to get OCR for existing pages
+            # and to process them in Bot.run().
+            if not self.getOption('force'):
+                self.generator = (p for p in self.generator if not p.exists())
+            self._spawn_ocr_threads()
+
+    def _spawn_ocr_threads(self):
+        """Spawn threads for _ocr_worker workers."""
+        for i in range(self._num_threads):
+            worker = threading.Thread(target=self._ocr_worker)
+            worker.setDaemon(True)
+            worker.start()
+
+        self._pages = collections.OrderedDict()
+        for idx, p in enumerate(self.generator):
+            self._pages.setdefault(p, idx)
+        self.generator = (p for p in self._pages)  # recreate gen for run()
+
+        for p, idx in self._pages.items():
+            self._queue_in.put((p, idx))  # idx to preserve order later
+
+    def _ocr_worker(self):
+        """Fetch OCR content from ocr_tool and queue it."""
+        while True:
+            page, idx = self._queue_in.get()
+            try:
+                text_body = page.ocr(ocr_tool=self.getOption('ocr'))
+            except ValueError as e:
+                # TODO: is it a problem in PY2?
+                pywikibot.error(str(e))
+                text_body = None  # Sentinel: signal exception to self.treat()
+
+            self._queue_out.put((idx, text_body))
+            self._queue_in.task_done()
+
+    def _get_ocr(self, page):
+        """Get OCR content for page from PriorityQueue."""
+        # blocks until OCR for expected idx is available
+        expected_idx = self._pages.get(page)
+        while True:
+            if self._queue_out.empty():
+                time.sleep(0.2)  # some pause
+                continue
+            idx, text_body = self._queue_out.queue[0]  # peek first element
+            if idx == expected_idx:
+                idx, text_body = self._queue_out.get()
+                return text_body
+
     def treat(self, page):
         """Process one ProofreadPage page.
 
@@ -114,7 +192,12 @@
             old_text = ''
 
         if self.getOption('ocr'):
-            page.body = page.ocr()
+            _body = self._get_ocr(page)
+            if _body is None:
+                pywikibot.output('No OCR found. Skipping {}'
+                                 .format(page.title(as_link=True)))
+                return
+            page.body = _body
 
         if (page.exists() and
                 not (self.getOption('ocr') and self.getOption('force'))):
@@ -151,7 +234,9 @@
         elif arg == '-summary':
             options['summary'] = value
         elif arg == '-ocr':
-            options['ocr'] = True
+            options['ocr'] = value or 'phetools'
+        elif arg == '-threads':
+            options['threads'] = int(value)
         elif arg == '-force':
             options['force'] = True
         elif arg == '-always':
@@ -197,7 +282,7 @@
     gen_list = []
     for start, end in sorted(pages):
         gen = index.page_gen(start=start, end=end,
-                             filter_ql=[1], content=False)
+                             filter_ql=[1], content=True)
         gen_list.append(gen)
 
     gen = itertools.chain(*gen_list)
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 0f69c73..cd77337 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -320,6 +320,69 @@
         self.assertEqual(page.quality_level, 0)
 
 
+class TestPageOCR(TestCase):
+
+    """Test page ocr functions."""
+
+    family = 'wikisource'
+    code = 'en'
+
+    cached = True
+
+    data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
+            'hocr': (False, 'ENTERED, according to Act of Congress, in the '
+                            'year 1872,\nBY D. APPLETON & CO.,\nIn the Ofﬁce '
+                            'of the Librarian of Congress, at '
+                            'Washington.\n\n'),
+            'ocr': (False, 'lam-mam, according to Act of Congress, in the '
+                           'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
+                           'Of\ufb01ce or the Librarian of '
+                           'Congress, at Washington.\n\n'),
+            'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
+                                 'the year 1572,\nBY D. APPLETON & CO.\n'
+                                 'In the Office of the Librarian of '
+                                 'Congress, at Washington.\n4 334\n'),
+            }
+
+    def setUp(self):
+        """Test setUp."""
+        site = self.get_site()
+        title = self.data['title']
+        self.page = ProofreadPage(site, title)
+        super(TestPageOCR, self).setUp()
+
+    def test_ocr_exceptions(self):
+        """Test page.ocr() exceptions."""
+        self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy')
+
+    def test_do_hocr(self):
+        """Test page._do_hocr()."""
+        error, text = self.page._do_hocr()
+        ref_error, ref_text = self.data['hocr']
+        self.assertEqual(error, ref_error)
+        self.assertEqual(text, ref_text)
+
+    def test_do_ocr_phetools(self):
+        """Test page._do_ocr(ocr_tool='phetools')."""
+        error, text = self.page._do_ocr(ocr_tool='phetools')
+        ref_error, ref_text = self.data['ocr']
+        self.assertEqual(error, ref_error)
+        self.assertEqual(text, ref_text)
+
+    def test_do_ocr_googleocr(self):
+        """Test page._do_ocr(ocr_tool='googleOCR')."""
+        error, text = self.page._do_ocr(ocr_tool='googleOCR')
+        ref_error, ref_text = self.data['googleOCR']
+        self.assertEqual(error, ref_error)
+        self.assertEqual(text, ref_text)
+
+    def test_ocr_googleocr(self):
+        """Test page.ocr(ocr_tool='googleOCR')."""
+        text = self.page.ocr(ocr_tool='googleOCR')
+        ref_error, ref_text = self.data['googleOCR']
+        self.assertEqual(text, ref_text)
+
+
 @require_modules('bs4')
 class TestProofreadPageIndexProperty(TestCase):

To view, visit change 446973. To unsubscribe, or for help writing mail filters, visit settings.