jenkins-bot merged this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
proofreadpage.py: insert support for googleOCR

In addition to 'phetools', add support for googleOCR at:
https://tools.wmflabs.org/ws-google-ocr/api.php

Communication with toollabs is managed by a pool of threads
to increase performance.

Tests added.

wikisourcetext.py has been modified accordingly to support this
additional option.

Change-Id: Ie56d0534f945b5e5a614a78e4095f1efa52001d0
---
M pywikibot/proofreadpage.py
M scripts/wikisourcetext.py
M tests/proofreadpage_tests.py
3 files changed, 288 insertions(+), 62 deletions(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index e8694c1..e22c2e6 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -14,7 +14,11 @@
OCR support of page scans via:
- https://tools.wmflabs.org/phetools/hocr_cgi.py
- https://tools.wmflabs.org/phetools/ocr.php
-inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+
+- https://tools.wmflabs.org/ws-google-ocr/
+- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
+- see also: https://wikisource.org/wiki/Wikisource:Google_OCR

"""
#
@@ -27,6 +31,8 @@
from functools import partial
import json
import re
+import requests
+import time

try:
from bs4 import BeautifulSoup, FeatureNotFound
@@ -111,14 +117,25 @@
p_open = re.compile(r'<noinclude>')
p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')

- # phe-tools ocr utility
- HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
- 'cmd=hocr&book={book}&lang={lang}&user={user}')
+ # phetools ocr utility
+ _HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
+ 'cmd=hocr&book={book}&lang={lang}&user={user}')

- OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
- 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
+ _OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
+ 'cmd=ocr&url={url_image}&lang={lang}&user={user}')

- MULTI_PAGE_EXT = ['djvu', 'pdf']
+ # googleOCR ocr utility
+ _GOCR_CMD = ('https://tools.wmflabs.org/ws-google-ocr/api.php?'
+ 'image={url_image}&lang={lang}')
+
+ _MULTI_PAGE_EXT = ['djvu', 'pdf']
+
+ _PHETOOLS = 'phetools'
+ _googleOCR = 'googleOCR'
+ _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
+ _googleOCR: _GOCR_CMD,
+ }
+ _OCR_METHODS = list(_OCR_CMDS.keys())

def __init__(self, source, title=''):
"""Instantiate a ProofreadPage object.
@@ -140,7 +157,7 @@
self.PROOFREAD_LEVELS))

self._base, self._base_ext, self._num = self._parse_title()
- self._multi_page = self._base_ext in self.MULTI_PAGE_EXT
+ self._multi_page = self._base_ext in self._MULTI_PAGE_EXT

@property
def _fmt(self):
@@ -539,44 +556,74 @@

return self._url_image

- def _ocr_callback(self, cmd_uri, parser_func=None):
+ def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None):
"""OCR callback function.

@return: tuple (error, text [error description in case of error]).
"""
- def id(x):
+ def identity(x):
return x

if not cmd_uri:
raise ValueError('Parameter cmd_uri is mandatory.')

if parser_func is None:
- parser_func = id
+ parser_func = identity

if not callable(parser_func):
raise TypeError('Keyword parser_func must be callable.')

+ if ocr_tool not in self._OCR_METHODS:
+ raise TypeError(
+ "ocr_tool must be in %s, not '%s'." %
+ (self._OCR_METHODS, ocr_tool))
+
# wrong link fail with Exceptions
- try:
- response = http.fetch(cmd_uri, charset='utf-8')
- except Exception as e:
- pywikibot.error('Querying %s: %s' % (cmd_uri, e))
- return (True, e)
+ retry = 0
+ while retry < 5:
+ pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri),
+ _logger)
+ try:
+ response = http.fetch(cmd_uri)
+ except requests.exceptions.ReadTimeout as e:
+ retry += 1
+ pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e))
+ pywikibot.warning('retrying in %s seconds ...' % (retry * 5))
+ time.sleep(retry * 5)
+ except Exception as e:
+ pywikibot.error('"%s": %s' % (cmd_uri, e))
+ return (True, e)
+ else:
+ pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text),
+ _logger)
+ break

data = json.loads(response.text)

- assert 'error' in data, 'Error from phe-tools: %s' % data
- assert data['error'] in [0, 1], 'Error from phe-tools: %s' % data
+ if ocr_tool == self._PHETOOLS: # phetools
+ assert 'error' in data, 'Error from phetools: %s' % data
+ assert data['error'] in [0, 1, 2, 3], (
+ 'Error from phetools: %s' % data)
+ error, _text = bool(data['error']), data['text']
+ else: # googleOCR
+ if 'error' in data:
+ error, _text = True, data['error']
+ else:
+ error, _text = False, data['text']

- error = bool(data['error'])
if error:
- pywikibot.error('Querying %s: %s' % (cmd_uri, data['text']))
- return (error, data['text'])
+ pywikibot.error('OCR query %s: %s' % (cmd_uri, _text))
+ return (error, _text)
else:
- return (error, parser_func(data['text']))
+ return (error, parser_func(_text))

def _do_hocr(self):
- """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr."""
+ """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr.
+
+ This is the main method for 'phetools'.
+ Fallback method is ocr.
+
+ """
def parse_hocr_text(txt):
"""Parse hocr text."""
soup = Soup(txt)
@@ -595,12 +642,14 @@
'user': self.site.user(),
}

- cmd_uri = self.HOCR_CMD.format(**params)
+ cmd_uri = self._HOCR_CMD.format(**params)

- return self._ocr_callback(cmd_uri, parser_func=parse_hocr_text)
+ return self._ocr_callback(cmd_uri,
+ parser_func=parse_hocr_text,
+ ocr_tool=self._PHETOOLS)

- def _do_ocr(self):
- """Do ocr using //tools.wmflabs.org/phetools/ocr.pmp?cmd=ocr."""
+ def _do_ocr(self, ocr_tool=None):
+ """Do ocr using specified ocr_tool method."""
try:
url_image = self.url_image
except ValueError:
@@ -613,28 +662,56 @@
'user': self.site.user(),
}

- cmd_uri = self.OCR_CMD.format(**params)
+ try:
+ cmd_fmt = self._OCR_CMDS[ocr_tool]
+ except KeyError:
+ raise TypeError(
+ "ocr_tool must be in %s, not '%s'." %
+ (self._OCR_METHODS, ocr_tool))

- return self._ocr_callback(cmd_uri)
+ cmd_uri = cmd_fmt.format(**params)

- def ocr(self):
- """Do OCR of Proofreadpage scan.
+ return self._ocr_callback(cmd_uri, ocr_tool=ocr_tool)

- The text returned by this function shalle be assign to self.body,
+ def ocr(self, ocr_tool=None):
+ """Do OCR of ProofreadPage scan.
+
+ The text returned by this function shall be assigned to self.body,
otherwise the ProofreadPage format will not be maintained.

It is the user's responsibility to reset quality level accordingly.
- """
- if self._multi_page:
- error, text = self._do_hocr()
- if not error:
- return text

- error, text = self._do_ocr()
+ @param ocr_tool: 'phetools' or 'googleOCR', default is 'phetools'
+ @type ocr_tool: basestring
+
+ @return: OCR text for the page.
+
+ @raise TypeError: wrong ocr_tool keyword arg.
+ @raise ValueError: something went wrong with OCR process.
+ """
+ if ocr_tool is None: # default value
+ ocr_tool = self._PHETOOLS
+
+ if ocr_tool not in self._OCR_METHODS:
+ raise TypeError(
+ "ocr_tool must be in %s, not '%s'." %
+ (self._OCR_METHODS, ocr_tool))
+
+ if ocr_tool == self._PHETOOLS:
+ # if _multi_page, try _do_hocr() first and fall back to _do_ocr()
+ if self._multi_page:
+ error, text = self._do_hocr()
+ if not error:
+ return text
+ pywikibot.warning('%s: phetools hocr failed, '
+ 'falling back to ocr.' % self)
+
+ error, text = self._do_ocr(ocr_tool=ocr_tool)
+
if not error:
return text
else:
- raise ValueError('Not possible to perform HOCR/OCR on %s.' % self)
+ raise ValueError('%s: not possible to perform OCR.' % self)


class PurgeRequest(Request):
@@ -862,7 +939,8 @@
end = self.num_pages

if not ((1 <= start <= self.num_pages)
- and (1 <= end <= self.num_pages) and (start <= end)):
+ and (1 <= end <= self.num_pages)
+ and (start <= end)):
raise ValueError('start=%s, end=%s are not in valid range (%s, %s)'
% (start, end, 1, self.num_pages))

diff --git a/scripts/wikisourcetext.py b/scripts/wikisourcetext.py
index 2daacfb..83734d4 100644
--- a/scripts/wikisourcetext.py
+++ b/scripts/wikisourcetext.py
@@ -18,32 +18,40 @@

The following parameters are supported:

- -index:... name of the index page
+ -index:... name of the index page.

-pages:<start>-<end>,...<start>-<end>,<start>-<end>
- Page range to upload;
- optional, start=1, end=djvu file number of images.
- Page ranges can be specified as:
+ Page range to upload;
+ optional, start=1, end=djvu file number of images.
+ Page ranges can be specified as:

- | A-B -> pages A until B
- | A- -> pages A until number of images
- | A -> just page A
- | -B -> pages 1 until B
+ | A-B -> pages A until B
+ | A- -> pages A until number of images
+ | A -> just page A
+ | -B -> pages 1 until B

- -showdiff: show difference between curent text and new text when
- saving the page
+ -showdiff: show difference between current text and new text when
+ saving the page.

- -ocr: use https://tools.wmflabs.org/phetools OCR tool to get text;
- default is False, i.e. only not-(yet)-existing pages in Page
- ns will be treated and text will be fetched via preload.
+ -ocr: use OCR tools hosted on https://tools.wmflabs.org.
+ By default no OCR is done, i.e. only not-(yet)-existing
+ pages in Page ns will be treated and text will be fetched
+ via preload.
+ If -ocr is provided, default OCR method is:
+ - https://tools.wmflabs.org/phetools
+ If ocr:googleOCR is given, OCR method is:
+ - https://tools.wmflabs.org/ws-google-ocr

- -force: overwrite existing pages;
- default is False; valid only if '-ocr' is selected.
+ -threads:n number of threads used to fetch OCR from OCR tools.
+ default is 5; valid only if '-ocr' is selected.

- -summary: custom edit summary.
- Use quotes if edit summary contains spaces.
+ -force: overwrite existing pages;
+ default is False; valid only if '-ocr' is selected.

- -always don't bother asking to confirm any of the changes.
+ -summary: custom edit summary.
+ Use quotes if edit summary contains spaces.
+
+ -always don't bother asking to confirm any of the changes.
"""
#
# (C) Pywikibot team, 2016-2018
@@ -52,7 +60,11 @@
#
from __future__ import absolute_import, division, unicode_literals

+import collections
import itertools
+import sys
+import threading
+import time

import pywikibot

@@ -61,6 +73,11 @@
from pywikibot.bot import SingleSiteBot
from pywikibot.proofreadpage import IndexPage, ProofreadPage

+if sys.version_info[0] > 2:
+ import queue
+else:
+ import Queue as queue
+

class UploadTextBot(SingleSiteBot):

@@ -77,6 +94,13 @@
"""
Initializer.

+ If OCR is requested, spawns worker threads, and, if no "force" option
+ is set, filter for existing pages.
+
+ Queues are used for communication to/from threads.
+ A PriorityQueue is used to process pages in the same order as
+ they are generated.
+
@param generator: page generator
@type generator: generator
"""
@@ -84,7 +108,8 @@
'showdiff': False,
'force': False,
'ocr': False,
- 'summary': 'Bot: uploading text'
+ 'summary': 'Bot: uploading text',
+ 'threads': 5
})
super(UploadTextBot, self).__init__(**kwargs)
self.generator = generator
@@ -95,6 +120,59 @@
self.options['summary'] = i18n.twtranslate(
self.site, 'djvutext-creating')

+ if self.getOption('ocr'):
+ self._num_threads = self.getOption('threads')
+ self._queue_in = queue.Queue()
+ self._queue_out = queue.PriorityQueue()
+
+ # If not "-force", no reason to get OCR for existing pages
+ # and to process them in Bot.run().
+ if not self.getOption('force'):
+ self.generator = (p for p in self.generator if not p.exists())
+ self._spawn_ocr_threads()
+
+ def _spawn_ocr_threads(self):
+ """Spawn threads for _ocr_worker workers."""
+ for i in range(self._num_threads):
+ worker = threading.Thread(target=self._ocr_worker)
+ worker.setDaemon(True)
+ worker.start()
+
+ self._pages = collections.OrderedDict()
+ for idx, p in enumerate(self.generator):
+ self._pages.setdefault(p, idx)
+ self.generator = (p for p in self._pages) # recreate gen for run()
+
+ for p, idx in self._pages.items():
+ self._queue_in.put((p, idx)) # idx to preserve order later
+
+ def _ocr_worker(self):
+ """Fetch OCR content from ocr_tool and queue it."""
+ while True:
+ page, idx = self._queue_in.get()
+ try:
+ text_body = page.ocr(ocr_tool=self.getOption('ocr'))
+ except ValueError as e:
+ # TODO: is it a problem in PY2?
+ pywikibot.error(str(e))
+ text_body = None # Sentinel: signal exception to self.treat()
+
+ self._queue_out.put((idx, text_body))
+ self._queue_in.task_done()
+
+ def _get_ocr(self, page):
+ """Get OCR content for page from PriorityQueue."""
+ # blocks until OCR for expected idx is available
+ expected_idx = self._pages.get(page)
+ while True:
+ if self._queue_out.empty():
+ time.sleep(0.2) # some pause
+ continue
+ idx, text_body = self._queue_out.queue[0] # peek first element
+ if idx == expected_idx:
+ idx, text_body = self._queue_out.get()
+ return text_body
+
def treat(self, page):
"""Process one ProofreadPage page.

@@ -114,7 +192,12 @@
old_text = ''

if self.getOption('ocr'):
- page.body = page.ocr()
+ _body = self._get_ocr(page)
+ if _body is None:
+ pywikibot.output('No OCR found. Skipping {}'
+ .format(page.title(as_link=True)))
+ return
+ page.body = _body

if (page.exists() and
not (self.getOption('ocr') and self.getOption('force'))):
@@ -151,7 +234,9 @@
elif arg == '-summary':
options['summary'] = value
elif arg == '-ocr':
- options['ocr'] = True
+ options['ocr'] = value or 'phetools'
+ elif arg == '-threads':
+ options['threads'] = int(value)
elif arg == '-force':
options['force'] = True
elif arg == '-always':
@@ -197,7 +282,7 @@
gen_list = []
for start, end in sorted(pages):
gen = index.page_gen(start=start, end=end,
- filter_ql=[1], content=False)
+ filter_ql=[1], content=True)
gen_list.append(gen)

gen = itertools.chain(*gen_list)
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 0f69c73..cd77337 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -320,6 +320,69 @@
self.assertEqual(page.quality_level, 0)


+class TestPageOCR(TestCase):
+
+ """Test page ocr functions."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ cached = True
+
+ data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
+ 'hocr': (False, 'ENTERED, according to Act of Congress, in the '
+ 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
+ 'of the Librarian of Congress, at '
+ 'Washington.\n\n'),
+ 'ocr': (False, 'lam-mam, according to Act of Congress, in the '
+ 'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
+ 'Of\ufb01ce or the Librarian of '
+ 'Congress, at Washington.\n\n'),
+ 'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
+ 'the year 1572,\nBY D. APPLETON & CO.\n'
+ 'In the Office of the Librarian of '
+ 'Congress, at Washington.\n4 334\n'),
+ }
+
+ def setUp(self):
+ """Test setUp."""
+ site = self.get_site()
+ title = self.data['title']
+ self.page = ProofreadPage(site, title)
+ super(TestPageOCR, self).setUp()
+
+ def test_ocr_exceptions(self):
+ """Test page.ocr() exceptions."""
+ self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy')
+
+ def test_do_hocr(self):
+ """Test page._do_hocr()."""
+ error, text = self.page._do_hocr()
+ ref_error, ref_text = self.data['hocr']
+ self.assertEqual(error, ref_error)
+ self.assertEqual(text, ref_text)
+
+ def test_do_ocr_phetools(self):
+ """Test page._do_ocr(ocr_tool='phetools')."""
+ error, text = self.page._do_ocr(ocr_tool='phetools')
+ ref_error, ref_text = self.data['ocr']
+ self.assertEqual(error, ref_error)
+ self.assertEqual(text, ref_text)
+
+ def test_do_ocr_googleocr(self):
+ """Test page._do_ocr(ocr_tool='googleOCR')."""
+ error, text = self.page._do_ocr(ocr_tool='googleOCR')
+ ref_error, ref_text = self.data['googleOCR']
+ self.assertEqual(error, ref_error)
+ self.assertEqual(text, ref_text)
+
+ def test_ocr_googleocr(self):
+ """Test page.ocr(ocr_tool='googleOCR')."""
+ text = self.page.ocr(ocr_tool='googleOCR')
+ ref_error, ref_text = self.data['googleOCR']
+ self.assertEqual(text, ref_text)
+
+
@require_modules('bs4')
class TestProofreadPageIndexProperty(TestCase):


To view, visit change 446973. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie56d0534f945b5e5a614a78e4095f1efa52001d0
Gerrit-Change-Number: 446973
Gerrit-PatchSet: 17
Gerrit-Owner: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444@gmail.com>
Gerrit-Reviewer: jenkins-bot (75)