jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/446973 )
Change subject: proofreadpage.py: insert support for googleOCR
......................................................................
proofreadpage.py: insert support for googleOCR
In addition to 'phetools', add support for googleOCR at: https://tools.wmflabs.org/ws-google-ocr/api.php
Communication with toollabs is managed by a pool of threads to increase performance.
Tests added.
wikisourcetext.py has been modified accordingly to support this additional option.
Change-Id: Ie56d0534f945b5e5a614a78e4095f1efa52001d0 --- M pywikibot/proofreadpage.py M scripts/wikisourcetext.py M tests/proofreadpage_tests.py 3 files changed, 288 insertions(+), 62 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index e8694c1..e22c2e6 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -14,7 +14,11 @@ OCR support of page scans via: - https://tools.wmflabs.org/phetools/hocr_cgi.py - https://tools.wmflabs.org/phetools/ocr.php -inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js +- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js + +- https://tools.wmflabs.org/ws-google-ocr/ +- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js +- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
""" # @@ -27,6 +31,8 @@ from functools import partial import json import re +import requests +import time
try: from bs4 import BeautifulSoup, FeatureNotFound @@ -111,14 +117,25 @@ p_open = re.compile(r'<noinclude>') p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
- # phe-tools ocr utility - HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?' - 'cmd=hocr&book={book}&lang={lang}&user={user}') + # phetools ocr utility + _HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?' + 'cmd=hocr&book={book}&lang={lang}&user={user}')
- OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?' - 'cmd=ocr&url={url_image}&lang={lang}&user={user}') + _OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?' + 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
- MULTI_PAGE_EXT = ['djvu', 'pdf'] + # googleOCR ocr utility + _GOCR_CMD = ('https://tools.wmflabs.org/ws-google-ocr/api.php?' + 'image={url_image}&lang={lang}') + + _MULTI_PAGE_EXT = ['djvu', 'pdf'] + + _PHETOOLS = 'phetools' + _googleOCR = 'googleOCR' + _OCR_CMDS = {_PHETOOLS: _OCR_CMD, + _googleOCR: _GOCR_CMD, + } + _OCR_METHODS = list(_OCR_CMDS.keys())
def __init__(self, source, title=''): """Instantiate a ProofreadPage object. @@ -140,7 +157,7 @@ self.PROOFREAD_LEVELS))
self._base, self._base_ext, self._num = self._parse_title() - self._multi_page = self._base_ext in self.MULTI_PAGE_EXT + self._multi_page = self._base_ext in self._MULTI_PAGE_EXT
@property def _fmt(self): @@ -539,44 +556,74 @@
return self._url_image
- def _ocr_callback(self, cmd_uri, parser_func=None): + def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None): """OCR callback function.
@return: tuple (error, text [error description in case of error]). """ - def id(x): + def identity(x): return x
if not cmd_uri: raise ValueError('Parameter cmd_uri is mandatory.')
if parser_func is None: - parser_func = id + parser_func = identity
if not callable(parser_func): raise TypeError('Keyword parser_func must be callable.')
+ if ocr_tool not in self._OCR_METHODS: + raise TypeError( + "ocr_tool must be in %s, not '%s'." % + (self._OCR_METHODS, ocr_tool)) + # wrong link fail with Exceptions - try: - response = http.fetch(cmd_uri, charset='utf-8') - except Exception as e: - pywikibot.error('Querying %s: %s' % (cmd_uri, e)) - return (True, e) + retry = 0 + while retry < 5: + pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri), + _logger) + try: + response = http.fetch(cmd_uri) + except requests.exceptions.ReadTimeout as e: + retry += 1 + pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e)) + pywikibot.warning('retrying in %s seconds ...' % (retry * 5)) + time.sleep(retry * 5) + except Exception as e: + pywikibot.error('"%s": %s' % (cmd_uri, e)) + return (True, e) + else: + pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text), + _logger) + break
data = json.loads(response.text)
- assert 'error' in data, 'Error from phe-tools: %s' % data - assert data['error'] in [0, 1], 'Error from phe-tools: %s' % data + if ocr_tool == self._PHETOOLS: # phetools + assert 'error' in data, 'Error from phetools: %s' % data + assert data['error'] in [0, 1, 2, 3], ( + 'Error from phetools: %s' % data) + error, _text = bool(data['error']), data['text'] + else: # googleOCR + if 'error' in data: + error, _text = True, data['error'] + else: + error, _text = False, data['text']
- error = bool(data['error']) if error: - pywikibot.error('Querying %s: %s' % (cmd_uri, data['text'])) - return (error, data['text']) + pywikibot.error('OCR query %s: %s' % (cmd_uri, _text)) + return (error, _text) else: - return (error, parser_func(data['text'])) + return (error, parser_func(_text))
def _do_hocr(self): - """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr.""" + """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr. + + This is the main method for 'phetools'. + Fallback method is ocr. + + """ def parse_hocr_text(txt): """Parse hocr text.""" soup = Soup(txt) @@ -595,12 +642,14 @@ 'user': self.site.user(), }
- cmd_uri = self.HOCR_CMD.format(**params) + cmd_uri = self._HOCR_CMD.format(**params)
- return self._ocr_callback(cmd_uri, parser_func=parse_hocr_text) + return self._ocr_callback(cmd_uri, + parser_func=parse_hocr_text, + ocr_tool=self._PHETOOLS)
- def _do_ocr(self): - """Do ocr using //tools.wmflabs.org/phetools/ocr.pmp?cmd=ocr.""" + def _do_ocr(self, ocr_tool=None): + """Do ocr using specified ocr_tool method.""" try: url_image = self.url_image except ValueError: @@ -613,28 +662,56 @@ 'user': self.site.user(), }
- cmd_uri = self.OCR_CMD.format(**params) + try: + cmd_fmt = self._OCR_CMDS[ocr_tool] + except KeyError: + raise TypeError( + "ocr_tool must be in %s, not '%s'." % + (self._OCR_METHODS, ocr_tool))
- return self._ocr_callback(cmd_uri) + cmd_uri = cmd_fmt.format(**params)
- def ocr(self): - """Do OCR of Proofreadpage scan. + return self._ocr_callback(cmd_uri, ocr_tool=ocr_tool)
- The text returned by this function shalle be assign to self.body, + def ocr(self, ocr_tool=None): + """Do OCR of ProofreadPage scan. + + The text returned by this function shall be assigned to self.body, otherwise the ProofreadPage format will not be maintained.
It is the user's responsibility to reset quality level accordingly. - """ - if self._multi_page: - error, text = self._do_hocr() - if not error: - return text
- error, text = self._do_ocr() + @param ocr_tool: 'phetools' or 'googleOCR', default is 'phetools' + @type ocr_tool: basestring + + @return: OCR text for the page. + + @raise TypeError: wrong ocr_tool keyword arg. + @raise ValueError: something went wrong with OCR process. + """ + if ocr_tool is None: # default value + ocr_tool = self._PHETOOLS + + if ocr_tool not in self._OCR_METHODS: + raise TypeError( + "ocr_tool must be in %s, not '%s'." % + (self._OCR_METHODS, ocr_tool)) + + if ocr_tool == self._PHETOOLS: + # if _multi_page, try _do_hocr() first and fall back to _do_ocr() + if self._multi_page: + error, text = self._do_hocr() + if not error: + return text + pywikibot.warning('%s: phetools hocr failed, ' + 'falling back to ocr.' % self) + + error, text = self._do_ocr(ocr_tool=ocr_tool) + if not error: return text else: - raise ValueError('Not possible to perform HOCR/OCR on %s.' % self) + raise ValueError('%s: not possible to perform OCR.' % self)
class PurgeRequest(Request): @@ -862,7 +939,8 @@ end = self.num_pages
if not ((1 <= start <= self.num_pages) - and (1 <= end <= self.num_pages) and (start <= end)): + and (1 <= end <= self.num_pages) + and (start <= end)): raise ValueError('start=%s, end=%s are not in valid range (%s, %s)' % (start, end, 1, self.num_pages))
diff --git a/scripts/wikisourcetext.py b/scripts/wikisourcetext.py index 2daacfb..83734d4 100644 --- a/scripts/wikisourcetext.py +++ b/scripts/wikisourcetext.py @@ -18,32 +18,40 @@
The following parameters are supported:
- -index:... name of the index page + -index:... name of the index page.
-pages:<start>-<end>,...<start>-<end>,<start>-<end> - Page range to upload; - optional, start=1, end=djvu file number of images. - Page ranges can be specified as: + Page range to upload; + optional, start=1, end=djvu file number of images. + Page ranges can be specified as:
- | A-B -> pages A until B - | A- -> pages A until number of images - | A -> just page A - | -B -> pages 1 until B + | A-B -> pages A until B + | A- -> pages A until number of images + | A -> just page A + | -B -> pages 1 until B
- -showdiff: show difference between curent text and new text when - saving the page + -showdiff: show difference between current text and new text when + saving the page.
- -ocr: use https://tools.wmflabs.org/phetools OCR tool to get text; - default is False, i.e. only not-(yet)-existing pages in Page - ns will be treated and text will be fetched via preload. + -ocr: use OCR tools hosted on https://tools.wmflabs.org. + By default no OCR is done, i.e. only not-(yet)-existing + pages in Page ns will be treated and text will be fetched + via preload. + If -ocr is provided, default OCR method is: + - https://tools.wmflabs.org/phetools + If ocr:googleOCR is given, OCR method is: + - https://tools.wmflabs.org/ws-google-ocr
- -force: overwrite existing pages; - default is False; valid only if '-ocr' is selected. + -threads:n number of threads used to fetch OCR from OCR tools. + default is 5; valid only if '-ocr' is selected.
- -summary: custom edit summary. - Use quotes if edit summary contains spaces. + -force: overwrite existing pages; + default is False; valid only if '-ocr' is selected.
- -always don't bother asking to confirm any of the changes. + -summary: custom edit summary. + Use quotes if edit summary contains spaces. + + -always don't bother asking to confirm any of the changes. """ # # (C) Pywikibot team, 2016-2018 @@ -52,7 +60,11 @@ # from __future__ import absolute_import, division, unicode_literals
+import collections import itertools +import sys +import threading +import time
import pywikibot
@@ -61,6 +73,11 @@ from pywikibot.bot import SingleSiteBot from pywikibot.proofreadpage import IndexPage, ProofreadPage
+if sys.version_info[0] > 2: + import queue +else: + import Queue as queue +
class UploadTextBot(SingleSiteBot):
@@ -77,6 +94,13 @@ """ Initializer.
+ If OCR is requested, spawns worker threads, and, if no "force" option + is set, filter for existing pages. + + Queues are used for communication to/from threads. + A PriorityQueue is used to process pages in the same order as + they are generated. + @param generator: page generator @type generator: generator """ @@ -84,7 +108,8 @@ 'showdiff': False, 'force': False, 'ocr': False, - 'summary': 'Bot: uploading text' + 'summary': 'Bot: uploading text', + 'threads': 5 }) super(UploadTextBot, self).__init__(**kwargs) self.generator = generator @@ -95,6 +120,59 @@ self.options['summary'] = i18n.twtranslate( self.site, 'djvutext-creating')
+ if self.getOption('ocr'): + self._num_threads = self.getOption('threads') + self._queue_in = queue.Queue() + self._queue_out = queue.PriorityQueue() + + # If not "-force", no reason to get OCR for existing pages + # and to process them in Bot.run(). + if not self.getOption('force'): + self.generator = (p for p in self.generator if not p.exists()) + self._spawn_ocr_threads() + + def _spawn_ocr_threads(self): + """Spawn threads for _ocr_worker workers.""" + for i in range(self._num_threads): + worker = threading.Thread(target=self._ocr_worker) + worker.setDaemon(True) + worker.start() + + self._pages = collections.OrderedDict() + for idx, p in enumerate(self.generator): + self._pages.setdefault(p, idx) + self.generator = (p for p in self._pages) # recreate gen for run() + + for p, idx in self._pages.items(): + self._queue_in.put((p, idx)) # idx to preserve order later + + def _ocr_worker(self): + """Fetch OCR content from ocr_tool and queue it.""" + while True: + page, idx = self._queue_in.get() + try: + text_body = page.ocr(ocr_tool=self.getOption('ocr')) + except ValueError as e: + # TODO: is it a problem in PY2? + pywikibot.error(str(e)) + text_body = None # Sentinel: signal exception to self.treat() + + self._queue_out.put((idx, text_body)) + self._queue_in.task_done() + + def _get_ocr(self, page): + """Get OCR content for page from PriorityQueue.""" + # blocks until OCR for expected idx is available + expected_idx = self._pages.get(page) + while True: + if self._queue_out.empty(): + time.sleep(0.2) # some pause + continue + idx, text_body = self._queue_out.queue[0] # peek first element + if idx == expected_idx: + idx, text_body = self._queue_out.get() + return text_body + def treat(self, page): """Process one ProofreadPage page.
@@ -114,7 +192,12 @@ old_text = ''
if self.getOption('ocr'): - page.body = page.ocr() + _body = self._get_ocr(page) + if _body is None: + pywikibot.output('No OCR found. Skipping {}' + .format(page.title(as_link=True))) + return + page.body = _body
if (page.exists() and not (self.getOption('ocr') and self.getOption('force'))): @@ -151,7 +234,9 @@ elif arg == '-summary': options['summary'] = value elif arg == '-ocr': - options['ocr'] = True + options['ocr'] = value or 'phetools' + elif arg == '-threads': + options['threads'] = int(value) elif arg == '-force': options['force'] = True elif arg == '-always': @@ -197,7 +282,7 @@ gen_list = [] for start, end in sorted(pages): gen = index.page_gen(start=start, end=end, - filter_ql=[1], content=False) + filter_ql=[1], content=True) gen_list.append(gen)
gen = itertools.chain(*gen_list) diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index 0f69c73..cd77337 100644 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -320,6 +320,69 @@ self.assertEqual(page.quality_level, 0)
+class TestPageOCR(TestCase): + + """Test page ocr functions.""" + + family = 'wikisource' + code = 'en' + + cached = True + + data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10', + 'hocr': (False, 'ENTERED, according to Act of Congress, in the ' + 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office ' + 'of the Librarian of Congress, at ' + 'Washington.\n\n'), + 'ocr': (False, 'lam-mam, according to Act of Congress, in the ' + 'year 157-2,\nBY D. APPLEION Av CO.,\nIn the ' + 'Of\ufb01ce or the Librarian of ' + 'Congress, at Washington.\n\n'), + 'googleOCR': (False, 'ENTERED, according to Act of Congress, in ' + 'the year 1572,\nBY D. APPLETON & CO.\n' + 'In the Office of the Librarian of ' + 'Congress, at Washington.\n4 334\n'), + } + + def setUp(self): + """Test setUp.""" + site = self.get_site() + title = self.data['title'] + self.page = ProofreadPage(site, title) + super(TestPageOCR, self).setUp() + + def test_ocr_exceptions(self): + """Test page.ocr() exceptions.""" + self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy') + + def test_do_hocr(self): + """Test page._do_hocr().""" + error, text = self.page._do_hocr() + ref_error, ref_text = self.data['hocr'] + self.assertEqual(error, ref_error) + self.assertEqual(text, ref_text) + + def test_do_ocr_phetools(self): + """Test page._do_ocr(ocr_tool='phetools').""" + error, text = self.page._do_ocr(ocr_tool='phetools') + ref_error, ref_text = self.data['ocr'] + self.assertEqual(error, ref_error) + self.assertEqual(text, ref_text) + + def test_do_ocr_googleocr(self): + """Test page._do_ocr(ocr_tool='googleOCR').""" + error, text = self.page._do_ocr(ocr_tool='googleOCR') + ref_error, ref_text = self.data['googleOCR'] + self.assertEqual(error, ref_error) + self.assertEqual(text, ref_text) + + def test_ocr_googleocr(self): + """Test page.ocr(ocr_tool='googleOCR').""" + text = self.page.ocr(ocr_tool='googleOCR') + ref_error, ref_text = self.data['googleOCR'] + self.assertEqual(text, ref_text) + + 
@require_modules('bs4') class TestProofreadPageIndexProperty(TestCase):
pywikibot-commits@lists.wikimedia.org