jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/476906 )
Change subject: Fix W504 issues across python scripts (XVI)
......................................................................
Fix W504 issues across python scripts (XVI)
This is a GCI submission. The changes are mostly
moved line breaks.
Bug: T207836
Change-Id: I07a314c32331da539f53e73dc32980a355a8b30d
---
M pywikibot/__init__.py
M pywikibot/comms/threadedhttp.py
M pywikibot/date.py
M pywikibot/tools/__init__.py
M pywikibot/tools/formatter.py
M pywikibot/userinterfaces/terminal_interface_unix.py
6 files changed, 23 insertions(+), 22 deletions(-)
Approvals:
Xqt: Looks good to me, but someone else must approve
D3r1ck01: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index 3d9506c..fb07bcd 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -575,8 +575,8 @@
# if precision is given it overwrites the autodetection above
if precision is not None:
- if (isinstance(precision, int) and
- precision in self.PRECISION.values()):
+ if (isinstance(precision, int)
+ and precision in self.PRECISION.values()):
self.precision = precision
elif precision in self.PRECISION:
self.precision = self.PRECISION[precision]
diff --git a/pywikibot/comms/threadedhttp.py b/pywikibot/comms/threadedhttp.py
index d533dbd..065ae62 100644
--- a/pywikibot/comms/threadedhttp.py
+++ b/pywikibot/comms/threadedhttp.py
@@ -153,8 +153,10 @@
charset = 'latin1'
else:
charset = self.charset
- if (self.header_encoding and codecs.lookup(self.header_encoding) !=
- (codecs.lookup(charset) if charset else None)):
+ if (self.header_encoding
+ and codecs.lookup(
+ self.header_encoding) != (
+ codecs.lookup(charset) if charset else None)):
if charset:
pywikibot.warning(
'Encoding "{0}" requested but "{1}" '
@@ -171,8 +173,8 @@
else:
self._encoding = None
- if charset and (isinstance(self._encoding, Exception) or
- not self._encoding):
+ if charset and (isinstance(self._encoding, Exception)
+ or not self._encoding):
try:
self.raw.decode(charset)
except UnicodeError as e:
diff --git a/pywikibot/date.py b/pywikibot/date.py
index 12a0eec..7ba5e27 100644
--- a/pywikibot/date.py
+++ b/pywikibot/date.py
@@ -384,9 +384,9 @@
for s in _reParameters.split(pattern):
if s is None:
continue
- if (len(s) in (2, 3) and s[0] == '%' and
- s[-1] in _digitDecoders and
- (len(s) == 2 or s[1] in _decimalDigits)):
+ if (len(s) in (2, 3) and s[0] == '%'
+ and s[-1] in _digitDecoders
+ and(len(s) == 2 or s[1] in _decimalDigits)):
# Must match a "%2d" or "%d" style
dec = _digitDecoders[s[-1]]
if isinstance(dec, basestring):
@@ -396,7 +396,7 @@
'Invalid pattern {0}: Cannot use zero padding size '
'in {1}!'.format(pattern, s))
newPattern += re.escape(dec)
- strPattern += s # Keep the original text
+ strPattern += s # Keep the original text
else:
if len(s) == 3:
# enforce mandatory field size
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 73a0baf..e68e9d5 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -1430,12 +1430,11 @@
frame = sys._getframe(stacklevel + 1)
class_name = frame.f_code.co_name
if class_name and class_name != '<module>':
- obj.__full_name__ = (obj.__module__ + '.' +
- class_name + '.' +
- obj.__name__)
+ obj.__full_name__ = '{}.{}.{}'.format(
+ obj.__module__, class_name, obj.__name__)
else:
- obj.__full_name__ = (obj.__module__ + '.' +
- obj.__name__)
+ obj.__full_name__ = '{}.{}'.format(
+ obj.__module__, obj.__name__)
def manage_wrapping(wrapper, obj):
@@ -1515,8 +1514,8 @@
# The decorator being decorated may have args, so both
# syntax need to be supported.
- if (len(outer_args) == 1 and len(outer_kwargs) == 0 and
- callable(outer_args[0])):
+ if (len(outer_args) == 1 and len(outer_kwargs) == 0
+ and callable(outer_args[0])):
add_decorated_full_name(outer_args[0])
return obj(outer_args[0])
else:
diff --git a/pywikibot/tools/formatter.py b/pywikibot/tools/formatter.py
index e983696..cc511ce 100644
--- a/pywikibot/tools/formatter.py
+++ b/pywikibot/tools/formatter.py
@@ -151,8 +151,8 @@
@rtype: unicode
"""
if self.colors.intersection(kwargs): # kwargs use colors
- raise ValueError('Keyword argument(s) use valid color(s): ' +
- '", "'.join(self.colors.intersection(kwargs)))
+ raise ValueError('Keyword argument(s) use valid color(s): '
+ + '", "'.join(self.colors.intersection(kwargs)))
if not isinstance(format_string, UnicodeType):
raise TypeError('expected {0}, got {1}'
.format(type(''), type(format_string)))
diff --git a/pywikibot/userinterfaces/terminal_interface_unix.py b/pywikibot/userinterfaces/terminal_interface_unix.py
index 517bb2c..9e06aa0 100755
--- a/pywikibot/userinterfaces/terminal_interface_unix.py
+++ b/pywikibot/userinterfaces/terminal_interface_unix.py
@@ -61,9 +61,9 @@
if sys.version_info[0] == 2:
# .encoding does not mean we can write unicode
# to the stream pre-2.7.
- if (sys.version_info >= (2, 7) and
- hasattr(targetStream, 'encoding') and
- targetStream.encoding):
+ if (sys.version_info >= (2, 7)
+ and hasattr(targetStream, 'encoding')
+ and targetStream.encoding):
text = text.encode(targetStream.encoding, 'replace').decode(
targetStream.encoding)
else:
--
To view, visit https://gerrit.wikimedia.org/r/476906
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I07a314c32331da539f53e73dc32980a355a8b30d
Gerrit-Change-Number: 476906
Gerrit-PatchSet: 1
Gerrit-Owner: To matih <maatteeoh(a)wp.pl>
Gerrit-Reviewer: D3r1ck01 <alangiderick(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/475597 )
Change subject: proofreadpage.py: test connection with wmflabs
......................................................................
proofreadpage.py: test connection with wmflabs
Add test to check connectivity with wmflabs servers.
This is useful to check if tests fail due to problems
with such requests.
Change-Id: I9dff9289740d966158a8f0205b8ba54adbb9f4ed
---
M tests/proofreadpage_tests.py
1 file changed, 11 insertions(+), 0 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index cd77337..74474b3 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -11,6 +11,7 @@
import pywikibot
+from pywikibot.comms import http
from pywikibot.data import api
from pywikibot.proofreadpage import IndexPage, ProofreadPage
@@ -362,6 +363,16 @@
self.assertEqual(error, ref_error)
self.assertEqual(text, ref_text)
+ def test_do_ocr_phetools_raw_request(self):
+ """Test page._do_ocr connection with wmflabs."""
+ uri = ('https://tools.wmflabs.org/phetools/ocr.php?cmd=ocr'
+ '&url=https://upload.wikimedia.org/wikipedia/commons/'
+ 'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
+ 'page10-1024px-Popular_Science_Monthly_Volume_1.djvu.jpg'
+ '&lang=en&user=None')
+ response = http.fetch(uri)
+ self.assertEqual(response.status, 200)
+
def test_do_ocr_phetools(self):
"""Test page._do_ocr(ocr_tool='phetools')."""
error, text = self.page._do_ocr(ocr_tool='phetools')
--
To view, visit https://gerrit.wikimedia.org/r/475597
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I9dff9289740d966158a8f0205b8ba54adbb9f4ed
Gerrit-Change-Number: 475597
Gerrit-PatchSet: 1
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/446973 )
Change subject: proofreadpage.py: insert support for googleOCR
......................................................................
proofreadpage.py: insert support for googleOCR
In addition to 'phetools', add support for googleOCR at:
https://tools.wmflabs.org/ws-google-ocr/api.php
Communication with toollabs is managed by a pool of threads
to increase performance.
Tests added.
wikisourcetext.py has been modified accordingly to support this
additional option.
Change-Id: Ie56d0534f945b5e5a614a78e4095f1efa52001d0
---
M pywikibot/proofreadpage.py
M scripts/wikisourcetext.py
M tests/proofreadpage_tests.py
3 files changed, 288 insertions(+), 62 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index e8694c1..e22c2e6 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -14,7 +14,11 @@
OCR support of page scans via:
- https://tools.wmflabs.org/phetools/hocr_cgi.py
- https://tools.wmflabs.org/phetools/ocr.php
-inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+- inspired by https://en.wikisource.org/wiki/MediaWiki:Gadget-ocr.js
+
+- https://tools.wmflabs.org/ws-google-ocr/
+- inspired by https://wikisource.org/wiki/MediaWiki:GoogleOCR.js
+- see also: https://wikisource.org/wiki/Wikisource:Google_OCR
"""
#
@@ -27,6 +31,8 @@
from functools import partial
import json
import re
+import requests
+import time
try:
from bs4 import BeautifulSoup, FeatureNotFound
@@ -111,14 +117,25 @@
p_open = re.compile(r'<noinclude>')
p_close = re.compile(r'(</div>|\n\n\n)?</noinclude>')
- # phe-tools ocr utility
- HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
- 'cmd=hocr&book={book}&lang={lang}&user={user}')
+ # phetools ocr utility
+ _HOCR_CMD = ('https://tools.wmflabs.org/phetools/hocr_cgi.py?'
+ 'cmd=hocr&book={book}&lang={lang}&user={user}')
- OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
- 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
+ _OCR_CMD = ('https://tools.wmflabs.org/phetools/ocr.php?'
+ 'cmd=ocr&url={url_image}&lang={lang}&user={user}')
- MULTI_PAGE_EXT = ['djvu', 'pdf']
+ # googleOCR ocr utility
+ _GOCR_CMD = ('https://tools.wmflabs.org/ws-google-ocr/api.php?'
+ 'image={url_image}&lang={lang}')
+
+ _MULTI_PAGE_EXT = ['djvu', 'pdf']
+
+ _PHETOOLS = 'phetools'
+ _googleOCR = 'googleOCR'
+ _OCR_CMDS = {_PHETOOLS: _OCR_CMD,
+ _googleOCR: _GOCR_CMD,
+ }
+ _OCR_METHODS = list(_OCR_CMDS.keys())
def __init__(self, source, title=''):
"""Instantiate a ProofreadPage object.
@@ -140,7 +157,7 @@
self.PROOFREAD_LEVELS))
self._base, self._base_ext, self._num = self._parse_title()
- self._multi_page = self._base_ext in self.MULTI_PAGE_EXT
+ self._multi_page = self._base_ext in self._MULTI_PAGE_EXT
@property
def _fmt(self):
@@ -539,44 +556,74 @@
return self._url_image
- def _ocr_callback(self, cmd_uri, parser_func=None):
+ def _ocr_callback(self, cmd_uri, parser_func=None, ocr_tool=None):
"""OCR callback function.
@return: tuple (error, text [error description in case of error]).
"""
- def id(x):
+ def identity(x):
return x
if not cmd_uri:
raise ValueError('Parameter cmd_uri is mandatory.')
if parser_func is None:
- parser_func = id
+ parser_func = identity
if not callable(parser_func):
raise TypeError('Keyword parser_func must be callable.')
+ if ocr_tool not in self._OCR_METHODS:
+ raise TypeError(
+ "ocr_tool must be in %s, not '%s'." %
+ (self._OCR_METHODS, ocr_tool))
+
# wrong link fail with Exceptions
- try:
- response = http.fetch(cmd_uri, charset='utf-8')
- except Exception as e:
- pywikibot.error('Querying %s: %s' % (cmd_uri, e))
- return (True, e)
+ retry = 0
+ while retry < 5:
+ pywikibot.debug('{0}: get URI {1!r}'.format(ocr_tool, cmd_uri),
+ _logger)
+ try:
+ response = http.fetch(cmd_uri)
+ except requests.exceptions.ReadTimeout as e:
+ retry += 1
+ pywikibot.warning('ReadTimeout %s: %s' % (cmd_uri, e))
+ pywikibot.warning('retrying in %s seconds ...' % (retry * 5))
+ time.sleep(retry * 5)
+ except Exception as e:
+ pywikibot.error('"%s": %s' % (cmd_uri, e))
+ return (True, e)
+ else:
+ pywikibot.debug('{0}: {1}'.format(ocr_tool, response.text),
+ _logger)
+ break
data = json.loads(response.text)
- assert 'error' in data, 'Error from phe-tools: %s' % data
- assert data['error'] in [0, 1], 'Error from phe-tools: %s' % data
+ if ocr_tool == self._PHETOOLS: # phetools
+ assert 'error' in data, 'Error from phetools: %s' % data
+ assert data['error'] in [0, 1, 2, 3], (
+ 'Error from phetools: %s' % data)
+ error, _text = bool(data['error']), data['text']
+ else: # googleOCR
+ if 'error' in data:
+ error, _text = True, data['error']
+ else:
+ error, _text = False, data['text']
- error = bool(data['error'])
if error:
- pywikibot.error('Querying %s: %s' % (cmd_uri, data['text']))
- return (error, data['text'])
+ pywikibot.error('OCR query %s: %s' % (cmd_uri, _text))
+ return (error, _text)
else:
- return (error, parser_func(data['text']))
+ return (error, parser_func(_text))
def _do_hocr(self):
- """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr."""
+ """Do hocr using //tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr.
+
+ This is the main method for 'phetools'.
+ Fallback method is ocr.
+
+ """
def parse_hocr_text(txt):
"""Parse hocr text."""
soup = Soup(txt)
@@ -595,12 +642,14 @@
'user': self.site.user(),
}
- cmd_uri = self.HOCR_CMD.format(**params)
+ cmd_uri = self._HOCR_CMD.format(**params)
- return self._ocr_callback(cmd_uri, parser_func=parse_hocr_text)
+ return self._ocr_callback(cmd_uri,
+ parser_func=parse_hocr_text,
+ ocr_tool=self._PHETOOLS)
- def _do_ocr(self):
- """Do ocr using //tools.wmflabs.org/phetools/ocr.pmp?cmd=ocr."""
+ def _do_ocr(self, ocr_tool=None):
+ """Do ocr using specified ocr_tool method."""
try:
url_image = self.url_image
except ValueError:
@@ -613,28 +662,56 @@
'user': self.site.user(),
}
- cmd_uri = self.OCR_CMD.format(**params)
+ try:
+ cmd_fmt = self._OCR_CMDS[ocr_tool]
+ except KeyError:
+ raise TypeError(
+ "ocr_tool must be in %s, not '%s'." %
+ (self._OCR_METHODS, ocr_tool))
- return self._ocr_callback(cmd_uri)
+ cmd_uri = cmd_fmt.format(**params)
- def ocr(self):
- """Do OCR of Proofreadpage scan.
+ return self._ocr_callback(cmd_uri, ocr_tool=ocr_tool)
- The text returned by this function shalle be assign to self.body,
+ def ocr(self, ocr_tool=None):
+ """Do OCR of ProofreadPage scan.
+
+ The text returned by this function shall be assigned to self.body,
otherwise the ProofreadPage format will not be maintained.
It is the user's responsibility to reset quality level accordingly.
- """
- if self._multi_page:
- error, text = self._do_hocr()
- if not error:
- return text
- error, text = self._do_ocr()
+ @param ocr_tool: 'phetools' or 'googleOCR', default is 'phetools'
+ @type ocr_tool: basestring
+
+ @return: OCR text for the page.
+
+ @raise TypeError: wrong ocr_tool keyword arg.
+ @raise ValueError: something went wrong with OCR process.
+ """
+ if ocr_tool is None: # default value
+ ocr_tool = self._PHETOOLS
+
+ if ocr_tool not in self._OCR_METHODS:
+ raise TypeError(
+ "ocr_tool must be in %s, not '%s'." %
+ (self._OCR_METHODS, ocr_tool))
+
+ if ocr_tool == self._PHETOOLS:
+ # if _multi_page, try _do_hocr() first and fall back to _do_ocr()
+ if self._multi_page:
+ error, text = self._do_hocr()
+ if not error:
+ return text
+ pywikibot.warning('%s: phetools hocr failed, '
+ 'falling back to ocr.' % self)
+
+ error, text = self._do_ocr(ocr_tool=ocr_tool)
+
if not error:
return text
else:
- raise ValueError('Not possible to perform HOCR/OCR on %s.' % self)
+ raise ValueError('%s: not possible to perform OCR.' % self)
class PurgeRequest(Request):
@@ -862,7 +939,8 @@
end = self.num_pages
if not ((1 <= start <= self.num_pages)
- and (1 <= end <= self.num_pages) and (start <= end)):
+ and (1 <= end <= self.num_pages)
+ and (start <= end)):
raise ValueError('start=%s, end=%s are not in valid range (%s, %s)'
% (start, end, 1, self.num_pages))
diff --git a/scripts/wikisourcetext.py b/scripts/wikisourcetext.py
index 2daacfb..83734d4 100644
--- a/scripts/wikisourcetext.py
+++ b/scripts/wikisourcetext.py
@@ -18,32 +18,40 @@
The following parameters are supported:
- -index:... name of the index page
+ -index:... name of the index page.
-pages:<start>-<end>,...<start>-<end>,<start>-<end>
- Page range to upload;
- optional, start=1, end=djvu file number of images.
- Page ranges can be specified as:
+ Page range to upload;
+ optional, start=1, end=djvu file number of images.
+ Page ranges can be specified as:
- | A-B -> pages A until B
- | A- -> pages A until number of images
- | A -> just page A
- | -B -> pages 1 until B
+ | A-B -> pages A until B
+ | A- -> pages A until number of images
+ | A -> just page A
+ | -B -> pages 1 until B
- -showdiff: show difference between curent text and new text when
- saving the page
+ -showdiff: show difference between current text and new text when
+ saving the page.
- -ocr: use https://tools.wmflabs.org/phetools OCR tool to get text;
- default is False, i.e. only not-(yet)-existing pages in Page
- ns will be treated and text will be fetched via preload.
+ -ocr: use OCR tools hosted on https://tools.wmflabs.org.
+ By default no OCR is done, i.e. only not-(yet)-existing
+ pages in Page ns will be treated and text will be fetched
+ via preload.
+ If -ocr is provided, default OCR method is:
+ - https://tools.wmflabs.org/phetools
+ If ocr:googleOCR is given, OCR method is:
+ - https://tools.wmflabs.org/ws-google-ocr
- -force: overwrite existing pages;
- default is False; valid only if '-ocr' is selected.
+ -threads:n number of threads used to fetch OCR from OCR tools.
+ default is 5; valid only if '-ocr' is selected.
- -summary: custom edit summary.
- Use quotes if edit summary contains spaces.
+ -force: overwrite existing pages;
+ default is False; valid only if '-ocr' is selected.
- -always don't bother asking to confirm any of the changes.
+ -summary: custom edit summary.
+ Use quotes if edit summary contains spaces.
+
+ -always don't bother asking to confirm any of the changes.
"""
#
# (C) Pywikibot team, 2016-2018
@@ -52,7 +60,11 @@
#
from __future__ import absolute_import, division, unicode_literals
+import collections
import itertools
+import sys
+import threading
+import time
import pywikibot
@@ -61,6 +73,11 @@
from pywikibot.bot import SingleSiteBot
from pywikibot.proofreadpage import IndexPage, ProofreadPage
+if sys.version_info[0] > 2:
+ import queue
+else:
+ import Queue as queue
+
class UploadTextBot(SingleSiteBot):
@@ -77,6 +94,13 @@
"""
Initializer.
+ If OCR is requested, spawns worker threads, and, if no "force" option
+ is set, filter for existing pages.
+
+ Queues are used for communication to/from threads.
+ A PriorityQueue is used to process pages in the same order as
+ they are generated.
+
@param generator: page generator
@type generator: generator
"""
@@ -84,7 +108,8 @@
'showdiff': False,
'force': False,
'ocr': False,
- 'summary': 'Bot: uploading text'
+ 'summary': 'Bot: uploading text',
+ 'threads': 5
})
super(UploadTextBot, self).__init__(**kwargs)
self.generator = generator
@@ -95,6 +120,59 @@
self.options['summary'] = i18n.twtranslate(
self.site, 'djvutext-creating')
+ if self.getOption('ocr'):
+ self._num_threads = self.getOption('threads')
+ self._queue_in = queue.Queue()
+ self._queue_out = queue.PriorityQueue()
+
+ # If not "-force", no reason to get OCR for existing pages
+ # and to process them in Bot.run().
+ if not self.getOption('force'):
+ self.generator = (p for p in self.generator if not p.exists())
+ self._spawn_ocr_threads()
+
+ def _spawn_ocr_threads(self):
+ """Spawn threads for _ocr_worker workers."""
+ for i in range(self._num_threads):
+ worker = threading.Thread(target=self._ocr_worker)
+ worker.setDaemon(True)
+ worker.start()
+
+ self._pages = collections.OrderedDict()
+ for idx, p in enumerate(self.generator):
+ self._pages.setdefault(p, idx)
+ self.generator = (p for p in self._pages) # recreate gen for run()
+
+ for p, idx in self._pages.items():
+ self._queue_in.put((p, idx)) # idx to preserve order later
+
+ def _ocr_worker(self):
+ """Fetch OCR content from ocr_tool and queue it."""
+ while True:
+ page, idx = self._queue_in.get()
+ try:
+ text_body = page.ocr(ocr_tool=self.getOption('ocr'))
+ except ValueError as e:
+ # TODO: is it a problem in PY2?
+ pywikibot.error(str(e))
+ text_body = None # Sentinel: signal exception to self.treat()
+
+ self._queue_out.put((idx, text_body))
+ self._queue_in.task_done()
+
+ def _get_ocr(self, page):
+ """Get OCR content for page from PriorityQueue."""
+ # blocks until OCR for expected idx is available
+ expected_idx = self._pages.get(page)
+ while True:
+ if self._queue_out.empty():
+ time.sleep(0.2) # some pause
+ continue
+ idx, text_body = self._queue_out.queue[0] # peek first element
+ if idx == expected_idx:
+ idx, text_body = self._queue_out.get()
+ return text_body
+
def treat(self, page):
"""Process one ProofreadPage page.
@@ -114,7 +192,12 @@
old_text = ''
if self.getOption('ocr'):
- page.body = page.ocr()
+ _body = self._get_ocr(page)
+ if _body is None:
+ pywikibot.output('No OCR found. Skipping {}'
+ .format(page.title(as_link=True)))
+ return
+ page.body = _body
if (page.exists() and
not (self.getOption('ocr') and self.getOption('force'))):
@@ -151,7 +234,9 @@
elif arg == '-summary':
options['summary'] = value
elif arg == '-ocr':
- options['ocr'] = True
+ options['ocr'] = value or 'phetools'
+ elif arg == '-threads':
+ options['threads'] = int(value)
elif arg == '-force':
options['force'] = True
elif arg == '-always':
@@ -197,7 +282,7 @@
gen_list = []
for start, end in sorted(pages):
gen = index.page_gen(start=start, end=end,
- filter_ql=[1], content=False)
+ filter_ql=[1], content=True)
gen_list.append(gen)
gen = itertools.chain(*gen_list)
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 0f69c73..cd77337 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -320,6 +320,69 @@
self.assertEqual(page.quality_level, 0)
+class TestPageOCR(TestCase):
+
+ """Test page ocr functions."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ cached = True
+
+ data = {'title': 'Page:Popular Science Monthly Volume 1.djvu/10',
+ 'hocr': (False, 'ENTERED, according to Act of Congress, in the '
+ 'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
+ 'of the Librarian of Congress, at '
+ 'Washington.\n\n'),
+ 'ocr': (False, 'lam-mam, according to Act of Congress, in the '
+ 'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
+ 'Of\ufb01ce or the Librarian of '
+ 'Congress, at Washington.\n\n'),
+ 'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
+ 'the year 1572,\nBY D. APPLETON & CO.\n'
+ 'In the Office of the Librarian of '
+ 'Congress, at Washington.\n4 334\n'),
+ }
+
+ def setUp(self):
+ """Test setUp."""
+ site = self.get_site()
+ title = self.data['title']
+ self.page = ProofreadPage(site, title)
+ super(TestPageOCR, self).setUp()
+
+ def test_ocr_exceptions(self):
+ """Test page.ocr() exceptions."""
+ self.assertRaises(TypeError, self.page.ocr, ocr_tool='dummy')
+
+ def test_do_hocr(self):
+ """Test page._do_hocr()."""
+ error, text = self.page._do_hocr()
+ ref_error, ref_text = self.data['hocr']
+ self.assertEqual(error, ref_error)
+ self.assertEqual(text, ref_text)
+
+ def test_do_ocr_phetools(self):
+ """Test page._do_ocr(ocr_tool='phetools')."""
+ error, text = self.page._do_ocr(ocr_tool='phetools')
+ ref_error, ref_text = self.data['ocr']
+ self.assertEqual(error, ref_error)
+ self.assertEqual(text, ref_text)
+
+ def test_do_ocr_googleocr(self):
+ """Test page._do_ocr(ocr_tool='googleOCR')."""
+ error, text = self.page._do_ocr(ocr_tool='googleOCR')
+ ref_error, ref_text = self.data['googleOCR']
+ self.assertEqual(error, ref_error)
+ self.assertEqual(text, ref_text)
+
+ def test_ocr_googleocr(self):
+ """Test page.ocr(ocr_tool='googleOCR')."""
+ text = self.page.ocr(ocr_tool='googleOCR')
+ ref_error, ref_text = self.data['googleOCR']
+ self.assertEqual(text, ref_text)
+
+
@require_modules('bs4')
class TestProofreadPageIndexProperty(TestCase):
--
To view, visit https://gerrit.wikimedia.org/r/446973
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie56d0534f945b5e5a614a78e4095f1efa52001d0
Gerrit-Change-Number: 446973
Gerrit-PatchSet: 17
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/475588 )
Change subject: [PEP8] Fix W504 errors in scripts/interwiki.py
......................................................................
[PEP8] Fix W504 errors in scripts/interwiki.py
Fixed the W504 error occurrences in the following file:
- scripts/interwiki.py
Bug: T207836
Change-Id: I705097ee01f477ebfad278de404daf458d393118
---
M scripts/interwiki.py
1 file changed, 23 insertions(+), 22 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index f8909bd..725cffc 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -985,8 +985,8 @@
if linkedPage in self.foundIn:
# We have seen this page before, don't ask again.
return False
- elif (self.originPage and
- self.originPage.namespace() != linkedPage.namespace()):
+ if (self.originPage
+ and self.originPage.namespace() != linkedPage.namespace()):
# Allow for a mapping between different namespaces
crossFrom = self.originPage.site.family.crossnamespace.get(
self.originPage.namespace(), {})
@@ -1060,9 +1060,9 @@
pywikibot.output('NOTE: Ignoring {} for {} in wiktionary mode'
.format(page, self.originPage))
return True
- elif (page.title() != self.originPage.title() and
- self.originPage.namespace().case == 'case-sensitive' and
- page.namespace().case == 'case-sensitive'):
+ if (page.title() != self.originPage.title()
+ and self.originPage.namespace().case == 'case-sensitive'
+ and page.namespace().case == 'case-sensitive'):
pywikibot.output(
'NOTE: Ignoring {} for {} in wiktionary mode because both '
'languages are uncapitalized.'
@@ -1296,8 +1296,9 @@
elif page.isStaticRedirect():
self.conf.note('not following static {}redirects.'
.format(redir))
- elif (page.site.family == redirectTargetPage.site.family and
- not self.skipPage(page, redirectTargetPage, counter)):
+ elif (page.site.family == redirectTargetPage.site.family
+ and not self.skipPage(page, redirectTargetPage,
+ counter)):
if self.addIfNew(redirectTargetPage, counter, page):
if config.interwiki_shownew:
pywikibot.output('{}: {} gives new {}redirect {}'
@@ -1626,8 +1627,9 @@
(not frgnSiteDone and site != lclSite and site in new):
if site == lclSite:
lclSiteDone = True # even if we fail the update
- if (site.family.name in config.usernames and
- site.code in config.usernames[site.family.name]):
+ if (site.family.name in config.usernames
+ and site.code in config.usernames[
+ site.family.name]):
try:
if self.replaceLinks(new[site], new):
updatedSites.append(site)
@@ -1637,9 +1639,8 @@
notUpdatedSites.append(site)
except GiveUpOnPage:
break
- elif (not self.conf.strictlimittwo and
- site in new and
- site != lclSite):
+ elif (not self.conf.strictlimittwo
+ and site in new and site != lclSite):
old = {}
try:
for link in new[site].iterlanglinks():
@@ -1652,12 +1653,12 @@
mods, mcomment, adding, removing, modifying \
= compareLanguages(old, new, lclSite,
self.conf.summary)
- if ((len(removing) > 0 and not self.conf.autonomous) or
- (len(modifying) > 0 and self.problemfound) or
- (len(old) == 0) or
- (self.conf.needlimit and
- len(adding) + len(modifying) >=
- self.conf.needlimit + 1)):
+ if (len(removing) > 0 and not self.conf.autonomous
+ or len(modifying) > 0 and self.problemfound
+ or len(old) == 0
+ or (self.conf.needlimit
+ and len(adding) + len(modifying)
+ >= self.conf.needlimit + 1)):
try:
if self.replaceLinks(new[site], new):
updatedSites.append(site)
@@ -1809,8 +1810,8 @@
rmPage = old[rmsite]
# put it to new means don't delete it
if (
- not self.conf.cleanup or
- unicode(rmPage) not in self.conf.remove
+ not self.conf.cleanup
+ or unicode(rmPage) not in self.conf.remove
):
new[rmsite] = rmPage
pywikibot.warning(
@@ -2426,8 +2427,8 @@
elif arg.startswith('-years'):
# Look if user gave a specific year at which to start
# Must be a natural number or negative integer.
- if len(arg) > 7 and (arg[7:].isdigit() or
- (arg[7] == '-' and arg[8:].isdigit())):
+ if len(arg) > 7 and (arg[7:].isdigit()
+ or (arg[7] == '-' and arg[8:].isdigit())):
startyear = int(arg[7:])
else:
startyear = 1
--
To view, visit https://gerrit.wikimedia.org/r/475588
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I705097ee01f477ebfad278de404daf458d393118
Gerrit-Change-Number: 475588
Gerrit-PatchSet: 3
Gerrit-Owner: Nathan fraignt <nathanklumpenaar10(a)outlook.com>
Gerrit-Reviewer: D3r1ck01 <alangiderick(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Nathan fraignt <nathanklumpenaar10(a)outlook.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/462498 )
Change subject: [IMPR] use enumerate instead of a counter variable
......................................................................
[IMPR] use enumerate instead of a counter variable
Also fix the total number
Change-Id: I4ee945befa9cd8648355bffb19886b8d51a92ad2
---
M scripts/redirect.py
1 file changed, 4 insertions(+), 5 deletions(-)
Approvals:
Dvorapa: Looks good to me, but someone else must approve
Dalba: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/redirect.py b/scripts/redirect.py
index 5fff9d7..f387739 100755
--- a/scripts/redirect.py
+++ b/scripts/redirect.py
@@ -340,15 +340,14 @@
break
elif self.xmlFilename:
redict = self.get_redirects_from_dump()
- num = 0
- for (key, value) in redict.items():
- num += 1
+ total = len(redict)
+ for num, (key, value) in enumerate(redict.items(), start=1):
# check if the value - that is, the redirect target - is a
# redirect as well
if num > self.offset and value in redict:
- yield key
pywikibot.output('\nChecking redirect {0} of {1}...'
- .format(num + 1, len(redict)))
+ .format(num, total))
+ yield key
elif self.page_title:
yield self.page_title
else:
--
To view, visit https://gerrit.wikimedia.org/r/462498
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I4ee945befa9cd8648355bffb19886b8d51a92ad2
Gerrit-Change-Number: 462498
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)
Gerrit-CC: Matěj Suchánek <matejsuchanek97(a)gmail.com>
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/475270 )
Change subject: Localisation updates from https://translatewiki.net.
......................................................................
Localisation updates from https://translatewiki.net.
Change-Id: I6132bb65239ae4f717b74920512aecce4c27d905
---
M category/frr.json
M unusedfiles/frr.json
2 files changed, 3 insertions(+), 1 deletion(-)
Approvals:
L10n-bot: Looks good to me, approved
jenkins-bot: Verified
diff --git a/category/frr.json b/category/frr.json
index 524e332..0667327 100644
--- a/category/frr.json
+++ b/category/frr.json
@@ -13,6 +13,8 @@
"category-replacing": "Bot: Bütje kategorii %(oldcat)s ütj mä %(newcat)s",
"category-section-title": "Werjuunshistoore faan det ual %(oldcat)s",
"category-strip-cfd-templates": "Bot: Nem CFD-föörlaagen wech, wan det aktjuun tu aanj as",
+ "category-strip-sort-keys": "Bot: Nem sortiarkaier wech, wan det aktjuun tu aanj as",
+ "category-strip-both": "Bot: Nem CFD-föörlaagen an sortiarkaier wech, wan det aktjuun tu aanj as",
"category-version-history": "Bot: Seekere werjuunshistoore faan det ual %(oldcat)s",
"category-was-disbanded": "Bot: Kategorii as apliaset wurden",
"category-was-moved": "Bot: Kategorii as fersköwen wurden tu [[:Category:%(newcat)s|%(title)s]]"
diff --git a/unusedfiles/frr.json b/unusedfiles/frr.json
index b742086..ea3a893 100644
--- a/unusedfiles/frr.json
+++ b/unusedfiles/frr.json
@@ -4,5 +4,5 @@
"Murma174"
]
},
- "unusedfiles-comment": "Bot: Bilen tu wechnemen"
+ "unusedfiles-comment": "Bot: Datei as kääntiakend üs alianstunen"
}
--
To view, visit https://gerrit.wikimedia.org/r/475270
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/i18n
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I6132bb65239ae4f717b74920512aecce4c27d905
Gerrit-Change-Number: 475270
Gerrit-PatchSet: 1
Gerrit-Owner: L10n-bot <l10n-bot(a)translatewiki.net>
Gerrit-Reviewer: L10n-bot <l10n-bot(a)translatewiki.net>
Gerrit-Reviewer: jenkins-bot (75)