jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/339906 )
Change subject: djvu.py: added features and refactored
......................................................................
djvu.py: added features and refactored
Added:
- cache control decorator
- page number check decorator
- retrieval of page info from djvu file
Refactored:
- tiny wrapper of subprocess Popen() to reduce code repetition
Added tests. Renamed test file myfile.djvu to myfilé.djvu to test non-ascii filenames.
Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
---
M pywikibot/tools/djvu.py
R tests/data/djvu/myfilé.djvu
M tests/djvu_tests.py
3 files changed, 219 insertions(+), 45 deletions(-)
Approvals:
  jenkins-bot: Verified
  Xqt: Looks good to me, approved
diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py index da7c595..1eee4e3 100644 --- a/pywikibot/tools/djvu.py +++ b/pywikibot/tools/djvu.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """Wrapper around djvulibre to access djvu files properties and content.""" # -# (C) Pywikibot team, 2015-2016 +# (C) Pywikibot team, 2015-2017 # # Distributed under the terms of the MIT license. # @@ -10,10 +10,54 @@
__version__ = '$Id$'
-import os.path +import collections +import os +import re import subprocess
-from pywikibot.logging import error +import pywikibot + +from pywikibot.tools import ( + deprecated, deprecated_args, + StringTypes, +) + + +def _call_cmd(args, lib='djvulibre'): + """ + Tiny wrapper around subprocess.Popen(). + + @param args: same as Popen() + @type args: sequence or string + + @param library: library to be logged in logging messages + @type library: string + + @param log: log process output; errors are always logged. + @type library: bool + + + @return: returns a tuple (res, stdoutdata), where + res is True if dp.returncode != 0 else False + """ + if not isinstance(args, StringTypes): + # upcast if any param in sequence args is not in StringTypes + args = [str(a) if not isinstance(a, StringTypes) else a for a in args] + cmd = ' '.join(args) + else: + cmd = args + + dp = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdoutdata, stderrdata = dp.communicate() + + if dp.returncode != 0: + pywikibot.error('{0} error; {1}'.format(lib, cmd)) + pywikibot.error('{0}'.format(stderrdata)) + return (False, stdoutdata) + + pywikibot.log('SUCCESS: {0} (PID: {1})'.format(cmd, dp.pid)) + + return (True, stdoutdata)
class DjVuFile(object): @@ -28,39 +72,143 @@
"""
- def __init__(self, file_djvu): + @deprecated_args(file_djvu='file') + def __init__(self, file): """ Constructor.
- @param file_djvu: filename (including path) to djvu file - @type file_djvu: string/unicode + @param file: filename (including path) to djvu file + @type file: string/unicode """ - file_djvu = os.path.expanduser(file_djvu) + file = os.path.expanduser(file) + file = os.path.abspath(file) # Check file exists and has read permissions. - with open(file_djvu): - self.file_djvu = file_djvu + with open(file): + self.file = file + self.dirname = os.path.dirname(file)
- def number_of_images(self): - """Return the (cached) number of images in the djvu file.""" - if not hasattr(self, '_image_count'): - dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = dp.communicate() - if dp.returncode != 0: - error('djvulibre library error!\n%s' % stderrdata) - self._image_count = int(stdoutdata) - return self._image_count + # pattern for parsing of djvudump output. + self._pat_form = re.compile( + r' *?FORM:DJVU *?\[\d+\] *?(?P<id>{[^}]*?})? *?\[P(?P<n>\d+)\]') + self._pat_info = re.compile(r'DjVu.*?(?P<size>\d+x\d+).*?(?P<dpi>\d+) dpi')
- def has_text(self): - """Test if the djvu file has a text-layer.""" + @property + @deprecated('DjVuFile.file') + def file_djvu(self): + """Deprecated file_djvu instance variable.""" + return self.file + + def check_cache(fn): # flake8: disable=N805 + """Decorator to check if cache shall be cleared.""" + cache = ['_page_count', '_has_text', '_page_info'] + + def wrapper(obj, *args, **kwargs): + force = kwargs.get('force', False) + if force: + for el in cache: + obj.__dict__.pop(el, None) + _res = fn(obj, *args, **kwargs) + return _res + return wrapper + + def check_page_number(fn): # flake8: disable=N805 + """Decorator to check if page number is valid. + + @raises ValueError + """ + def wrapper(obj, *args, **kwargs): + n = args[0] + force = kwargs.get('force', False) + if not (1 <= n <= obj.number_of_images(force=force)): + raise ValueError('Page %d not in file %s [%d-%d]' + % (n, obj.file, n, obj.number_of_images())) + _res = fn(obj, *args, **kwargs) + return _res + return wrapper + + @check_cache + def number_of_images(self, force=False): + """ + Return the number of images in the djvu file. + + @param force: if True, refresh the cached data + @type force: bool + """ + if not hasattr(self, '_page_count'): + res, stdoutdata = _call_cmd(['djvused', '-e', 'n', self.file]) + if not res: + return False + self._page_count = int(stdoutdata) + return self._page_count + + @check_page_number + def page_info(self, n, force=False): + """ + Return a tuple (id, (size, dpi)) for page n of djvu file. + + @param force: if True, refresh the cached data + @type force: bool + """ + if not hasattr(self, '_page_info'): + self._get_page_info(force=force) + return self._page_info[n] + + @check_cache + def _get_page_info(self, force=False): + """ + Return a dict of tuples (id, (size, dpi)) for all pages of djvu file. 
+ + @param force: if True, refresh the cached data + @type force: bool + """ + if not hasattr(self, '_page_info'): + self._page_info = {} + + res, stdoutdata = _call_cmd(['djvudump', self.file]) + if not res: + return False + + has_text = False + for line in stdoutdata.decode('utf-8').split('\n'): + if 'TXTz' in line: + has_text = True + + if 'FORM:DJVU' in line: + m = self._pat_form.search(line) + if m: + key, id = int(m.group('n')), m.group('id') + else: + key, id = '', 1 + + if 'INFO' in line: + m = self._pat_info.search(line) + if m: + size, dpi = m.group('size'), int(m.group('dpi')) + else: + size, dpi = None, None + else: + continue + + self._page_info[key] = (id, (size, dpi)) + self._has_text = has_text + return self._page_info + + def get_most_common_info(self): + """Return most common size and dpi for pages in djvu file.""" + cnt = collections.Counter(s_d for _, s_d in self._get_page_info().values()) + (size, dpi), _ = cnt.most_common()[0] + return size, dpi + + @check_cache + def has_text(self, force=False): + """ + Test if the djvu file has a text-layer. + + @param force: if True, refresh the cached data + @type force: bool + """ if not hasattr(self, '_has_text'): - dp = subprocess.Popen(['djvudump', self.file_djvu], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = dp.communicate() - if dp.returncode != 0: - error('djvulibre library error!\n%s' % stderrdata) - txt = stdoutdata.decode('utf-8') - self._has_text = 'TXTz' in txt + self._get_page_info(force=force) return self._has_text
def _remove_control_chars(self, data): @@ -79,17 +227,23 @@ txt = txt.strip('\x0c\n ') return txt
- def get_page(self, n): - """Get page n for djvu file.""" - if not self.has_text(): - raise ValueError('Djvu file %s has no text layer.' % self.file_djvu) - if not (1 <= n <= self.number_of_images()): - raise ValueError('Requested page number %d is not in file %s' - ' page range [%d-%d]' - % (n, self.file_djvu, 1, self.number_of_images())) - dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - (stdoutdata, stderrdata) = dp.communicate() - if dp.returncode != 0: - error('djvulibre library error!\n%s' % stderrdata) + @check_page_number + @check_cache + def get_page(self, n, force=False): + """ + Get page n for djvu file. + + @param force: if True, refresh the cached data + @type force: bool + """ + if not self.has_text(force=force): + raise ValueError('Djvu file %s has no text layer.' % self.file) + res, stdoutdata = _call_cmd(['djvutxt', '--page=%d' % n, self.file]) + if not res: + return False return self._remove_control_chars(stdoutdata) + + # This is to be used only if this class is subclassed and the decorators + # needs to be used by the child. + check_page_number = staticmethod(check_page_number) + check_cache = staticmethod(check_cache) diff --git a/tests/data/djvu/myfile.djvu "b/tests/data/djvu/myfil\303\251.djvu" old mode 100755 new mode 100644 similarity index 100% rename from tests/data/djvu/myfile.djvu rename to "tests/data/djvu/myfil\303\251.djvu" Binary files differ diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py index ce38022..31a6fb0 100644 --- a/tests/djvu_tests.py +++ b/tests/djvu_tests.py @@ -1,9 +1,9 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -"""Unit tests for djvutext.py script.""" +"""Unit tests for djvu.py."""
# -# (C) Pywikibot team, 2015 +# (C) Pywikibot team, 2017 # # Distributed under the terms of the MIT license. # @@ -27,7 +27,7 @@ net = False
file_djvu_not_existing = join_djvu_data_path('not_existing.djvu') - file_djvu = join_djvu_data_path('myfile.djvu') + file_djvu = join_djvu_data_path('myfilé.djvu') # test non-ascii name file_djvu_wo_text = join_djvu_data_path('myfile_wo_text.djvu') test_txt = 'A file with non-ASCII characters, \nlike é or ç'
@@ -44,13 +44,24 @@ def test_file_existance(self): """Test file existence checks.""" djvu = DjVuFile(self.file_djvu) - self.assertEqual(self.file_djvu, djvu.file_djvu) + self.assertEqual(self.file_djvu, djvu.file) self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing)
def test_number_of_images(self): """Test page number generator.""" djvu = DjVuFile(self.file_djvu) self.assertEqual(djvu.number_of_images(), 4) + + def test_page_info(self): + """Test page info retrieval.""" + djvu = DjVuFile(self.file_djvu) + self.assertEqual(djvu.page_info(1), + ('{myfile.djvu}', ('1092x221', 600))) + + def test_get_most_common_info(self): + """Test page number generator.""" + djvu = DjVuFile(self.file_djvu) + self.assertEqual(djvu.get_most_common_info(), ('1092x221', 600))
def test_has_text(self): """Test if djvu file contains text.""" @@ -78,6 +89,15 @@ self.assertFalse(djvu.has_text()) self.assertRaises(ValueError, djvu.get_page, 100)
+ def test_clear_cache(self): + """Test if djvu file contains text.""" + djvu = DjVuFile(self.file_djvu) + self.assertTrue(djvu.has_text()) + djvu._has_text = False + self.assertFalse(djvu.has_text()) + self.assertTrue(djvu.has_text(force=True)) + + if __name__ == '__main__': # pragma: no cover try: unittest.main()