jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/339906 )
Change subject: djvu.py: added features and refactored
......................................................................
djvu.py: added features and refactored
Added:
- cache control decorator
- page number check decorator
- retrieval of page info from djvu file
Refactored:
- tiny wrapper of subprocess Popen() to reduce code repetition
Added tests.
Renamed test file myfile.djvu to myfilé.djvu to test non-ascii
filenames.
Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
---
M pywikibot/tools/djvu.py
R tests/data/djvu/myfilé.djvu
M tests/djvu_tests.py
3 files changed, 219 insertions(+), 45 deletions(-)
Approvals:
jenkins-bot: Verified
Xqt: Looks good to me, approved
diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py
index da7c595..1eee4e3 100644
--- a/pywikibot/tools/djvu.py
+++ b/pywikibot/tools/djvu.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
"""Wrapper around djvulibre to access djvu files properties and
content."""
#
-# (C) Pywikibot team, 2015-2016
+# (C) Pywikibot team, 2015-2017
#
# Distributed under the terms of the MIT license.
#
@@ -10,10 +10,54 @@
__version__ = '$Id$'
-import os.path
+import collections
+import os
+import re
import subprocess
-from pywikibot.logging import error
+import pywikibot
+
+from pywikibot.tools import (
+ deprecated, deprecated_args,
+ StringTypes,
+)
+
+
+def _call_cmd(args, lib='djvulibre'):
+ """
+ Tiny wrapper around subprocess.Popen().
+
+ @param args: same as Popen()
+ @type args: sequence or string
+
+ @param library: library to be logged in logging messages
+ @type library: string
+
+ @param log: log process output; errors are always logged.
+ @type library: bool
+
+
+ @return: returns a tuple (res, stdoutdata), where
+ res is True if dp.returncode != 0 else False
+ """
+ if not isinstance(args, StringTypes):
+ # upcast if any param in sequence args is not in StringTypes
+ args = [str(a) if not isinstance(a, StringTypes) else a for a in args]
+ cmd = ' '.join(args)
+ else:
+ cmd = args
+
+ dp = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdoutdata, stderrdata = dp.communicate()
+
+ if dp.returncode != 0:
+ pywikibot.error('{0} error; {1}'.format(lib, cmd))
+ pywikibot.error('{0}'.format(stderrdata))
+ return (False, stdoutdata)
+
+ pywikibot.log('SUCCESS: {0} (PID: {1})'.format(cmd, dp.pid))
+
+ return (True, stdoutdata)
class DjVuFile(object):
@@ -28,39 +72,143 @@
"""
- def __init__(self, file_djvu):
+ @deprecated_args(file_djvu='file')
+ def __init__(self, file):
"""
Constructor.
- @param file_djvu: filename (including path) to djvu file
- @type file_djvu: string/unicode
+ @param file: filename (including path) to djvu file
+ @type file: string/unicode
"""
- file_djvu = os.path.expanduser(file_djvu)
+ file = os.path.expanduser(file)
+ file = os.path.abspath(file)
# Check file exists and has read permissions.
- with open(file_djvu):
- self.file_djvu = file_djvu
+ with open(file):
+ self.file = file
+ self.dirname = os.path.dirname(file)
- def number_of_images(self):
- """Return the (cached) number of images in the djvu file."""
- if not hasattr(self, '_image_count'):
- dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- (stdoutdata, stderrdata) = dp.communicate()
- if dp.returncode != 0:
- error('djvulibre library error!\n%s' % stderrdata)
- self._image_count = int(stdoutdata)
- return self._image_count
+ # pattern for parsing of djvudump output.
+ self._pat_form = re.compile(
+ r' *?FORM:DJVU *?\[\d+\] *?(?P<id>{[^\}]*?})? *?\[P(?P<n>\d+)\]')
+ self._pat_info = re.compile(r'DjVu.*?(?P<size>\d+x\d+).*?(?P<dpi>\d+) dpi')
- def has_text(self):
- """Test if the djvu file has a text-layer."""
+ @property
+ @deprecated('DjVuFile.file')
+ def file_djvu(self):
+ """Deprecated file_djvu instance variable."""
+ return self.file
+
+ def check_cache(fn): # flake8: disable=N805
+ """Decorator to check if cache shall be cleared."""
+ cache = ['_page_count', '_has_text', '_page_info']
+
+ def wrapper(obj, *args, **kwargs):
+ force = kwargs.get('force', False)
+ if force:
+ for el in cache:
+ obj.__dict__.pop(el, None)
+ _res = fn(obj, *args, **kwargs)
+ return _res
+ return wrapper
+
+ def check_page_number(fn): # flake8: disable=N805
+ """Decorator to check if page number is valid.
+
+ @raises ValueError
+ """
+ def wrapper(obj, *args, **kwargs):
+ n = args[0]
+ force = kwargs.get('force', False)
+ if not (1 <= n <= obj.number_of_images(force=force)):
+ raise ValueError('Page %d not in file %s [%d-%d]'
+ % (n, obj.file, n, obj.number_of_images()))
+ _res = fn(obj, *args, **kwargs)
+ return _res
+ return wrapper
+
+ @check_cache
+ def number_of_images(self, force=False):
+ """
+ Return the number of images in the djvu file.
+
+ @param force: if True, refresh the cached data
+ @type force: bool
+ """
+ if not hasattr(self, '_page_count'):
+ res, stdoutdata = _call_cmd(['djvused', '-e', 'n', self.file])
+ if not res:
+ return False
+ self._page_count = int(stdoutdata)
+ return self._page_count
+
+ @check_page_number
+ def page_info(self, n, force=False):
+ """
+ Return a tuple (id, (size, dpi)) for page n of djvu file.
+
+ @param force: if True, refresh the cached data
+ @type force: bool
+ """
+ if not hasattr(self, '_page_info'):
+ self._get_page_info(force=force)
+ return self._page_info[n]
+
+ @check_cache
+ def _get_page_info(self, force=False):
+ """
+ Return a dict of tuples (id, (size, dpi)) for all pages of djvu file.
+
+ @param force: if True, refresh the cached data
+ @type force: bool
+ """
+ if not hasattr(self, '_page_info'):
+ self._page_info = {}
+
+ res, stdoutdata = _call_cmd(['djvudump', self.file])
+ if not res:
+ return False
+
+ has_text = False
+ for line in stdoutdata.decode('utf-8').split('\n'):
+ if 'TXTz' in line:
+ has_text = True
+
+ if 'FORM:DJVU' in line:
+ m = self._pat_form.search(line)
+ if m:
+ key, id = int(m.group('n')), m.group('id')
+ else:
+ key, id = '', 1
+
+ if 'INFO' in line:
+ m = self._pat_info.search(line)
+ if m:
+ size, dpi = m.group('size'), int(m.group('dpi'))
+ else:
+ size, dpi = None, None
+ else:
+ continue
+
+ self._page_info[key] = (id, (size, dpi))
+ self._has_text = has_text
+ return self._page_info
+
+ def get_most_common_info(self):
+ """Return most common size and dpi for pages in djvu file."""
+ cnt = collections.Counter(s_d for _, s_d in self._get_page_info().values())
+ (size, dpi), _ = cnt.most_common()[0]
+ return size, dpi
+
+ @check_cache
+ def has_text(self, force=False):
+ """
+ Test if the djvu file has a text-layer.
+
+ @param force: if True, refresh the cached data
+ @type force: bool
+ """
if not hasattr(self, '_has_text'):
- dp = subprocess.Popen(['djvudump', self.file_djvu],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- (stdoutdata, stderrdata) = dp.communicate()
- if dp.returncode != 0:
- error('djvulibre library error!\n%s' % stderrdata)
- txt = stdoutdata.decode('utf-8')
- self._has_text = 'TXTz' in txt
+ self._get_page_info(force=force)
return self._has_text
def _remove_control_chars(self, data):
@@ -79,17 +227,23 @@
txt = txt.strip('\x0c\n ')
return txt
- def get_page(self, n):
- """Get page n for djvu file."""
- if not self.has_text():
- raise ValueError('Djvu file %s has no text layer.' % self.file_djvu)
- if not (1 <= n <= self.number_of_images()):
- raise ValueError('Requested page number %d is not in file %s'
- ' page range [%d-%d]'
- % (n, self.file_djvu, 1, self.number_of_images()))
- dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- (stdoutdata, stderrdata) = dp.communicate()
- if dp.returncode != 0:
- error('djvulibre library error!\n%s' % stderrdata)
+ @check_page_number
+ @check_cache
+ def get_page(self, n, force=False):
+ """
+ Get page n for djvu file.
+
+ @param force: if True, refresh the cached data
+ @type force: bool
+ """
+ if not self.has_text(force=force):
+ raise ValueError('Djvu file %s has no text layer.' % self.file)
+ res, stdoutdata = _call_cmd(['djvutxt', '--page=%d' % n, self.file])
+ if not res:
+ return False
return self._remove_control_chars(stdoutdata)
+
+ # This is to be used only if this class is subclassed and the decorators
+ # needs to be used by the child.
+ check_page_number = staticmethod(check_page_number)
+ check_cache = staticmethod(check_cache)
diff --git a/tests/data/djvu/myfile.djvu "b/tests/data/djvu/myfil\303\251.djvu"
old mode 100755
new mode 100644
similarity index 100%
rename from tests/data/djvu/myfile.djvu
rename to "tests/data/djvu/myfil\303\251.djvu"
Binary files differ
diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py
index ce38022..31a6fb0 100644
--- a/tests/djvu_tests.py
+++ b/tests/djvu_tests.py
@@ -1,9 +1,9 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
-"""Unit tests for djvutext.py script."""
+"""Unit tests for djvu.py."""
#
-# (C) Pywikibot team, 2015
+# (C) Pywikibot team, 2017
#
# Distributed under the terms of the MIT license.
#
@@ -27,7 +27,7 @@
net = False
file_djvu_not_existing = join_djvu_data_path('not_existing.djvu')
- file_djvu = join_djvu_data_path('myfile.djvu')
+ file_djvu = join_djvu_data_path('myfilé.djvu') # test non-ascii name
file_djvu_wo_text = join_djvu_data_path('myfile_wo_text.djvu')
test_txt = 'A file with non-ASCII characters, \nlike é or ç'
@@ -44,13 +44,24 @@
def test_file_existance(self):
"""Test file existence checks."""
djvu = DjVuFile(self.file_djvu)
- self.assertEqual(self.file_djvu, djvu.file_djvu)
+ self.assertEqual(self.file_djvu, djvu.file)
self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing)
def test_number_of_images(self):
"""Test page number generator."""
djvu = DjVuFile(self.file_djvu)
self.assertEqual(djvu.number_of_images(), 4)
+
+ def test_page_info(self):
+ """Test page info retrieval."""
+ djvu = DjVuFile(self.file_djvu)
+ self.assertEqual(djvu.page_info(1),
+ ('{myfile.djvu}', ('1092x221', 600)))
+
+ def test_get_most_common_info(self):
+ """Test page number generator."""
+ djvu = DjVuFile(self.file_djvu)
+ self.assertEqual(djvu.get_most_common_info(), ('1092x221', 600))
def test_has_text(self):
"""Test if djvu file contains text."""
@@ -78,6 +89,15 @@
self.assertFalse(djvu.has_text())
self.assertRaises(ValueError, djvu.get_page, 100)
+ def test_clear_cache(self):
+ """Test if djvu file contains text."""
+ djvu = DjVuFile(self.file_djvu)
+ self.assertTrue(djvu.has_text())
+ djvu._has_text = False
+ self.assertFalse(djvu.has_text())
+ self.assertTrue(djvu.has_text(force=True))
+
+
if __name__ == '__main__': # pragma: no cover
try:
unittest.main()
--
To view, visit
https://gerrit.wikimedia.org/r/339906
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>