[Gerrit] pywikibot/core[master]: djvu.py: added features and refactored - Pywikibot-commits

5 Mar 2017

jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/339906 )
Change subject: djvu.py: added features and refactored
......................................................................
djvu.py: added features and refactored
Added:
- cache control decorator
- page number check decorator
- retrieval of page info from djvu file
Refactored:
- tiny wrapper of subprocess Popen() to reduce code repetition
Added tests.
Renamed test file myfile.djvu to myfilé.djvu to test non-ascii
filenames.
Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
---
M pywikibot/tools/djvu.py
R tests/data/djvu/myfilé.djvu
M tests/djvu_tests.py
3 files changed, 219 insertions(+), 45 deletions(-)
Approvals:
  jenkins-bot: Verified
  Xqt: Looks good to me, approved

diff --git a/pywikibot/tools/djvu.py b/pywikibot/tools/djvu.py
index da7c595..1eee4e3 100644
--- a/pywikibot/tools/djvu.py
+++ b/pywikibot/tools/djvu.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 """Wrapper around djvulibre to access djvu files properties and content."""
 #
-# (C) Pywikibot team, 2015-2016
+# (C) Pywikibot team, 2015-2017
 #
 # Distributed under the terms of the MIT license.
 #
@@ -10,10 +10,54 @@
 __version__ = '$Id$'
-import os.path
+import collections
+import os
+import re
 import subprocess
-from pywikibot.logging import error
+import pywikibot
+
+from pywikibot.tools import (
+    deprecated, deprecated_args,
+    StringTypes,
+)
+
+
+def _call_cmd(args, lib='djvulibre'):
+    """
+    Tiny wrapper around subprocess.Popen().
+
+    @param args: same as Popen()
+    @type args: sequence or string
+
+    @param lib: library to be logged in logging messages
+    @type lib: string
+
+    @param log: log process output; errors are always logged.
+    @type library: bool
+
+
+    @return: returns a tuple (res, stdoutdata), where
+        res is True if dp.returncode == 0 else False
+    """
+    if not isinstance(args, StringTypes):
+        # upcast if any param in sequence args is not in StringTypes
+        args = [str(a) if not isinstance(a, StringTypes) else a for a in args]
+        cmd = ' '.join(args)
+    else:
+        cmd = args
+
+    dp = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdoutdata, stderrdata = dp.communicate()
+
+    if dp.returncode != 0:
+        pywikibot.error('{0} error; {1}'.format(lib, cmd))
+        pywikibot.error('{0}'.format(stderrdata))
+        return (False, stdoutdata)
+
+    pywikibot.log('SUCCESS: {0} (PID: {1})'.format(cmd, dp.pid))
+
+    return (True, stdoutdata)
 class DjVuFile(object):
@@ -28,39 +72,143 @@
     """
-    def __init__(self, file_djvu):
+    @deprecated_args(file_djvu='file')
+    def __init__(self, file):
         """
         Constructor.
-        @param file_djvu: filename (including path) to djvu file
-        @type file_djvu: string/unicode
+        @param file: filename (including path) to djvu file
+        @type file: string/unicode
         """
-        file_djvu = os.path.expanduser(file_djvu)
+        file = os.path.expanduser(file)
+        file = os.path.abspath(file)
         # Check file exists and has read permissions.
-        with open(file_djvu):
-            self.file_djvu = file_djvu
+        with open(file):
+            self.file = file
+        self.dirname = os.path.dirname(file)
-    def number_of_images(self):
-        """Return the (cached) number of images in the djvu file."""
-        if not hasattr(self, '_image_count'):
-            dp = subprocess.Popen(['djvused', '-e', 'n', self.file_djvu],
-                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            (stdoutdata, stderrdata) = dp.communicate()
-            if dp.returncode != 0:
-                error('djvulibre library error!\n%s' % stderrdata)
-            self._image_count = int(stdoutdata)
-        return self._image_count
+        # pattern for parsing of djvudump output.
+        self._pat_form = re.compile(
+            r' *?FORM:DJVU *?[\d+] *?(?P<id>{[^}]*?})? *?[P(?P<n>\d+)]')
+        self._pat_info = re.compile(r'DjVu.*?(?P<size>\d+x\d+).*?(?P<dpi>\d+) dpi')
-    def has_text(self):
-        """Test if the djvu file has a text-layer."""
+    @property
+    @deprecated('DjVuFile.file')
+    def file_djvu(self):
+        """Deprecated file_djvu instance variable."""
+        return self.file
+
+    def check_cache(fn):  # flake8: disable=N805
+        """Decorator to check if cache shall be cleared."""
+        cache = ['_page_count', '_has_text', '_page_info']
+
+        def wrapper(obj, *args, **kwargs):
+            force = kwargs.get('force', False)
+            if force:
+                for el in cache:
+                    obj.__dict__.pop(el, None)
+            _res = fn(obj, *args, **kwargs)
+            return _res
+        return wrapper
+
+    def check_page_number(fn):  # flake8: disable=N805
+        """Decorator to check if page number is valid.
+
+        @raises ValueError
+        """
+        def wrapper(obj, *args, **kwargs):
+            n = args[0]
+            force = kwargs.get('force', False)
+            if not (1 <= n <= obj.number_of_images(force=force)):
+                raise ValueError('Page %d not in file %s [%d-%d]'
+                                 % (n, obj.file, n, obj.number_of_images()))
+            _res = fn(obj, *args, **kwargs)
+            return _res
+        return wrapper
+
+    @check_cache
+    def number_of_images(self, force=False):
+        """
+        Return the number of images in the djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not hasattr(self, '_page_count'):
+            res, stdoutdata = _call_cmd(['djvused', '-e', 'n', self.file])
+            if not res:
+                return False
+            self._page_count = int(stdoutdata)
+        return self._page_count
+
+    @check_page_number
+    def page_info(self, n, force=False):
+        """
+        Return a tuple (id, (size, dpi)) for page n of djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not hasattr(self, '_page_info'):
+            self._get_page_info(force=force)
+        return self._page_info[n]
+
+    @check_cache
+    def _get_page_info(self, force=False):
+        """
+        Return a dict of tuples (id, (size, dpi)) for all pages of djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not hasattr(self, '_page_info'):
+            self._page_info = {}
+
+            res, stdoutdata = _call_cmd(['djvudump', self.file])
+            if not res:
+                return False
+
+            has_text = False
+            for line in stdoutdata.decode('utf-8').split('\n'):
+                if 'TXTz' in line:
+                    has_text = True
+
+                if 'FORM:DJVU' in line:
+                    m = self._pat_form.search(line)
+                    if m:
+                        key, id = int(m.group('n')), m.group('id')
+                    else:
+                        key, id = '', 1
+
+                if 'INFO' in line:
+                    m = self._pat_info.search(line)
+                    if m:
+                        size, dpi = m.group('size'), int(m.group('dpi'))
+                    else:
+                        size, dpi = None, None
+                else:
+                    continue
+
+                self._page_info[key] = (id, (size, dpi))
+            self._has_text = has_text
+        return self._page_info
+
+    def get_most_common_info(self):
+        """Return most common size and dpi for pages in djvu file."""
+        cnt = collections.Counter(s_d for _, s_d in self._get_page_info().values())
+        (size, dpi), _ = cnt.most_common()[0]
+        return size, dpi
+
+    @check_cache
+    def has_text(self, force=False):
+        """
+        Test if the djvu file has a text-layer.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
         if not hasattr(self, '_has_text'):
-            dp = subprocess.Popen(['djvudump', self.file_djvu],
-                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            (stdoutdata, stderrdata) = dp.communicate()
-            if dp.returncode != 0:
-                error('djvulibre library error!\n%s' % stderrdata)
-            txt = stdoutdata.decode('utf-8')
-            self._has_text = 'TXTz' in txt
+            self._get_page_info(force=force)
         return self._has_text
     def _remove_control_chars(self, data):
@@ -79,17 +227,23 @@
         txt = txt.strip('\x0c\n ')
         return txt
-    def get_page(self, n):
-        """Get page n for djvu file."""
-        if not self.has_text():
-            raise ValueError('Djvu file %s has no text layer.' % self.file_djvu)
-        if not (1 <= n <= self.number_of_images()):
-            raise ValueError('Requested page number %d is not in file %s'
-                             ' page range [%d-%d]'
-                             % (n, self.file_djvu, 1, self.number_of_images()))
-        dp = subprocess.Popen(['djvutxt', '--page=%d' % n, self.file_djvu],
-                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        (stdoutdata, stderrdata) = dp.communicate()
-        if dp.returncode != 0:
-            error('djvulibre library error!\n%s' % stderrdata)
+    @check_page_number
+    @check_cache
+    def get_page(self, n, force=False):
+        """
+        Get page n for djvu file.
+
+        @param force: if True, refresh the cached data
+        @type force: bool
+        """
+        if not self.has_text(force=force):
+            raise ValueError('Djvu file %s has no text layer.' % self.file)
+        res, stdoutdata = _call_cmd(['djvutxt', '--page=%d' % n, self.file])
+        if not res:
+            return False
         return self._remove_control_chars(stdoutdata)
+
+    # This is to be used only if this class is subclassed and the decorators
+    # need to be used by the child.
+    check_page_number = staticmethod(check_page_number)
+    check_cache = staticmethod(check_cache)
diff --git a/tests/data/djvu/myfile.djvu "b/tests/data/djvu/myfil\303\251.djvu"
old mode 100755
new mode 100644
similarity index 100%
rename from tests/data/djvu/myfile.djvu
rename to "tests/data/djvu/myfil\303\251.djvu"
Binary files differ
diff --git a/tests/djvu_tests.py b/tests/djvu_tests.py
index ce38022..31a6fb0 100644
--- a/tests/djvu_tests.py
+++ b/tests/djvu_tests.py
@@ -1,9 +1,9 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-"""Unit tests for djvutext.py script."""
+"""Unit tests for djvu.py."""
 #
-# (C) Pywikibot team, 2015
+# (C) Pywikibot team, 2017
 #
 # Distributed under the terms of the MIT license.
 #
@@ -27,7 +27,7 @@
     net = False
     file_djvu_not_existing = join_djvu_data_path('not_existing.djvu')
-    file_djvu = join_djvu_data_path('myfile.djvu')
+    file_djvu = join_djvu_data_path('myfilé.djvu')  # test non-ascii name
     file_djvu_wo_text = join_djvu_data_path('myfile_wo_text.djvu')
     test_txt = 'A file with non-ASCII characters, \nlike é or ç'
@@ -44,13 +44,24 @@
     def test_file_existance(self):
         """Test file existence checks."""
         djvu = DjVuFile(self.file_djvu)
-        self.assertEqual(self.file_djvu, djvu.file_djvu)
+        self.assertEqual(self.file_djvu, djvu.file)
         self.assertRaises(IOError, DjVuFile, self.file_djvu_not_existing)
     def test_number_of_images(self):
         """Test page number generator."""
         djvu = DjVuFile(self.file_djvu)
         self.assertEqual(djvu.number_of_images(), 4)
+
+    def test_page_info(self):
+        """Test page info retrieval."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertEqual(djvu.page_info(1),
+                         ('{myfile.djvu}', ('1092x221', 600)))
+
+    def test_get_most_common_info(self):
+        """Test retrieval of the most common page size and dpi."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertEqual(djvu.get_most_common_info(), ('1092x221', 600))
     def test_has_text(self):
         """Test if djvu file contains text."""
@@ -78,6 +89,15 @@
         self.assertFalse(djvu.has_text())
         self.assertRaises(ValueError, djvu.get_page, 100)
+    def test_clear_cache(self):
+        """Test that force=True clears the cached data."""
+        djvu = DjVuFile(self.file_djvu)
+        self.assertTrue(djvu.has_text())
+        djvu._has_text = False
+        self.assertFalse(djvu.has_text())
+        self.assertTrue(djvu.has_text(force=True))
+
+
 if __name__ == '__main__':  # pragma: no cover
     try:
         unittest.main()
-- 
To view, visit https://gerrit.wikimedia.org/r/339906
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Idf465abe0f9aab3d7c213098ae02335269740ecf
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa mpaa.wiki@gmail.com
Gerrit-Reviewer: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: Magul tomasz.magulski@gmail.com
Gerrit-Reviewer: Mpaa mpaa.wiki@gmail.com
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: jenkins-bot <>