jenkins-bot merged this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
tools: Support LZMA and XZ formats

The module to support these formats, lzma, was added in Python 3.3 [1].
The format has been shown to achieve better compression ratios at the
cost of longer compression times.

[1] https://bugs.python.org/issue6715

Change-Id: I821a48beb755d284576df8c6c9a8f6e8595cb086
---
M pywikibot/tools/__init__.py
A tests/data/xml/article-pyrus.xml.lzma
A tests/data/xml/article-pyrus.xml.xz
M tests/tools_tests.py
4 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 76e0f7a..6888440 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -56,6 +56,11 @@
warn('package bz2 and bz2file were not found', ImportWarning)
bz2 = bz2_import_error

+try:
+ import lzma
+except ImportError as lzma_import_error:
+ lzma = lzma_import_error
+

if PYTHON_VERSION < (3, 5):
# although deprecated in 3 completely no message was emitted until 3.5
@@ -1249,10 +1254,11 @@
"""
Open a file and uncompress it if needed.

- This function supports bzip2, gzip and 7zip as compression containers. It
- uses the packages available in the standard library for bzip2 and gzip so
- they are always available. 7zip is only available when a 7za program is
- available and only supports reading from it.
+ This function supports bzip2, gzip, 7zip, lzma, and xz as compression
+ containers. It uses the packages available in the standard library for
+ bzip2, gzip, lzma, and xz so they are always available. 7zip is only
+ available when a 7za program is available and only supports reading
+ from it.

The compression is either selected via the magic number or file ending.

@@ -1274,6 +1280,11 @@
@raises OSError: When it's not a 7z archive but the file extension is 7z.
It is also raised by bz2 when its content is invalid. gzip does not
immediately raise that error but only on reading it.
+ @raises lzma.LZMAError: When an error occurs during compression or
+ decompression or when initializing the state with lzma or xz.
+ @raises ImportError: When file is compressed with bz2 but neither bz2 nor
+ bz2file is importable, or when file is compressed with lzma or xz but
+ lzma is not importable.
@return: A file-like object returning the uncompressed data in binary mode.
@rtype: file-like object
"""
@@ -1297,6 +1308,9 @@
extension = 'gz'
elif magic_number.startswith(b"7z\xBC\xAF'\x1C"):
extension = '7z'
+ # Unfortunately, legacy LZMA container format has no magic number
+ elif magic_number.startswith(b'\xFD7zXZ\x00'):
+ extension = 'xz'
else:
extension = ''

@@ -1304,9 +1318,9 @@
if isinstance(bz2, ImportError):
raise bz2
return bz2.BZ2File(filename, mode)
- elif extension == 'gz':
+ if extension == 'gz':
return gzip.open(filename, mode)
- elif extension == '7z':
+ if extension == '7z':
if mode != 'rb':
raise NotImplementedError('It is not possible to write a 7z file.')

@@ -1327,9 +1341,16 @@
'Unexpected STDERR output from 7za {0}'.format(stderr))
else:
return process.stdout
- else:
- # assume it's an uncompressed file
- return open(filename, 'rb')
+ if extension == 'lzma':
+ if isinstance(lzma, ImportError):
+ raise lzma
+ return lzma.open(filename, mode, format=lzma.FORMAT_ALONE)
+ if extension == 'xz':
+ if isinstance(lzma, ImportError):
+ raise lzma
+ return lzma.open(filename, mode, format=lzma.FORMAT_XZ)
+ # assume it's an uncompressed file
+ return open(filename, 'rb')


def merge_unique_dicts(*args, **kwargs):
diff --git a/tests/data/xml/article-pyrus.xml.lzma b/tests/data/xml/article-pyrus.xml.lzma
new file mode 100644
index 0000000..816634d
--- /dev/null
+++ b/tests/data/xml/article-pyrus.xml.lzma
Binary files differ
diff --git a/tests/data/xml/article-pyrus.xml.xz b/tests/data/xml/article-pyrus.xml.xz
new file mode 100644
index 0000000..1d3b79e
--- /dev/null
+++ b/tests/data/xml/article-pyrus.xml.xz
Binary files differ
diff --git a/tests/tools_tests.py b/tests/tools_tests.py
index 20ff4f1..ce8722b 100644
--- a/tests/tools_tests.py
+++ b/tests/tools_tests.py
@@ -138,7 +138,7 @@
"""Test open_archive when bz2 and bz2file are not available."""
old_bz2 = tools.bz2
bz2_import_error = ('This is a fake exception message that is '
- 'used when bz2 and bz2file is not importable')
+ 'used when bz2 and bz2file are not importable')
try:
tools.bz2 = ImportError(bz2_import_error)
self.assertRaisesRegex(ImportError,
@@ -167,6 +167,38 @@
self.base_file + '_invalid.7z',
use_extension=True)

+ def test_open_archive_lzma(self):
+ """Test open_archive with lzma compressor in the standard library."""
+ if isinstance(tools.lzma, ImportError):
+ raise unittest.SkipTest('lzma not importable')
+ self.assertEqual(
+ self._get_content(self.base_file + '.lzma'), self.original_content)
+ # Legacy LZMA container format has no magic, skipping
+ # use_extension=False test here
+ self.assertEqual(
+ self._get_content(self.base_file + '.xz'), self.original_content)
+ self.assertEqual(
+ self._get_content(self.base_file + '.xz', use_extension=False),
+ self.original_content)
+
+ def test_open_archive_without_lzma(self):
+ """Test open_archive when lzma is not available."""
+ old_lzma = tools.lzma
+ lzma_import_error = ('This is a fake exception message that is '
+ 'used when lzma is not importable')
+ try:
+ tools.lzma = ImportError(lzma_import_error)
+ self.assertRaisesRegex(ImportError,
+ lzma_import_error,
+ self._get_content,
+ self.base_file + '.lzma')
+ self.assertRaisesRegex(ImportError,
+ lzma_import_error,
+ self._get_content,
+ self.base_file + '.xz')
+ finally:
+ tools.lzma = old_lzma
+

class OpenCompressedTestCase(OpenArchiveTestCase, DeprecationTestCase):

@@ -176,9 +208,10 @@

def _get_content(self, *args, **kwargs):
"""Use open_compressed and return content using a with-statement."""
- # open_archive default is True, so if it's False it's not the default
- # so use the non-default of open_compressed (which is True)
- if kwargs.get('use_extension') is False:
+ # open_archive default is True, but open_compressed default is False.
+ # The test cases assume a default of True and we need to make
+ # open_compressed acknowledge that.
+ if 'use_extension' not in kwargs:
kwargs['use_extension'] = True

with tools.open_compressed(*args, **kwargs) as f:
@@ -259,6 +292,23 @@
'/dev/null.7z',
mode='wb')

+ def test_write_archive_lzma(self):
+ """Test writing a lzma archive."""
+ if isinstance(tools.lzma, ImportError):
+ raise unittest.SkipTest('lzma not importable')
+
+ content = self._write_content('.lzma')
+ with open(self.base_file + '.lzma', 'rb') as f:
+ self.assertEqual(content, f.read())
+
+ def test_write_archive_xz(self):
+ """Test writing a xz archive."""
+ if isinstance(tools.lzma, ImportError):
+ raise unittest.SkipTest('lzma not importable')
+
+ content = self._write_content('.xz')
+ self.assertEqual(content[:6], b'\xFD7zXZ\x00')
+

class MergeUniqueDicts(TestCase):


To view, visit change 481321. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I821a48beb755d284576df8c6c9a8f6e8595cb086
Gerrit-Change-Number: 481321
Gerrit-PatchSet: 4
Gerrit-Owner: Zhuyifei1999 <zhuyifei1999@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus@gmx.de>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999@gmail.com>
Gerrit-Reviewer: jenkins-bot (75)