jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/481321 )
Change subject: tools: Support LZMA and XZ formats ......................................................................
tools: Support LZMA and XZ formats
The module that supports these formats, lzma, was added in Python 3.3 [1]. These formats have been shown to achieve better compression ratios at the cost of longer compression times.
[1] https://bugs.python.org/issue6715
Change-Id: I821a48beb755d284576df8c6c9a8f6e8595cb086 --- M pywikibot/tools/__init__.py A tests/data/xml/article-pyrus.xml.lzma A tests/data/xml/article-pyrus.xml.xz M tests/tools_tests.py 4 files changed, 84 insertions(+), 13 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py index 76e0f7a..6888440 100644 --- a/pywikibot/tools/__init__.py +++ b/pywikibot/tools/__init__.py @@ -56,6 +56,11 @@ warn('package bz2 and bz2file were not found', ImportWarning) bz2 = bz2_import_error
+try: + import lzma +except ImportError as lzma_import_error: + lzma = lzma_import_error +
if PYTHON_VERSION < (3, 5): # although deprecated in 3 completely no message was emitted until 3.5 @@ -1249,10 +1254,11 @@ """ Open a file and uncompress it if needed.
- This function supports bzip2, gzip and 7zip as compression containers. It - uses the packages available in the standard library for bzip2 and gzip so - they are always available. 7zip is only available when a 7za program is - available and only supports reading from it. + This function supports bzip2, gzip, 7zip, lzma, and xz as compression + containers. It uses the packages available in the standard library for + bzip2, gzip, lzma, and xz so they are always available. 7zip is only + available when a 7za program is available and only supports reading + from it.
The compression is either selected via the magic number or file ending.
@@ -1274,6 +1280,11 @@ @raises OSError: When it's not a 7z archive but the file extension is 7z. It is also raised by bz2 when its content is invalid. gzip does not immediately raise that error but only on reading it. + @raises lzma.LZMAError: When error occurs during compression or + decompression or when initializing the state with lzma or xz. + @raises ImportError: When file is compressed with bz2 but neither bz2 nor + bz2file is importable, or when file is compressed with lzma or xz but + lzma is not importable. @return: A file-like object returning the uncompressed data in binary mode. @rtype: file-like object """ @@ -1297,6 +1308,9 @@ extension = 'gz' elif magic_number.startswith(b"7z\xBC\xAF'\x1C"): extension = '7z' + # Unfortunately, legacy LZMA container format has no magic number + elif magic_number.startswith(b'\xFD7zXZ\x00'): + extension = 'xz' else: extension = ''
@@ -1304,9 +1318,9 @@ if isinstance(bz2, ImportError): raise bz2 return bz2.BZ2File(filename, mode) - elif extension == 'gz': + if extension == 'gz': return gzip.open(filename, mode) - elif extension == '7z': + if extension == '7z': if mode != 'rb': raise NotImplementedError('It is not possible to write a 7z file.')
@@ -1327,9 +1341,16 @@ 'Unexpected STDERR output from 7za {0}'.format(stderr)) else: return process.stdout - else: - # assume it's an uncompressed file - return open(filename, 'rb') + if extension == 'lzma': + if isinstance(lzma, ImportError): + raise lzma + return lzma.open(filename, mode, format=lzma.FORMAT_ALONE) + if extension == 'xz': + if isinstance(lzma, ImportError): + raise lzma + return lzma.open(filename, mode, format=lzma.FORMAT_XZ) + # assume it's an uncompressed file + return open(filename, 'rb')
def merge_unique_dicts(*args, **kwargs): diff --git a/tests/data/xml/article-pyrus.xml.lzma b/tests/data/xml/article-pyrus.xml.lzma new file mode 100644 index 0000000..816634d --- /dev/null +++ b/tests/data/xml/article-pyrus.xml.lzma Binary files differ diff --git a/tests/data/xml/article-pyrus.xml.xz b/tests/data/xml/article-pyrus.xml.xz new file mode 100644 index 0000000..1d3b79e --- /dev/null +++ b/tests/data/xml/article-pyrus.xml.xz Binary files differ diff --git a/tests/tools_tests.py b/tests/tools_tests.py index 20ff4f1..ce8722b 100644 --- a/tests/tools_tests.py +++ b/tests/tools_tests.py @@ -138,7 +138,7 @@ """Test open_archive when bz2 and bz2file are not available.""" old_bz2 = tools.bz2 bz2_import_error = ('This is a fake exception message that is ' - 'used when bz2 and bz2file is not importable') + 'used when bz2 and bz2file are not importable') try: tools.bz2 = ImportError(bz2_import_error) self.assertRaisesRegex(ImportError, @@ -167,6 +167,38 @@ self.base_file + '_invalid.7z', use_extension=True)
+ def test_open_archive_lzma(self): + """Test open_archive with lzma compressor in the standard library.""" + if isinstance(tools.lzma, ImportError): + raise unittest.SkipTest('lzma not importable') + self.assertEqual( + self._get_content(self.base_file + '.lzma'), self.original_content) + # Legacy LZMA container formet has no magic, skipping + # use_extension=False test here + self.assertEqual( + self._get_content(self.base_file + '.xz'), self.original_content) + self.assertEqual( + self._get_content(self.base_file + '.xz', use_extension=False), + self.original_content) + + def test_open_archive_without_lzma(self): + """Test open_archive when lzma is not available.""" + old_lzma = tools.lzma + lzma_import_error = ('This is a fake exception message that is ' + 'used when lzma is not importable') + try: + tools.lzma = ImportError(lzma_import_error) + self.assertRaisesRegex(ImportError, + lzma_import_error, + self._get_content, + self.base_file + '.lzma') + self.assertRaisesRegex(ImportError, + lzma_import_error, + self._get_content, + self.base_file + '.xz') + finally: + tools.lzma = old_lzma +
class OpenCompressedTestCase(OpenArchiveTestCase, DeprecationTestCase):
@@ -176,9 +208,10 @@
def _get_content(self, *args, **kwargs): """Use open_compressed and return content using a with-statement.""" - # open_archive default is True, so if it's False it's not the default - # so use the non-default of open_compressed (which is True) - if kwargs.get('use_extension') is False: + # open_archive default is True, but open_compressed default is False. + # The test cases assumes a default of True and we need to make + # open_compressed acknowledge that. + if 'use_extension' not in kwargs: kwargs['use_extension'] = True
with tools.open_compressed(*args, **kwargs) as f: @@ -259,6 +292,23 @@ '/dev/null.7z', mode='wb')
+ def test_write_archive_lzma(self): + """Test writing a lzma archive.""" + if isinstance(tools.lzma, ImportError): + raise unittest.SkipTest('lzma not importable') + + content = self._write_content('.lzma') + with open(self.base_file + '.lzma', 'rb') as f: + self.assertEqual(content, f.read()) + + def test_write_archive_xz(self): + """Test writing a xz archive.""" + if isinstance(tools.lzma, ImportError): + raise unittest.SkipTest('lzma not importable') + + content = self._write_content('.xz') + self.assertEqual(content[:6], b'\xFD7zXZ\x00') +
class MergeUniqueDicts(TestCase):
pywikibot-commits@lists.wikimedia.org