jenkins-bot has submitted this change and it was merged.
Change subject: [FEAT] upload: Get stash info before continuing ......................................................................
[FEAT] upload: Get stash info before continuing
If the response does contain warnings it won't contain the offset for the continuation (T73737). So it requests the stash info for the given file key to determine the offset. This also allows to properly reuse the file key as it will create a new file key instance when the offset is 0. It also prevents that the API shows an error if the offset is neither 0 nor the actual size uploaded.
And if the offset is given it can verify that it matches the stashed data. It also supports comparing the SHA1 of the already uploaded part to minimize the chance that a different file is uploaded for continuation. It now also verifies that the offset is not larger than the file size. In case the file is large and doing an SHA1 would be time consuming it's possible to not verify them.
The offset is not present if the warning occurred in the first chunk which would have caused that the first chunk is discarded when continuing to upload. To do that the file key must be defined before it issues the warnings.
Also when uploading in the unchunked mode and a file key is provided it did upload the complete file even though the offset was set to `False` to indicate the file is completely uploaded.
One test is an expected failure due to T112416.
Change-Id: I610a0744d74dc5a2523d96eabde818f76dd6c5e8 --- M pywikibot/site.py M tests/upload_tests.py 2 files changed, 221 insertions(+), 19 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/site.py b/pywikibot/site.py index 8cb66a8..9e7dde9 100644 --- a/pywikibot/site.py +++ b/pywikibot/site.py @@ -16,6 +16,7 @@ #
import datetime +import hashlib import itertools import os import re @@ -5177,10 +5178,20 @@ raise return self._uploaddisabled
+ def stash_info(self, file_key, props=False): + """Get the stash info for a given file key.""" + if not props: + props = False + req = self._simple_request( + action='query', prop='stashimageinfo', siifilekey=file_key, + siiprop=props) + return req.submit()['query']['stashimageinfo'][0] + @deprecate_arg('imagepage', 'filepage') def upload(self, filepage, source_filename=None, source_url=None, comment=None, text=None, watch=False, ignore_warnings=False, - chunk_size=0, _file_key=None, _offset=0, report_success=None): + chunk_size=0, _file_key=None, _offset=0, _verify_stash=None, + report_success=None): """Upload a file to the wiki.
Either source_filename or source_url, but not both, must be provided. @@ -5215,8 +5226,15 @@ @type _file_key: str or None @param _offset: When file_key is not None this can be an integer to continue a previously canceled chunked upload. If False it treats - that as a finished upload. By default starts at 0. + that as a finished upload. If True it requests the stash info from + the server to determine the offset. By default starts at 0. @type _offset: int or bool + @param _verify_stash: Requests the SHA1 and file size uploaded and + compares it to the local file. Also verifies that _offset is + matching the file size if the _offset is an int. If _offset is False + if verifies that the file size match with the local file. If None + it'll verifies the stash when a file key and offset is given. + @type _verify_stash: bool or None @param report_success: If the upload was successful it'll print a success message and if ignore_warnings is set to False it'll raise an UploadWarning if a warning occurred. If it's None (default) @@ -5231,7 +5249,7 @@ api.UploadWarning( warning, upload_warnings.get(warning, '%(msg)s') % {'msg': data}, - _file_key, response.get('offset', 0)) + _file_key, response['offset']) for warning, data in response['warnings'].items()]
upload_warnings = { @@ -5287,7 +5305,76 @@ token = self.tokens['edit'] result = None file_page_title = filepage.title(withNamespace=False) - if _file_key and _offset is False: + file_size = None + offset = _offset + # make sure file actually exists + if source_filename: + if os.path.isfile(source_filename): + file_size = os.path.getsize(source_filename) + elif offset is not False: + raise ValueError("File '%s' does not exist." + % source_filename) + + if source_filename and _file_key: + assert offset is False or file_size is not None + if _verify_stash is None: + _verify_stash = True + if (offset is not False and offset is not True and + offset > file_size): + raise ValueError( + 'For the file key "{0}" the offset was set to {1} ' + 'while the file is only {2} bytes large.'.format( + _file_key, offset, file_size)) + + if _verify_stash or offset is True: + if not _file_key: + raise ValueError('Without a file key it cannot request the ' + 'stash information') + if not source_filename: + raise ValueError('Can request stash information only when ' + 'using a file name.') + props = ['size'] + if _verify_stash: + props += ['sha1'] + stash_info = self.stash_info(_file_key, props) + if offset is True: + offset = stash_info['size'] + elif offset is False: + if file_size != stash_info['size']: + raise ValueError( + 'For the file key "{0}" the server reported a size {1} ' + 'while the file size is {2}'.format( + _file_key, stash_info['size'], file_size)) + elif offset is not False and offset != stash_info['size']: + raise ValueError( + 'For the file key "{0}" the server reported a size {1} ' + 'while the offset was {2}'.format( + _file_key, stash_info['size'], offset)) + + if _verify_stash: + # The SHA1 was also requested so calculate and compare it + assert 'sha1' in stash_info, \ + 'sha1 not in stash info: {0}'.format(stash_info) + sha1 = hashlib.sha1() + bytes_to_read = offset + with open(source_filename, 'rb') as f: + while bytes_to_read > 0: + read_bytes = f.read(min(bytes_to_read, 1 << 20)) + assert read_bytes # make sure we actually read bytes + bytes_to_read -= len(read_bytes) + sha1.update(read_bytes) + sha1 = sha1.hexdigest() + if sha1 != stash_info['sha1']: + raise ValueError( + 'The SHA1 of {0} bytes of the stashed "{1}" is {2} ' + 'while the local file is {3}'.format( + offset, _file_key, stash_info['sha1'], sha1)) + + assert offset is not True + if _file_key and file_size is None: + assert offset is False + + if _file_key and offset is False or offset == file_size: pywikibot.log('Reused already upload file using ' 'filekey "{0}"'.format(_file_key)) # TODO: Use sessionkey instead of filekey if necessary @@ -5299,10 +5386,6 @@ # TODO: Dummy value to allow also Unicode names, see bug 73661 mime_filename = 'FAKE-NAME' # upload local file - # make sure file actually exists - if not os.path.isfile(source_filename): - raise ValueError("File '%s' does not exist." - % source_filename) throttle = True filesize = os.path.getsize(source_filename) chunked_upload = (chunk_size > 0 and chunk_size < filesize and @@ -5313,7 +5396,6 @@ 'action': 'upload', 'token': token, 'text': text, 'filename': file_page_title, 'comment': comment}) if chunked_upload: - offset = _offset if offset > 0: pywikibot.log('Continuing upload from byte ' '{0}'.format(offset)) @@ -5343,13 +5425,16 @@ if error.code == u'uploaddisabled': self._uploaddisabled = True raise error + _file_key = data['filekey'] if 'warnings' in data and not ignore_all_warnings: if callable(ignore_warnings): + if 'offset' not in data: + data['offset'] = True if ignore_warnings(create_warnings_list(data)): # Future warnings of this run can be ignored ignore_warnings = True ignore_all_warnings = True - offset = result.get('offset', 0) + offset = data['offset'] continue else: return False @@ -5357,7 +5442,6 @@ if 'offset' not in result: result['offset'] = 0 break - _file_key = data['filekey'] throttle = False if 'offset' in data: new_offset = int(data['offset']) @@ -5376,13 +5460,16 @@ final_request['filekey'] = _file_key break else: # not chunked upload - file_contents = f.read() - filetype = (mimetypes.guess_type(source_filename)[0] or - 'application/octet-stream') - final_request.mime_params = { - 'file': (file_contents, filetype.split('/'), - {'filename': mime_filename}) - } + if _file_key: + final_request['filekey'] = _file_key + else: + file_contents = f.read() + filetype = (mimetypes.guess_type(source_filename)[0] or + 'application/octet-stream') + final_request.mime_params = { + 'file': (file_contents, filetype.split('/'), + {'filename': mime_filename}) + } else: # upload by URL if "upload_by_url" not in self.userinfo["rights"]: @@ -5417,11 +5504,13 @@ _file_key = None pywikibot.warning('No filekey defined.') if not report_success: + if 'offset' not in result: + result['offset'] = True if ignore_warnings(create_warnings_list(result)): return self.upload( filepage, source_filename, source_url, comment, text, watch, True, chunk_size, _file_key, - result.get('offset', False), report_success=False) + result['offset'], report_success=False) else: return False warn('When ignore_warnings=False in APISite.upload will change ' diff --git a/tests/upload_tests.py b/tests/upload_tests.py index 4d1b1ea..aed4a18 100644 --- a/tests/upload_tests.py +++ b/tests/upload_tests.py @@ -17,6 +17,8 @@
import pywikibot
+from pywikibot.data.api import APIError + from tests import _images_dir from tests.aspects import unittest, TestCase
@@ -29,6 +31,9 @@
family = 'wikipedia' code = 'test' + + sounds_png = os.path.join(_images_dir, 'MP_sounds.png') + arrow_png = os.path.join(_images_dir, '1rightarrow.png')
def test_png(self): """Test uploading a png using Site.upload.""" @@ -46,6 +51,114 @@ comment='pywikibot test', ignore_warnings=True, chunk_size=1024)
+ def _init_upload(self, chunk_size): + """Do an initial upload causing an abort because of warnings.""" + def warn_callback(warnings): + """A simple callback not automatically finishing the upload.""" + self.assertCountEqual([w.code for w in warnings], expected_warns) + # by now we know there are only two but just make sure + assert len(warnings) == len(expected_warns) + assert len(expected_warns) in [1, 2] + if len(expected_warns) == 2: + self.assertEqual(warnings[0].file_key, warnings[1].file_key) + self.assertEqual(warnings[0].offset, warnings[1].offset) + self._file_key = warnings[0].file_key + self._offset = warnings[0].offset + + if chunk_size: + expected_warns = ['exists'] + else: + expected_warns = ['duplicate', 'exists'] + + # First upload the warning with warnings enabled + page = pywikibot.FilePage(self.site, 'MP_sounds-pwb.png') + self.assertFalse(hasattr(self, '_file_key')) + self.site.upload(page, source_filename=self.sounds_png, + comment='pywikibot test', chunk_size=chunk_size, + ignore_warnings=warn_callback) + + # Check that the warning happened and it's cached + self.assertTrue(hasattr(self, '_file_key')) + self.assertIs(self._offset, True) + self.assertRegex(self._file_key, r'[0-9a-z]+.[0-9a-z]+.\d+.png') + self._verify_stash() + + def _verify_stash(self): + info = self.site.stash_info(self._file_key, ['size', 'sha1']) + if info['size'] == 1024: + self.assertEqual('3503db342c8dfb0a38db0682b7370ddd271fa163', + info['sha1']) + else: + self.assertEqual('0408a0f6a5e057e701f3aed96b0d1fb913c3d9d0', + info['sha1']) + + def _finish_upload(self, chunk_size, file_name): + """Finish the upload.""" + # Finish/continue upload with the given file key + page = pywikibot.FilePage(self.site, 'MP_sounds-pwb.png') + self.site.upload(page, source_filename=file_name, + comment='pywikibot test', chunk_size=chunk_size, + ignore_warnings=True, report_success=False, + _file_key=self._file_key, _offset=self._offset) + + def _test_continue_filekey(self, chunk_size): + """Test uploading a chunk first and finish in a separate upload.""" + self._init_upload(chunk_size) + self._finish_upload(chunk_size, self.sounds_png) + + # Check if it's still cached + with self.assertRaises(APIError) as cm: + self.site.stash_info(self._file_key) + self.assertEqual(cm.exception.code, 'siiinvalidsessiondata') + self.assertTrue(cm.exception.info.startswith('File not found'), + 'info ({0}) did not start with ' + '"File not found"'.format(cm.exception.info)) + + def test_continue_filekey_once(self): + """Test continuing to upload a file without using chunked mode.""" + self._test_continue_filekey(0) + + @unittest.expectedFailure # see T112416 + def test_continue_filekey_chunked(self): + """Test continuing to upload a file with using chunked mode.""" + self._test_continue_filekey(1024) + + def test_sha1_missmatch(self): + """Test trying to continue with a different file.""" + self._init_upload(1024) + with self.assertRaises(ValueError) as cm: + self._finish_upload(1024, self.arrow_png) + self.assertEqual( + str(cm.exception), + 'The SHA1 of 1024 bytes of the stashed "{0}" is ' + '3503db342c8dfb0a38db0682b7370ddd271fa163 while the local file is ' + '3dd334f11aa1e780d636416dc0649b96b67588b6'.format(self._file_key)) + self._verify_stash() + + def test_offset_missmatch(self): + """Test trying to continue with a different offset.""" + self._init_upload(1024) + self._offset = 0 + with self.assertRaises(ValueError) as cm: + self._finish_upload(1024, self.sounds_png) + self.assertEqual( + str(cm.exception), + 'For the file key "{0}" the server reported a size 1024 while the ' + 'offset was 0'.format(self._file_key)) + self._verify_stash() + + def test_offset_oversize(self): + """Test trying to continue with an offset which is to large.""" + self._init_upload(1024) + self._offset = 2000 + with self.assertRaises(ValueError) as cm: + self._finish_upload(1024, self.sounds_png) + self.assertEqual( + str(cm.exception), + 'For the file key "{0}" the offset was set to 2000 while the file ' + 'is only 1276 bytes large.'.format(self._file_key)) + self._verify_stash() +
if __name__ == '__main__': try:
pywikibot-commits@lists.wikimedia.org