Xqt submitted this change.

View Change

Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
site: Support async chunked uploads (T129216)

Async chunked uploads (MW >= 1.21) will tell the server to defer jobs
such as assembling chunks and publishing files to the job queue, so that
they do not suffer from the very short timeout of HTTP requests. This
will enable larger files (more than one gigabyte) to be uploaded by
pywikibot without failing after 503 (T128358) like non-async chunked
uploads.

The mechanism is that when such a job is submitted, in queue, or in
progress, the MediaWiki server shall send a result='Poll' response.
Polling means to periodically send an action='upload', checkstatus=True
request and handle the result accordingly, with the same logic as
in the response that initiated the poll.

In addition, handling of the result in responses has been separated into
if-elif cases to avoid irrelevant warning messages in logs. (T133443)

The upload.py script has learned a new flag (-async) which is passed
to the APISite upload function.

Bug: T129216
Bug: T133443
Change-Id: If9e3330c2f4065e491f20475bee0c7da7ccbd096
---
M pywikibot/site/_apisite.py
M pywikibot/specialbots/_upload.py
M scripts/upload.py
3 files changed, 175 insertions(+), 107 deletions(-)

diff --git a/pywikibot/site/_apisite.py b/pywikibot/site/_apisite.py
index 9f978b9..ed6e281 100644
--- a/pywikibot/site/_apisite.py
+++ b/pywikibot/site/_apisite.py
@@ -2533,6 +2533,7 @@
watch: bool = False,
ignore_warnings=False,
chunk_size: int = 0,
+ asynchronous: bool = False,
_file_key: Optional[str] = None,
_offset: Union[bool, int] = 0,
_verify_stash: Optional[bool] = None,
@@ -2568,6 +2569,8 @@
U{https://www.mediawiki.org/wiki/API:Upload#Chunked_uploading}).
It will only upload in chunks, if the chunk size is positive
but lower than the file size.
+ @param asynchronous: Make potentially large file operations
+ asynchronous on the server side when possible.
@param _file_key: Reuses an already uploaded file using the filekey. If
None (default) it will upload the file.
@param _offset: When file_key is not None this can be an integer to
@@ -2657,6 +2660,7 @@
file_page_title = filepage.title(with_ns=False)
file_size = None
offset = _offset
+
# make sure file actually exists
if source_filename:
if os.path.isfile(source_filename):
@@ -2720,10 +2724,17 @@
pywikibot.log('Reused already upload file using '
'filekey "{}"'.format(_file_key))
# TODO: Use sessionkey instead of filekey if necessary
- final_request = self._simple_request(action='upload', token=token,
- filename=file_page_title,
- comment=comment, text=text,
- filekey=_file_key)
+ final_request = self._request(
+ parameters={
+ 'action': 'upload',
+ 'token': token,
+ 'filename': file_page_title,
+ 'comment': comment,
+ 'text': text,
+ 'async': asynchronous,
+ 'filekey': _file_key
+ })
+
elif source_filename:
# TODO: Dummy value to allow also Unicode names, see bug T75661
mime_filename = 'FAKE-NAME'
@@ -2740,37 +2751,49 @@
if offset > 0:
pywikibot.log('Continuing upload from byte {}'
.format(offset))
+ poll = False
while True:
- f.seek(offset)
- chunk = f.read(chunk_size)
- # workaround (hack) for T132676
- # append another '\r' so that one is the payload and
- # the second is used for newline when mangled by email
- # package.
- if (len(chunk) < chunk_size
- or (offset + len(chunk)) == filesize
- and chunk[-1] == b'\r'[0]):
- chunk += b'\r'

- mime_params = {
- 'chunk': (chunk,
- ('application', 'octet-stream'),
- {'filename': mime_filename})
- }
- req = self._request(
- throttle=throttle,
- mime=mime_params,
- parameters={
- 'action': 'upload',
- 'token': token,
- 'stash': True,
- 'filesize': filesize,
- 'offset': offset,
- 'filename': file_page_title,
- 'ignorewarnings': ignore_all_warnings})
+ if poll:
+ # run a poll; not possible in first iteration
+ assert _file_key
+ req = self._simple_request(
+ action='upload',
+ token=token,
+ filekey=_file_key,
+ checkstatus=True)
+ else:
+ f.seek(offset)
+ chunk = f.read(chunk_size)
+ # workaround (hack) for T132676
+ # append another '\r' so that one is the payload
+ # and the second is used for newline when mangled
+ # by email package.
+ if (len(chunk) < chunk_size
+ or (offset + len(chunk)) == filesize
+ and chunk[-1] == b'\r'[0]):
+ chunk += b'\r'

- if _file_key:
- req['filekey'] = _file_key
+ mime_params = {
+ 'chunk': (chunk,
+ ('application', 'octet-stream'),
+ {'filename': mime_filename})
+ }
+ req = self._request(
+ throttle=throttle,
+ mime=mime_params,
+ parameters={
+ 'action': 'upload',
+ 'token': token,
+ 'stash': True,
+ 'filesize': filesize,
+ 'offset': offset,
+ 'filename': file_page_title,
+ 'async': asynchronous,
+ 'ignorewarnings': ignore_all_warnings})
+
+ if _file_key:
+ req['filekey'] = _file_key

try:
data = req.submit()['upload']
@@ -2807,8 +2830,12 @@
raise error
if 'nochange' in data: # in simulation mode
break
- _file_key = data['filekey']
- if 'warnings' in data and not ignore_all_warnings:
+
+ # Polls may not contain file key in response
+ _file_key = data.get('filekey', _file_key)
+ if data['result'] == 'Warning':
+ assert('warnings' in data
+ and not ignore_all_warnings)
if callable(ignore_warnings):
restart = False
if 'offset' not in data:
@@ -2845,23 +2872,35 @@
result = data
result.setdefault('offset', 0)
break
- throttle = False
- if 'offset' in data:
- new_offset = int(data['offset'])
- if offset + len(chunk) != new_offset:
- pywikibot.log('Old offset: {}; Returned '
- 'offset: {}; Chunk size: {}'
- .format(offset, new_offset,
- len(chunk)))
- pywikibot.warning('Unexpected offset.')
- offset = new_offset
- else:
- pywikibot.warning('Offset was not supplied.')
- offset += len(chunk)
- if data['result'] != 'Continue': # finished
+
+ if data['result'] == 'Continue':
+ throttle = False
+ if 'offset' in data:
+ new_offset = int(data['offset'])
+ if offset + len(chunk) != new_offset:
+ pywikibot.log('Old offset: {0}; Returned '
+ 'offset: {1}; Chunk size: '
+ '{2}'.format(offset,
+ new_offset,
+ len(chunk)))
+ pywikibot.warning('Unexpected offset.')
+ offset = new_offset
+ else:
+ pywikibot.warning('Offset was not supplied.')
+ offset += len(chunk)
+ elif data['result'] == 'Poll':
+ poll = True
+ pywikibot.log('Waiting for server to '
+ 'assemble chunks.')
+ elif data['result'] == 'Success': # finished
pywikibot.log('Finished uploading last chunk.')
final_request['filekey'] = _file_key
+ final_request['async'] = asynchronous
break
+ else:
+ raise Error(
+ 'Unrecognized result: %s' % data['result'])
+
else: # not chunked upload
if _file_key:
final_request['filekey'] = _file_key
@@ -2883,70 +2922,88 @@
action='upload', filename=file_page_title,
url=source_url, comment=comment, text=text, token=token)

- if not result:
- final_request['watch'] = watch
- final_request['ignorewarnings'] = ignore_all_warnings
- try:
- result = final_request.submit()
- self._uploaddisabled = False
- except APIError as error:
- # TODO: catch and process foreseeable errors
- if error.code == 'uploaddisabled':
- self._uploaddisabled = True
- raise error
- result = result['upload']
- pywikibot.debug(result, _logger)
+ while True:
+ if not result:
+ final_request['watch'] = watch
+ final_request['ignorewarnings'] = ignore_all_warnings
+ try:
+ result = final_request.submit()
+ self._uploaddisabled = False
+ except api.APIError as error:
+ # TODO: catch and process foreseeable errors
+ if error.code == 'uploaddisabled':
+ self._uploaddisabled = True
+ raise error
+ result = result['upload']
+ pywikibot.debug(result, _logger)

- if 'warnings' in result and not ignore_all_warnings:
- if 'filekey' in result:
- _file_key = result['filekey']
- elif 'sessionkey' in result:
- # TODO: Probably needs to be reflected in the API call above
- _file_key = result['sessionkey']
- pywikibot.warning('Using sessionkey instead of filekey.')
- else:
- _file_key = None
- pywikibot.warning('No filekey defined.')
+ if 'result' not in result:
+ raise Error('Upload: unrecognized response: {}'.format(result))

- if not report_success:
- result.setdefault('offset', True)
- if ignore_warnings(create_warnings_list(result)):
- return self.upload(
- filepage, source_filename=source_filename,
- source_url=source_url, comment=comment, text=text,
- watch=watch, ignore_warnings=True,
- chunk_size=chunk_size, _file_key=_file_key,
- _offset=result['offset'], report_success=False
- )
- return False
+ if result['result'] == 'Warning':
+ assert 'warnings' in result and not ignore_all_warnings
+ if 'filekey' in result:
+ _file_key = result['filekey']
+ elif 'sessionkey' in result:
+ # TODO: Probably needs to be reflected in the API call
+ # above
+ _file_key = result['sessionkey']
+ pywikibot.warning('Using sessionkey instead of filekey.')
+ else:
+ _file_key = None
+ pywikibot.warning('No filekey defined.')

- warn('When ignore_warnings=False in APISite.upload will change '
- 'from raising an UploadError into behaving like being a '
- 'callable returning False.', DeprecationWarning, 3)
- if len(result['warnings']) > 1:
- warn('The upload returned {} warnings: {}'
- .format(len(result['warnings']),
- ', '.join(result['warnings'])),
- UserWarning, 3)
- warning = list(result['warnings'].keys())[0]
- message = result['warnings'][warning]
- raise UploadError(warning, upload_warnings[warning]
- .format(msg=message),
- file_key=_file_key,
- offset=result.get('offset', False))
- if 'result' not in result:
- pywikibot.output('Upload: unrecognized response: {}'
- .format(result))
+ if not report_success:
+ result.setdefault('offset', True)
+ if ignore_warnings(create_warnings_list(result)):
+ return self.upload(
+ filepage, source_filename=source_filename,
+ source_url=source_url, comment=comment,
+ text=text, watch=watch, ignore_warnings=True,
+ chunk_size=chunk_size, asynchronous=asynchronous,
+ _file_key=_file_key, offset=result['offset'],
+ report_success=False)
+ return False

- if result['result'] == 'Success':
- if report_success:
- pywikibot.output('Upload successful.')
- # If we receive a nochange, that would mean we're in simulation
- # mode, don't attempt to access imageinfo
- if 'nochange' not in result:
- filepage._load_file_revisions([result['imageinfo']])
+ warn('When ignore_warnings=False in APISite.upload will '
+ 'change from raising an UploadWarning into behaving like '
+ 'being a callable returning False.',
+ DeprecationWarning, 3)
+ if len(result['warnings']) > 1:
+ warn('The upload returned {} warnings: {}'
+ .format(len(result['warnings']),
+ ', '.join(result['warnings'])),
+ UserWarning, 3)
+ warning = list(result['warnings'].keys())[0]
+ message = result['warnings'][warning]
+ raise UploadError(warning,
+ upload_warnings[warning]
+ .format(msg=message),
+ file_key=_file_key,
+ offset=result.get('offset', False))

- return result['result'] == 'Success'
+ if result['result'] == 'Poll':
+ # Polling is meaningless without a file key
+ assert _file_key
+ pywikibot.log('Waiting for upload to be published.')
+ result = None
+ final_request = self._simple_request(
+ action='upload',
+ token=token,
+ filekey=_file_key,
+ checkstatus=True)
+ continue
+
+ if result['result'] == 'Success':
+ if report_success:
+ pywikibot.output('Upload successful.')
+ # If we receive a nochange, that would mean we're in simulation
+ # mode, don't attempt to access imageinfo
+ if 'nochange' not in result:
+ filepage._load_file_revisions([result['imageinfo']])
+ return True
+
+ raise Error('Unrecognized result: %s' % data['result'])

def get_property_names(self, force: bool = False):
"""
diff --git a/pywikibot/specialbots/_upload.py b/pywikibot/specialbots/_upload.py
index 49cb816..ed67485 100644
--- a/pywikibot/specialbots/_upload.py
+++ b/pywikibot/specialbots/_upload.py
@@ -47,6 +47,7 @@
target_site=None,
aborts: Union[bool, list, None] = None,
chunk_size: int = 0,
+ asynchronous: bool = False,
summary: Optional[str] = None,
filename_prefix: Optional[str] = None, **kwargs):
"""Initializer.
@@ -74,6 +75,8 @@
@param chunk_size: Upload the file in chunks (more overhead, but
restartable) specified in bytes. If no value is specified the file
will be uploaded as whole.
+ @param asynchronous: Make potentially large file operations
+ asynchronous on the server side when possible.
@param filename_prefix: Specify prefix for the title of every
file's page.
@keyword always: Disables any input, requires that either
@@ -101,6 +104,7 @@
self.ignore_warning = ignore_warning
self.aborts = aborts or []
self.chunk_size = chunk_size
+ self.asynchronous = asynchronous
self.summary = summary
self.filename_prefix = filename_prefix

@@ -394,6 +398,7 @@
ignore_warnings=ignore_warnings,
chunk_size=self.chunk_size,
_file_key=_file_key, _offset=_offset,
+ asynchronous=self.asynchronous,
comment=self.summary)
except APIError as error:
if error.code == 'uploaddisabled':
diff --git a/scripts/upload.py b/scripts/upload.py
index 6fba731..768d853 100755
--- a/scripts/upload.py
+++ b/scripts/upload.py
@@ -24,6 +24,8 @@
'Mi': Mebibytes (1024x1024 B)

The suffixes are case insensitive.
+ -async Make potentially large file operations asynchronous on the
+ server side when possible.
-always Don't ask the user anything. This will imply -keep and
-noverify and require that either -abortonwarn or -ignorewarn
is defined for all. It will also require a valid file name and
@@ -113,6 +115,7 @@
aborts = set()
ignorewarn = set()
chunk_size = 0
+ asynchronous = False
recursive = False
description_file = None

@@ -150,6 +153,8 @@
elif arg == '-chunked':
match = CHUNK_SIZE_REGEX.match(option)
chunk_size = get_chunk_size(match)
+ elif arg == '-async':
+ asynchronous = True
elif arg == '-descfile':
description_file = value
elif not url:
@@ -213,6 +218,7 @@
keep_filename=keep_filename,
verify_description=verify_description, aborts=aborts,
ignore_warning=ignorewarn, chunk_size=chunk_size,
+ asynchronous=asynchronous,
always=always, summary=summary,
filename_prefix=filename_prefix)
bot.run()

To view, visit change 679021. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: If9e3330c2f4065e491f20475bee0c7da7ccbd096
Gerrit-Change-Number: 679021
Gerrit-PatchSet: 11
Gerrit-Owner: Inductiveload <inductiveload@gmail.com>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Isaacandy <isaac@iznd.xyz>
Gerrit-Reviewer: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: Siebrand <siebrand@kitano.nl>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: DannyS712 <dannys712.wiki@gmail.com>
Gerrit-MessageType: merged