Mpaa has submitted this change and it was merged.
Change subject: [FEAT] Chunked uploads ......................................................................
[FEAT] Chunked uploads
This allows chunked uploads by setting the parameter 'chunk_size' to a value between 0 and the file size (both exclusive). Chunked uploading is only used if the site's MediaWiki version is 1.20 or newer.
The upload.py script supports this mode via the '-chunked' parameter.
This also adds the capability to run an API request without throttling, so that it doesn't have to wait after each request.
See: https://www.mediawiki.org/wiki/API:Upload#Chunked_uploading
Change-Id: I80b2bba9e63832173d5b697db1f4ea419ca1122f --- M pywikibot/data/api.py M pywikibot/site.py M scripts/upload.py 3 files changed, 156 insertions(+), 36 deletions(-)
Approvals: Mpaa: Looks good to me, approved
diff --git a/pywikibot/data/api.py b/pywikibot/data/api.py index 8132a27..11df17b 100644 --- a/pywikibot/data/api.py +++ b/pywikibot/data/api.py @@ -129,6 +129,8 @@ @param site: The Site to which the request will be submitted. If not supplied, uses the user's configured default Site. @param mime: If true, send in "multipart/form-data" format (default False) + @param mime_params: A dictionary of parameter which should only be + transferred via mime mode. If not None sets mime to True. @param max_retries: (optional) Maximum number of times to retry after errors, defaults to 25 @param retry_wait: (optional) Minimum time to wait after an error, @@ -143,7 +145,15 @@ self.site = kwargs.pop("site") except KeyError: self.site = pywikibot.Site() - self.mime = kwargs.pop("mime", False) + if 'mime_params' in kwargs: + self.mime_params = kwargs.pop('mime_params') + # mime may not be different from mime_params + if 'mime' in kwargs and kwargs.pop('mime') != self.mime: + raise ValueError('If mime_params is set, mime may not differ ' + 'from it.') + else: + self.mime = kwargs.pop('mime', False) + self.throttle = kwargs.pop('throttle', False) self.max_retries = kwargs.pop("max_retries", pywikibot.config.max_retries) self.retry_wait = kwargs.pop("retry_wait", pywikibot.config.retry_wait) self.params = {} @@ -210,6 +220,23 @@ def iteritems(self): return iter(self.params.items())
+ @property + def mime(self): + """Return whether mime parameters are defined.""" + return self.mime_params is not None + + @mime.setter + def mime(self, value): + """ + Change whether mime parameter should be defined. + + This will clear the mime parameters. + """ + try: + self.mime_params = dict(value) + except TypeError: + self.mime_params = {} if value else None + def http_params(self): """Return the parameters formatted for inclusion in an HTTP request.
@@ -218,7 +245,9 @@ unicode (may be |-separated list) str in site encoding (may be |-separated list) """ - + if self.mime_params and set(self.params.keys()) & set(self.mime_params.keys()): + raise ValueError('The mime_params and params may not share the ' + 'same keys.') for key in self.params: if isinstance(self.params[key], bytes): self.params[key] = self.params[key].decode(self.site.encoding()) @@ -296,6 +325,23 @@ message = None return message == ERR_MSG
+ @staticmethod + def _generate_MIME_part(key, content, keytype, headers): + if not keytype: + try: + content.encode("ascii") + keytype = ("text", "plain") + except UnicodeError: + keytype = ("application", "octet-stream") + submsg = MIMENonMultipart(*keytype) + content_headers = {'name': key} + if headers: + content_headers.update(headers) + submsg.add_header("Content-disposition", "form-data", + **content_headers) + submsg.set_payload(content) + return submsg + def submit(self): """Submit a query and parse the response.
@@ -308,7 +354,10 @@ simulate = self._simulate(action) if simulate: return simulate - self.site.throttle(write=self.write) + if self.throttle: + self.site.throttle(write=self.write) + else: + pywikibot.log("Action '{0}' is submitted not throttled.".format(action)) uri = self.site.scriptpath() + "/api.php" ssl = False if self.site.family.name in config.available_ssl_project: @@ -328,22 +377,15 @@ filetype = mimetypes.guess_type(local_filename)[0] \ or 'application/octet-stream' file_content = file(local_filename, "rb").read() - submsg = MIMENonMultipart(*filetype.split("/")) - submsg.add_header("Content-disposition", - "form-data", name=key, - filename=local_filename) - submsg.set_payload(file_content) + submsg = Request._generate_MIME_part( + key, file_content, filetype.split('/'), + {'filename': local_filename}) else: - try: - self.params[key].encode("ascii") - keytype = ("text", "plain") - except UnicodeError: - keytype = ("application", "octet-stream") - submsg = MIMENonMultipart(*keytype) - submsg.add_header("Content-disposition", "form-data", - name=key) - submsg.set_payload(self.params[key]) + submsg = Request._generate_MIME_part( + key, self.params[key], None, None) container.attach(submsg) + for key, value in self.mime_params.items(): + container.attach(Request._generate_MIME_part(key, *value)) # strip the headers to get the HTTP message body body = container.as_string() marker = "\n\n" # separates headers from body diff --git a/pywikibot/site.py b/pywikibot/site.py index fb8202d..21e0989 100644 --- a/pywikibot/site.py +++ b/pywikibot/site.py @@ -3858,7 +3858,8 @@
@deprecate_arg('imagepage', 'filepage') def upload(self, filepage, source_filename=None, source_url=None, - comment=None, text=None, watch=False, ignore_warnings=False): + comment=None, text=None, watch=False, ignore_warnings=False, + chunk_size=0): """Upload a file to the wiki.
Either source_filename or source_url, but not both, must be provided. @@ -3875,7 +3876,11 @@ @param watch: If true, add filepage to the bot user's watchlist @param ignore_warnings: if true, ignore API warnings and force upload (for example, to overwrite an existing file); default False - + @param chunk_size: The chunk size in bytesfor chunked uploading (see + U{https://www.mediawiki.org/wiki/API:Upload#Chunked_uploading%7D). It + will only upload in chunks, if the version number is 1.20 or higher + and the chunk size is positive but lower than the file size. + @type chunk_size: int """ upload_warnings = { # map API warning codes to user error messages @@ -3909,18 +3914,51 @@ if not text: text = comment token = self.token(filepage, "edit") + result = None if source_filename: # upload local file # make sure file actually exists if not os.path.isfile(source_filename): raise ValueError("File '%s' does not exist." % source_filename) - # TODO: if file size exceeds some threshold (to be determined), - # upload by chunks (--> os.path.getsize(source_filename)) + additional_parameters = {} + throttle = True + filesize = os.path.getsize(source_filename) + if (chunk_size > 0 and chunk_size < filesize and + LV(self.version()) >= LV('1.20')): + offset = 0 + file_key = None + with open(source_filename, 'rb') as f: + while True: + f.seek(offset) + chunk = f.read(chunk_size) + req = api.Request(site=self, action='upload', token=token, + stash='1', offset=offset, filesize=filesize, + filename=filepage.title(withNamespace=False), + mime_params={}, throttle=throttle) + req.mime_params['chunk'] = (chunk, None, {'filename': req.params['filename']}) + if file_key: + req['filekey'] = file_key + # TODO: Proper error and warning handling + data = req.submit()['upload'] + if 'warnings' in data: + result = data + break + file_key = data['filekey'] + throttle = False + new_offset = int(data['offset']) + if offset + len(chunk) != new_offset: + pywikibot.warning('Unexpected offset.') + offset = 
new_offset + if data['result'] != 'Continue': # finished + additional_parameters['filekey'] = file_key + break + else: + additional_parameters = {'file': source_filename, 'mime': True} req = api.Request(site=self, action="upload", token=token, filename=filepage.title(withNamespace=False), - file=source_filename, comment=comment, - text=text, mime=True) + comment=comment, text=text, throttle=throttle, + **additional_parameters) else: # upload by URL if "upload_by_url" not in self.userinfo["rights"]: @@ -3930,16 +3968,17 @@ req = api.Request(site=self, action="upload", token=token, filename=filepage.title(withNamespace=False), url=source_url, comment=comment, text=text) - if watch: - req["watch"] = "" - if ignore_warnings: - req["ignorewarnings"] = "" - try: - result = req.submit() - except api.APIError: - # TODO: catch and process foreseeable errors - raise - result = result["upload"] + if not result: + if watch: + req["watch"] = "" + if ignore_warnings: + req["ignorewarnings"] = "" + try: + result = req.submit() + except api.APIError: + # TODO: catch and process foreseeable errors + raise + result = result["upload"] pywikibot.debug(result, _logger) if "warnings" in result: warning = list(result["warnings"].keys())[0] diff --git a/scripts/upload.py b/scripts/upload.py index 8a9a44d..39f9df5 100755 --- a/scripts/upload.py +++ b/scripts/upload.py @@ -11,6 +11,15 @@ is given -abortonwarn: Abort upload on the specified warning type. If no warning type is specified abort on all warnings. + -chunked: Upload the file in chunks (more overhead, but restartable). If + no value is specified the chunk size is 1 MiB. The value must + be a number which can be preceded by a suffix. The units are: + No suffix: Bytes + 'k': Kilobytes (1000 B) + 'M': Megabytes (1000000 B) + 'Ki': Kibibytes (1024 B) + 'Mi': Mebibytes (1024x1024 B) + The suffixes are case insenstive.
If any other arguments are given, the first is the URL or filename to upload, and the rest is a proposed description to go with the upload. If none of these @@ -34,6 +43,8 @@ import urllib import urlparse import tempfile +import re +import math import pywikibot import pywikibot.data.api from pywikibot import config @@ -43,7 +54,7 @@ def __init__(self, url, urlEncoding=None, description=u'', useFilename=None, keepFilename=False, verifyDescription=True, ignoreWarning=False, - targetSite=None, uploadByUrl=False, aborts=[]): + targetSite=None, uploadByUrl=False, aborts=[], chunk_size=0): """ @param ignoreWarning: Set this to True if you want to upload even if another file would be overwritten or another mistake would be @@ -58,6 +69,7 @@ self.verifyDescription = verifyDescription self.ignoreWarning = ignoreWarning self.aborts = aborts + self.chunk_size = chunk_size if config.upload_to_commons: self.targetSite = targetSite or pywikibot.Site('commons', 'commons') @@ -224,7 +236,8 @@ else: temp = self.url site.upload(imagepage, source_filename=temp, - ignore_warnings=self.ignoreWarning) + ignore_warnings=self.ignoreWarning, + chunk_size=self.chunk_size)
except pywikibot.data.api.UploadWarning as warn: pywikibot.output(u"We got a warning message: {0}".format(warn.message)) @@ -266,6 +279,8 @@ useFilename = None verifyDescription = True aborts = set() + chunk_size = 0 + chunk_size_regex = re.compile(r'^-chunked(?::(\d+(?:.\d+)?)[ \t]*(k|ki|m|mi)?b?)?$', re.I)
# process all global bot args # returns a list of non-global args, i.e. args for upload.py @@ -282,6 +297,30 @@ aborts.add(arg[len('-abortonwarn:'):]) else: aborts = True + elif arg.startswith('-chunked'): + match = chunk_size_regex.match(arg) + if match: + if match.group(1): # number was in there + base = float(match.group(1)) + if match.group(2): # suffix too + suffix = match.group(2).lower() + if suffix == "k": + suffix = 1000 + elif suffix == "m": + suffix = 1000000 + elif suffix == "ki": + suffix = 1 << 10 + elif suffix == "mi": + suffix = 1 << 20 + else: + pass # huh? + else: + suffix = 1 + chunk_size = math.trunc(base * suffix) + else: + chunk_size = 1 << 20 # default to 1 MiB + else: + pywikibot.error('Chunk size parameter is not valid.') elif url == u'': url = arg else: @@ -290,7 +329,7 @@ bot = UploadRobot(url, description=description, useFilename=useFilename, keepFilename=keepFilename, verifyDescription=verifyDescription, - aborts=aborts) + aborts=aborts, chunk_size=chunk_size) bot.run()
if __name__ == "__main__":
pywikibot-commits@lists.wikimedia.org