SVN: [7552] branches/rewrite/scripts/upload.py - Pywikipedia-svn

28 Oct 2009

Revision: 7552
Author:   russblau
Date:     2009-10-28 17:37:09 +0000 (Wed, 28 Oct 2009)
Log Message:
-----------
Create copy for rewrite branch.
Added Paths:
-----------
    branches/rewrite/scripts/upload.py
Copied: branches/rewrite/scripts/upload.py (from rev 7545, trunk/pywikipedia/upload.py)
===================================================================

--- branches/rewrite/scripts/upload.py	                        (rev 0)
+++ branches/rewrite/scripts/upload.py	2009-10-28 17:37:09 UTC (rev 7552)
@@ -0,0 +1,426 @@
+# -*- coding: utf-8 -*-
+"""
+Script to upload images to wikipedia.
+
+Arguments:
+
+  -keep         Keep the filename as is
+  -filename     Target filename
+  -noverify     Do not ask for verification of the upload description if one is given
+
+If any other arguments are given, the first is the URL or filename
+to upload, and the rest is a proposed description to go with the
+upload. If none of these are given, the user is asked for the
+file or URL to upload. The bot will then upload the image to the wiki.
+
+The script will ask for the location of an image, if not given as a parameter,
+and for a description.
+"""
+#
+# (C) Rob W.W. Hooft, Andre Engels 2003-2004
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id$'
+
+import os, sys, time
+import urllib, mimetypes
+import wikipedia, config, query
+
+def post_multipart(site, address, fields, files, cookies):
+    """
+    Post fields and files to an http host as multipart/form-data.
+    fields is a sequence of (name, value) elements for regular form fields.
+    files is a sequence of (name, filename, value) elements for data to be uploaded as files
+    Return the server's response page.
+    """
+    contentType, body = encode_multipart_formdata(fields, files)
+    return site.postData(address, body, contentType = contentType, cookies = cookies)
+
+def encode_multipart_formdata(fields, files):
+    """
+    fields is a sequence of (name, value) elements for regular form fields.
+    files is a sequence of (name, filename, value) elements for data to be uploaded as files
+    Return (content_type, body) ready for httplib.HTTP instance
+    """
+    boundary = '----------ThIs_Is_tHe_bouNdaRY_$'
+    lines = []
+    for (key, value) in fields:
+        lines.append('--' + boundary)
+        lines.append('Content-Disposition: form-data; name="%s"' % key)
+        lines.append('')
+        lines.append(value)
+    for (key, filename, value) in files:
+        lines.append('--' + boundary)
+        lines.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename))
+        lines.append('Content-Type: %s' % get_content_type(filename))
+        lines.append('')
+        lines.append(value)
+    lines.append('--' + boundary + '--')
+    lines.append('')
+    body = '\r\n'.join(lines)
+    content_type = 'multipart/form-data; boundary=%s' % boundary
+    return content_type, body
+
+def get_content_type(filename):
+    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
+
+
+class UploadRobot:
+    def __init__(self, url, urlEncoding = None, description = u'', useFilename = None, keepFilename = False,
+                 verifyDescription = True, ignoreWarning = False, targetSite = None, uploadByUrl = False):
+        """
+        ignoreWarning - Set this to True if you want to upload even if another
+                        file would be overwritten or another mistake would be
+                        risked.
+        """
+        self._retrieved = False
+        self.url = url
+        self.urlEncoding = urlEncoding
+        self.description = description
+        self.useFilename = useFilename
+        self.keepFilename = keepFilename
+        self.verifyDescription = verifyDescription
+        self.ignoreWarning = ignoreWarning
+        if config.upload_to_commons:
+            self.targetSite = targetSite or wikipedia.getSite('commons', 'commons')
+        else:
+            self.targetSite = targetSite or wikipedia.getSite()
+        self.targetSite.forceLogin()
+        self.uploadByUrl = uploadByUrl
+
+    def urlOK(self):
+        '''
+        Returns true iff the URL references an online site or an
+        existing local file.
+        '''
+        return self.url != '' and ('://' in self.url or os.path.exists(self.url))
+
+    def read_file_content(self):
+        if not self._retrieved or self.uploadByUrl:
+            # Get file contents
+            wikipedia.output(u'Reading file %s' % self.url)
+            if '://' in self.url:
+                resume = False
+                dt = 15
+
+                while not self._retrieved:
+                    uo = wikipedia.MyURLopener()
+                    if resume:
+                        wikipedia.output(u"Resume download...")
+                        uo.addheader('Range', 'bytes=%s-' % rlen)
+
+                    file = uo.open(self.url)
+
+                    if 'text/html' in file.info().getheader('Content-Type'):
+                        print "Couldn't download the image: the requested URL was not found on this server."
+                        return
+
+                    content_len = file.info().getheader('Content-Length')
+                    accept_ranges = file.info().getheader('Accept-Ranges') == 'bytes'
+
+                    if resume:
+                        self._contents += file.read()
+                    else:
+                        self._contents = file.read()
+
+                    file.close()
+                    self._retrieved = True
+
+                    if content_len:
+                        rlen = len(self._contents)
+                        content_len = int(content_len)
+                        if rlen < content_len:
+                            self._retrieved = False
+                            wikipedia.output(u"Connection closed at byte %s (%s left)" % (rlen, content_len))
+                            if accept_ranges and rlen > 0:
+                                resume = True
+                            wikipedia.output(u"Sleeping for %d seconds..." % dt)
+                            time.sleep(dt)
+                            if dt <= 60:
+                                dt += 15
+                            elif dt < 360:
+                                dt += 60
+                    else:
+                        if wikipedia.verbose:
+                            wikipedia.output(u"WARNING: No check length to retrieved data is possible.")
+            else:
+                # Opening local files with MyURLopener would be possible, but we
+                # don't do it because it only accepts ASCII characters in the
+                # filename.
+                file = open(self.url,"rb")
+                self._contents = file.read()
+                file.close()
+    
+    def process_filename(self):
+        # Isolate the pure name
+        filename = self.url
+        
+        if '/' in filename:
+            filename = filename.split('/')[-1]
+        
+        if '\' in filename:
+            filename = filename.split('\')[-1]
+        
+        if self.urlEncoding:
+            filename = urllib.unquote(filename.decode(self.urlEncoding))
+        
+        if self.useFilename:
+            filename = self.useFilename
+        if not self.keepFilename:
+            wikipedia.output(u"The filename on the target wiki will default to: %s" % filename)
+            # ask newfn until it's valid
+            ok = False
+            # FIXME: these 2 belong somewhere else, presumably in family
+            forbidden = '/' # to be extended
+            allowed_formats = (u'gif', u'jpg', u'jpeg', u'mid', u'midi', u'ogg', u'png', u'svg', u'xcf', u'djvu')
+            while not ok:
+                ok = True
+                newfn = wikipedia.input(u'Enter a better name, or press enter to accept:')
+                if newfn == "":
+                    newfn = filename
+                ext = os.path.splitext(newfn)[1].lower().strip('.')
+                for c in forbidden:
+                    if c in newfn:
+                        print "Invalid character: %s. Please try again" % c
+                        ok = False
+                if ext not in allowed_formats and ok:
+                    choice = wikipedia.inputChoice(u"File format is not one of [%s], but %s. Continue?" % (u' '.join(allowed_formats), ext), ['yes', 'no'], ['y', 'N'], 'N')
+                    if choice == 'n':
+                        ok = False
+            if newfn != '':
+                filename = newfn
+        # MediaWiki doesn't allow spaces in the file name.
+        # Replace them here to avoid an extra confirmation form
+        filename = filename.replace(' ', '_')
+        # A proper description for the submission.
+        wikipedia.output(u"The suggested description is:")
+        wikipedia.output(self.description)
+        if self.verifyDescription:
+                newDescription = u''
+                choice = wikipedia.inputChoice(u'Do you want to change this description?', ['Yes', 'No'], ['y', 'N'], 'n')
+                if choice == 'y':
+                        import editarticle
+                        editor = editarticle.TextEditor()
+                        newDescription = editor.edit(self.description)
+                # if user saved / didn't press Cancel
+                if newDescription:
+                        self.description = newDescription
+        return filename
+    
+    def upload_image(self, debug=False):
+        """Gets the image at URL self.url, and uploads it to the target wiki.
+           Returns the filename which was used to upload the image.
+           If the upload fails, the user is asked whether to try again or not.
+           If the user chooses not to retry, returns null.
+        """
+        try:
+            if config.use_api and self.targetSite.versionnumber() >= 16:
+                x = self.targetSite.api_address()
+                del x
+            else:
+                raise NotImplementedError
+        except NotImplementedError:
+            return self._uploadImageOld(debug)
+        
+        if not hasattr(self,'_contents'):
+            self.read_file_content()
+        
+        filename = self.process_filename()
+        
+        params = {
+            'action': 'upload',
+            'token': self.targetSite.getToken(),
+            'comment': self.description,
+            'filename': filename,
+            #'': '',
+        }
+        if self.uploadByUrl:
+            params['url'] = self.url
+        else:
+            params['file'] = self._contents
+        
+        if self.ignoreWarning:
+            params['ignorewarnings'] = 1
+        
+        wikipedia.output(u'Uploading file to %s via API....' % self.targetSite)
+        
+        data = query.GetData(params, self.targetSite)
+        
+        if wikipedia.verbose:
+            wikipedia.output("%s" % data)
+        
+        if 'error' in data: # error occured
+            errCode = data['error']['code']
+            wikipedia.output("%s" % data)
+        else:
+            data = data['upload']
+            if data['result'] == u'Warning': #upload success but return warning.
+                warn = data['warnings'].keys()[0]
+                wikipedia.output("We got a warning message:", newline=False)
+                warFn = data['warnings'][warn]
+                if warn == 'duplicate-archive':
+                    wikipedia.output("The file is duplicate a deleted file %s." % warFn)
+                elif warn == 'was-deleted':
+                    wikipedia.output("This file was deleted for %s." % warFn)
+                elif warn == 'emptyfile':
+                    wikipedia.output("File %s is an empty file." % warFn)
+                elif warn == 'exists':
+                    wikipedia.output("File %s is exists." % warFn)
+                elif warn == 'duplicate':
+                    wikipedia.output("Uploaded file is duplicate with %s." % warFn)
+                elif warn == 'badfilename':
+                    wikipedia.output("Target filename is invaild.")
+                elif warn == 'filetype-unwanted-type':
+                    wikipedia.output("File %s type is unwatched type." % warFn)
+                answer = wikipedia.inputChoice(u"Do you want to ignore?", ['Yes', 'No'], ['y', 'N'], 'N')
+                if answer == "y":
+                    self.ignoreWarning = 1
+                    self.keepFilename = True
+                    return self.upload_image(debug)
+                else:
+                    wikipedia.output("Upload aborted.")
+                    return
+                
+            elif data['result'] == u'Success': #No any warning, upload and online complete.
+                wikipedia.output(u"Upload successful.")
+                return filename #data['filename']
+        
+
+    def _uploadImageOld(self, debug=False):
+        if not hasattr(self,'_contents'):
+            self.read_file_content()
+        
+        filename = self.process_filename()
+        # Convert the filename (currently Unicode) to the encoding used on the
+        # target wiki
+        encodedFilename = filename.encode(self.targetSite.encoding())
+
+
+        formdata = {}
+        formdata["wpUploadDescription"] = self.description
+        formdata["wpUploadAffirm"] = "1"
+        formdata["wpUpload"] = "upload bestand"
+        # This somehow doesn't work.
+        if self.ignoreWarning:
+            formdata["wpIgnoreWarning"] = "1"
+
+        # Get an edit token so we can do the upload
+        formdata["wpEditToken"]  = self.targetSite.getToken()
+
+        # Set the new filename
+        formdata["wpDestFile"]  = filename
+
+        if self.uploadByUrl:
+            formdata["wpUploadFileURL"]  = self.url
+            formdata["wpSourceType"] = 'Url'
+        #Not needed now. Might be needed in the future
+        #else:
+        #    formdata["wpSourceType"] = 'file'
+        
+        # try to encode the strings to the encoding used by the target site.
+        # if that's not possible (e.g. because there are non-Latin-1 characters and
+        # the home Wikipedia uses Latin-1), convert all non-ASCII characters to
+        # HTML entities.
+        for key in formdata:
+            assert isinstance(key, basestring), "ERROR: %s is not a string but %s" % (key, type(key))
+            try:
+                formdata[key] = formdata[key].encode(self.targetSite.encoding())
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                formdata[key] = wikipedia.UnicodeToAsciiHtml(formdata[key]).encode(self.targetSite.encoding())
+
+        # don't upload if we're in debug mode
+        if not debug:
+            wikipedia.output(u'Uploading file to %s...' % self.targetSite)
+
+            if self.uploadByUrl:
+                # Just do a post with all the fields filled out
+                response, returned_html = self.targetSite.postForm(self.targetSite.upload_address(), formdata.items(), cookies = self.targetSite.cookies())
+            else:
+                response, returned_html = post_multipart(self.targetSite, self.targetSite.upload_address(),
+                                  formdata.items(), (('wpUploadFile', encodedFilename, self._contents),),
+                                  cookies = self.targetSite.cookies())
+            # There are 2 ways MediaWiki can react on success: either it gives
+            # a 200 with a success message, or it gives a 302 (redirection).
+            # Do we know how the "success!" HTML page should look like?
+            # ATTENTION: if you changed your Wikimedia Commons account not to show
+            # an English interface, this detection will fail!
+            success_msg = self.targetSite.mediawiki_message('successfulupload')
+            if success_msg in returned_html or response.status == 302:
+                 wikipedia.output(u"Upload successful.")
+            # The following is not a good idea, because the server also gives a 200 when
+            # something went wrong.
+            #if response.status in [200, 302]:
+            #    wikipedia.output(u"Upload successful.")
+
+            elif response.status == 301:
+                wikipedia.output(u"Following redirect...")
+                address = response.getheader('Location')
+                wikipedia.output(u"Changed upload address to %s. Please update %s.py" % (address, self.targetSite.family.__module__))
+                exec('self.targetSite.upload_address = lambda: %r' % address, locals(), globals())
+                return self.upload_image(debug)
+            else:
+                try:
+                    # Try to find the error message within the HTML page.
+                    # If we can't find it, we just dump the entire HTML page.
+                    returned_html = returned_html[returned_html.index('<!-- start content -->') + 22: returned_html.index('<!-- end content -->')]
+                except:
+                    pass
+                wikipedia.output(u'%s\n\n' % returned_html)
+                wikipedia.output(u'%i %s' % (response.status, response.reason))
+
+                if self.targetSite.mediawiki_message('uploadwarning') in returned_html:
+                    answer = wikipedia.inputChoice(u"You have recevied an upload warning message. Ignore?", ['Yes', 'No'], ['y', 'N'], 'N')
+                    if answer == "y":
+                        self.ignoreWarning = 1
+                        self.keepFilename = True
+                        return self._uploadImageOld(debug)
+                else:
+                    answer = wikipedia.inputChoice(u'Upload of %s probably failed. Above you see the HTML page which was returned by MediaWiki. Try again?' % filename, ['Yes', 'No'], ['y', 'N'], 'N')
+                    if answer == "y":
+                        return self._uploadImageOld(debug)
+                    else:
+                        return
+        return filename
+
+    def run(self):
+        while not self.urlOK():
+            if not self.url:
+                wikipedia.output(u'No input filename given')
+            else:
+                wikipedia.output(u'Invalid input filename given. Try again.')
+            self.url = wikipedia.input(u'File or URL where image is now:')
+        return self.upload_image()
+
+def main(args):
+    url = u''
+    description = []
+    keepFilename = False
+    useFilename = None
+    verifyDescription = True
+
+    # call wikipedia.py function to process all global wikipedia args
+    # returns a list of non-global args, i.e. args for upload.py
+    args = wikipedia.handleArgs()
+
+    for arg in args:
+        if arg:
+            if arg.startswith('-keep'):
+                keepFilename = True
+            elif arg.startswith('-filename:'):
+                useFilename = arg[10:]
+            elif arg.startswith('-noverify'):
+                verifyDescription = False
+            elif url == u'':
+                url = arg
+            else:
+                description.append(arg)
+    description = u' '.join(description)
+    bot = UploadRobot(url, description=description, useFilename=useFilename, keepFilename=keepFilename, verifyDescription=verifyDescription)
+    bot.run()
+
+if __name__ == "__main__":
+    try:
+        main(sys.argv[1:])
+    finally:
+        wikipedia.stopme()