Revision: 7521 Author: alexsh Date: 2009-10-21 02:12:58 +0000 (Wed, 21 Oct 2009)
Log Message: ----------- wp.py:Site().postData():change httplib to MyURLopener, disable all response.reason, response.status->response.code, remove all httplib.BadStatusLine
Modified Paths: -------------- trunk/pywikipedia/interwiki.py trunk/pywikipedia/login.py trunk/pywikipedia/query.py trunk/pywikipedia/upload.py trunk/pywikipedia/userlib.py trunk/pywikipedia/weblinkchecker.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/interwiki.py =================================================================== --- trunk/pywikipedia/interwiki.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/interwiki.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -1596,7 +1596,7 @@ timeout=60 while 1: try: - status, reason, data = page.put(newtext, comment = mcomment) + status, data = page.put(newtext, comment = mcomment) except wikipedia.LockedPage: wikipedia.output(u'Page %s is locked. Skipping.' % (page.title(),)) raise SaveError @@ -1628,7 +1628,7 @@ if str(status) == '302': return True else: - wikipedia.output(u'%s %s' % (status, reason)) + wikipedia.output(u'%s' % status) return False elif answer == 'g': raise GiveUpOnPage
Modified: trunk/pywikipedia/login.py =================================================================== --- trunk/pywikipedia/login.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/login.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -182,7 +182,7 @@ fakepredata['wpPassword'] = fakepredata['lgpassword'] = u'XXXXX' wikipedia.output(u"self.site.postData(%s, %s)" % (address, self.site.urlEncode(fakepredata))) fakeresponsemsg = re.sub(r"(session|Token)=..........", r"session=XXXXXXXXXX", response.msg.__str__()) - wikipedia.output(u"%s/%s\n%s" % (response.status, response.reason, fakeresponsemsg)) + wikipedia.output(u"%s\n%s" % (response.code, fakeresponsemsg)) wikipedia.output(u"%s" % data) Reat=re.compile(': (.*?);') L = []
Modified: trunk/pywikipedia/query.py =================================================================== --- trunk/pywikipedia/query.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/query.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -115,11 +115,6 @@ (('file', params['filename'].encode(site.encoding()), cont),), site.cookies(sysop=sysop) ) - elif site.hostname() in wikipedia.config.authenticate.keys(): - params["Content-type"] = "application/x-www-form-urlencoded" - params["User-agent"] = useragent - res = urllib2.urlopen(urllib2.Request(site.protocol() + '://' + site.hostname() + address, site.urlEncode(params))) - jsontext = res.read() elif params['action'] in postAC: res, jsontext = site.postForm(path, params, sysop, site.cookies(sysop = sysop) ) else:
Modified: trunk/pywikipedia/upload.py =================================================================== --- trunk/pywikipedia/upload.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/upload.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -347,14 +347,14 @@ # ATTENTION: if you changed your Wikimedia Commons account not to show # an English interface, this detection will fail! success_msg = self.targetSite.mediawiki_message('successfulupload') - if success_msg in returned_html or response.status == 302: + if success_msg in returned_html or response.code == 302: wikipedia.output(u"Upload successful.") # The following is not a good idea, because the server also gives a 200 when # something went wrong. - #if response.status in [200, 302]: + #if response.code in [200, 302]: # wikipedia.output(u"Upload successful.")
- elif response.status == 301: + elif response.code == 301: wikipedia.output(u"Following redirect...") address = response.getheader('Location') wikipedia.output(u"Changed upload address to %s. Please update %s.py" % (address, self.targetSite.family.__module__)) @@ -368,7 +368,7 @@ except: pass wikipedia.output(u'%s\n\n' % returned_html) - wikipedia.output(u'%i %s' % (response.status, response.reason)) + wikipedia.output(u'%i' % response.code)
if self.targetSite.mediawiki_message('uploadwarning') in returned_html: answer = wikipedia.inputChoice(u"You have recevied an upload warning message. Ignore?", ['Yes', 'No'], ['y', 'N'], 'N')
Modified: trunk/pywikipedia/userlib.py =================================================================== --- trunk/pywikipedia/userlib.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/userlib.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -554,7 +554,7 @@ address = self.site().unblock_address()
response, data = self.site().postForm(address, predata, sysop = True) - if response.status != 302: + if response.code != 302: if self.site().mediawiki_message('ipb_cant_unblock').replace('$1',blockID) in data: raise AlreadyUnblockedError raise UnblockError, data
Modified: trunk/pywikipedia/weblinkchecker.py =================================================================== --- trunk/pywikipedia/weblinkchecker.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/weblinkchecker.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -394,7 +394,7 @@ return self.resolveRedirect(useHEAD = False) else: raise - if response.status >= 300 and response.status <= 399: + if response.code >= 300 and response.code <= 399: #print response.getheaders() redirTarget = response.getheader('Location') if redirTarget: @@ -493,10 +493,10 @@ # read the server's encoding, in case we need it later self.readEncodingFromResponse(response) # site down if the server status is between 400 and 499 - alive = response.status not in range(400, 500) - if response.status in self.HTTPignore: + alive = response.code not in range(400, 500) + if response.code in self.HTTPignore: alive = False - return alive, '%s %s' % (response.status, response.reason) + return alive, '%s' % response.code
class LinkCheckThread(threading.Thread): '''
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2009-10-20 23:28:49 UTC (rev 7520) +++ trunk/pywikipedia/wikipedia.py 2009-10-21 02:12:58 UTC (rev 7521) @@ -123,7 +123,7 @@ __version__ = '$Id$'
import os, sys -import httplib, socket, urllib, urllib2 +import socket, urllib, urllib2 import traceback import time, threading, Queue import math @@ -1647,8 +1647,6 @@ response, data = query.GetData(params, self.site(), sysop=sysop, back_response = True) if query.IsString(data): raise KeyError - except httplib.BadStatusLine, line: - raise PageNotSaved('Bad status line: %s' % line.line) except ServerError: output(u''.join(traceback.format_exception(*sys.exc_info()))) retry_attempt += 1 @@ -1673,7 +1671,7 @@ # Check blocks self.site().checkBlocks(sysop = sysop) # A second text area means that an edit conflict has occured. - if response.status == 500: + if response.code == 500: output(u"Server error encountered; will retry in %i minute%s." % (retry_delay, retry_delay != 1 and "s" or "")) time.sleep(60 * retry_delay) @@ -1698,8 +1696,8 @@ #for debug only #------------------------ if verbose: - output("error occured, code:%s\ninfo:%s\nstatus:%s\nresponse:%s" % ( - data['error']['code'], data['error']['info'], response.status, response.reason)) + output("error occured, code:%s\ninfo:%s\nstatus:%s" % ( + data['error']['code'], data['error']['info'], response.code)) faked = params if 'text' in faked: del faked['text'] @@ -1708,7 +1706,7 @@ #------------------------ errorCode = data['error']['code'] #cannot handle longpageerror and PageNoSave yet - if errorCode == 'maxlag' or response.status == 503: + if errorCode == 'maxlag' or response.code == 503: # server lag; wait for the lag time and retry m = re.search('Waiting for (.+?): (.+?) seconds lagged', data['error']['info']) timelag = int(m.group(2)) @@ -1793,13 +1791,13 @@ # if the page update is successed, we need to return code 302 for cheat script who # using status code # - return 302, response.reason, data['edit'] + return 302, data['edit']
solve = self.site().solveCaptcha(data) if solve: return self._putPage(text, comment, watchArticle, minorEdit, newPage, token, newToken, sysop, captcha=solve)
- return response.status, response.reason, data + return response.code, data
def _putPageOld(self, text, comment=None, watchArticle=False, minorEdit=True, @@ -1886,7 +1884,7 @@ return None try: response, data = self.site().postForm(address, predata, sysop) - if response.status == 503: + if response.code == 503: if 'x-database-lag' in response.msg.keys(): # server lag; Mediawiki recommends waiting 5 seconds # and retrying @@ -1898,9 +1896,7 @@ wait = min(wait*2, 300) continue # Squid error 503 - raise ServerError(response.status) - except httplib.BadStatusLine, line: - raise PageNotSaved('Bad status line: %s' % line.line) + raise ServerError(response.code) except ServerError: output(u''.join(traceback.format_exception(*sys.exc_info()))) retry_attempt += 1 @@ -1992,7 +1988,7 @@ # to "Wikipedia has a problem", but I'm not sure. Maybe we could # just check for HTTP Status 500 (Internal Server Error)? if ("<title>Wikimedia Error</title>" in data or "has a problem</title>" in data) \ - or response.status == 500: + or response.code == 500: output(u"Server error encountered; will retry in %i minute%s." % (retry_delay, retry_delay != 1 and "s" or "")) time.sleep(60 * retry_delay) @@ -2053,7 +2049,7 @@ # Something went wrong, and we don't know what. Show the # HTML code that hopefully includes some error message. output(u"ERROR: Unexpected response from wiki server.") - output(u" %s (%s) " % (response.status, response.reason)) + output(u" %s" % response.code) output(data) # Unexpected responses should raise an error and not pass, # be it silently or loudly. This should raise an error @@ -2062,7 +2058,7 @@ # We are on the preview page, so the page was not saved raise PageNotSaved
- return response.status, response.reason, data + return response.code, data
def canBeEdited(self): """Return bool indicating whether this page can be edited. @@ -3285,7 +3281,7 @@ else: response, data = self.site().postForm(address, predata, sysop=True)
- if response.status == 302 and not data: + if response.code == 302 and not data: output(u'Changed protection level of page %s.' % self.aslink()) return True else: @@ -3293,7 +3289,7 @@ self.site().checkBlocks(sysop = True) output(u'Failed to change protection level of page %s:' % self.aslink()) - output(u"HTTP response code %s" % response.status) + output(u"HTTP response code %s" % response.code) output(data) return False
@@ -3691,7 +3687,7 @@ while True: try: data = self.getData() - except (socket.error, httplib.BadStatusLine, ServerError): + except (socket.error, ServerError): # Print the traceback of the caught exception output(u''.join(traceback.format_exception(*sys.exc_info()))) output(u'DBG> got network error in _GetAll.run. ' \ @@ -5138,14 +5134,6 @@ if not language[0].upper() + language[1:] in self.namespaces(): self._validlanguages.append(language)
- #if persistent_http is None: - # persistent_http = config.persistent_http - #self.persistent_http = persistent_http and self.protocol() in ('http', 'https') - #if persistent_http: - # if self.protocol() == 'http': - # self.conn = httplib.HTTPConnection(self.hostname()) - # elif self.protocol() == 'https': - # self.conn = httplib.HTTPSConnection(self.hostname()) self.persistent_http = False
def _userIndex(self, sysop = False): @@ -5389,75 +5377,94 @@ """
# TODO: add the authenticate stuff here - - #if False: #self.persistent_http: - # conn = self.conn - #else: - if config.proxy['host']: - conn = httplib.HTTPConnection(config.proxy['host']) - proxyPutAddr = '%s://%s%s' % (self.protocol(), self.hostname(), address) - conn.putrequest('POST', proxyPutAddr) - if type(config.proxy['auth']) == tuple: - import base64 - authcode = base64.b64encode("%s:%s" % (config.proxy['auth'][0], config.proxy['auth'][1]) ) - conn.putheader('Proxy-Authorization', "Basic %s" % authcode ) - + if self.hostname() in config.authenticate.keys(): + uo = authenticateURLopener else: - if self.protocol() == 'http': - conn = httplib.HTTPConnection(self.hostname()) - elif self.protocol() == 'https': - conn = httplib.HTTPSConnection(self.hostname()) + if config.proxy['host'] and type(config.proxy['auth']) == tuple: + proxyHandle = {'http':'http://%s:%s@%s' % (config.proxy['auth'][0], config.proxy['auth'][1], config.proxy['host'] )} + elif config.proxy['host']: + proxyHandle = {'http':'http://%s' % config.proxy['host'] } + else: + proxyHandle = None
- conn.putrequest('POST', address) - - # Encode all of this into a HTTP request - # otherwise, it will crash, as other protocols are not supported - + uo = MyURLopener(proxies = proxyHandle) + uo.addheader('Cookie', cookies) + if compress: + uo.addheader('Accept-encoding', 'gzip') if address[-1] == "?": address = address[:-1] - if self.hostname() in config.authenticate.keys(): - import base64 - authcode = base64.b64encode("%s:%s" % (config.authenticate[self.hostname()][0], config.authenticate[self.hostname()][1]) ) - conn.putheader("Authorization", "Basic %s" % authcode ) - - conn.putheader('Content-Length', str(len(data))) - conn.putheader('Content-type', contentType) - conn.putheader('User-agent', useragent) - if cookies: - conn.putheader('Cookie', cookies) - #if False: #self.persistent_http: - # conn.putheader('Connection', 'Keep-Alive') - if compress: - conn.putheader('Accept-encoding', 'gzip') - conn.endheaders() - conn.send(data) + url = '%s://%s%s' % (self.protocol(), self.hostname(), address)
- # Prepare the return values - # Note that this can raise network exceptions which are not - # caught here. - try: - response = conn.getresponse() - except httplib.BadStatusLine: - # Blub. - conn.close() - conn.connect() - return self.postData(address, data, contentType, sysop, compress, cookies) + # Try to retrieve the page until it was successfully loaded (just in + # case the server is down or overloaded). + # Wait for retry_idle_time minutes (growing!) between retries. + retry_idle_time = 1 + while True: + try: + if self.hostname() in config.authenticate.keys(): + request = urllib2.Request(url, data) + request.add_header('User-agent', useragent) + opener = urllib2.build_opener() + f = opener.open(request) + else: + f = uo.open(url, data)
- data = response.read() + # read & info can raise socket.error + text = f.read() + headers = f.info() + break + except KeyboardInterrupt: + raise + except Exception, e: + if retry: + # We assume that the server is down. Wait some time, then try again. + output(u"%s" % e) + output(u"WARNING: Could not open '%s'. Maybe the server or" % url) + output(u"your connection is down. Retrying in %i minutes..." % retry_idle_time) + time.sleep(retry_idle_time * 60) + # Next time wait longer, but not longer than half an hour + retry_idle_time *= 2 + if retry_idle_time > 30: + retry_idle_time = 30 + else: + raise
- if compress and response.getheader('Content-Encoding') == 'gzip': - data = decompress_gzip(data) + contentType = headers.get('content-type', '') + contentEncoding = headers.get('content-encoding', '')
- data = data.decode(self.encoding()) - response.close() + # Ensure that all sent data is received + if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers: + output(u'Warning! len(text) does not match content-length: %s != %s' % \ + (len(text), headers.get('content-length'))) + return self.postData(path, address, data, contentType, sysop, compress, cookie)
- if True: #not self.persistent_http: - conn.close() + if compress and contentEncoding == 'gzip': + text = decompress_gzip(text)
+ R = re.compile('charset=([^\'";]+)') + m = R.search(contentType) + if m: + charset = m.group(1) + else: + if verbose: + output(u"WARNING: No character set found.") + # UTF-8 as default + charset = 'utf-8' + # Check if this is the charset we expected + self.checkCharset(charset) + # Convert HTML to Unicode + try: + text = unicode(text, charset, errors = 'strict') + except UnicodeDecodeError, e: + print e + output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \ufffd.' % (self.protocol(), self.hostname(), path)) + # We use error='replace' in case of bad encoding. + text = unicode(text, charset, errors = 'replace') + # If a wiki page, get user data - self._getUserDataOld(data, sysop = sysop) + self._getUserDataOld(text, sysop = sysop)
- return response, data + return f, text
def getUrl(self, path, retry = None, sysop = False, data = None, compress = True, no_hostname = False, cookie_only=False, back_response=False): @@ -5524,10 +5531,8 @@ if retry: # We assume that the server is down. Wait some time, then try again. output(u"%s" % e) - output(u"""\ -WARNING: Could not open '%s'. Maybe the server or -your connection is down. Retrying in %i minutes...""" - % (url, retry_idle_time)) + output(u"WARNING: Could not open '%s'. Maybe the server or" % url) + output("your connection is down. Retrying in %i minutes..." % retry_idle_time) time.sleep(retry_idle_time * 60) # Next time wait longer, but not longer than half an hour retry_idle_time *= 2 @@ -5545,9 +5550,6 @@ if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers: output(u'Warning! len(text) does not match content-length: %s != %s' % \ (len(text), headers.get('content-length'))) - #if False: #self.persistent_http - # self.conn.close() - # self.conn.connect() return self.getUrl(path, retry, sysop, data, compress, no_hostname, cookie_only, back_response)
if compress and contentEncoding == 'gzip':
pywikipedia-svn@lists.wikimedia.org