Revision: 7599
Author: alexsh
Date: 2009-11-05 18:16:45 +0000 (Thu, 05 Nov 2009)
Log Message:
-----------
wikipedia.py.Site().postData(): change the library from httplib to urllib2, and change all response usages to the correct attributes (status -> code, reason -> msg)
Modified Paths:
--------------
trunk/pywikipedia/login.py
trunk/pywikipedia/query.py
trunk/pywikipedia/upload.py
trunk/pywikipedia/userlib.py
trunk/pywikipedia/weblinkchecker.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/login.py
===================================================================
--- trunk/pywikipedia/login.py 2009-11-05 18:14:37 UTC (rev 7598)
+++ trunk/pywikipedia/login.py 2009-11-05 18:16:45 UTC (rev 7599)
@@ -150,69 +150,53 @@
login_address = self.site.login_address()
address = login_address + '&action=submit'
- if self.site.hostname() in config.authenticate.keys():
- headers = {
- "Content-type": "application/x-www-form-urlencoded",
- "User-agent": wikipedia.useragent
- }
- data = self.site.urlEncode(predata)
+ if api:
+ response, data = query.GetData(predata, self.site, back_response = True)
+ if data['login']['result'] != "Success":
+ faildInfo = data['login']['result']
+ #if faildInfo == "NotExists":
+ #
+ #elif faildInfo == "WrongPass":
+ #
+ #elif faildInfo == "Throttled":
+ #
+ return False
+ else:
+ response, data = self.site.postData(address, self.site.urlEncode(predata))
if self.verbose:
fakepredata = predata
- fakepredata['wpPassword'] = u'XXXX'
- wikipedia.output(u"urllib2.urlopen(urllib2.Request('%s', %s, %s)):" % (self.site.protocol() + '://' + self.site.hostname() + address, self.site.urlEncode(fakepredata), headers))
- response = urllib2.urlopen(urllib2.Request(self.site.protocol() + '://' + self.site.hostname() + address, data, headers))
- data = response.read()
- if self.verbose:
- fakedata = re.sub(r"(session|Token)=..........", r"session=XXXXXXXXXX", data)
+ fakepredata['wpPassword'] = u'XXXXX'
+ wikipedia.output(u"self.site.postData(%s, %s)" % (address, self.site.urlEncode(fakepredata)))
trans = config.transliterate
config.transliterate = False #transliteration breaks for some reason
wikipedia.output(fakedata.decode(self.site.encoding()))
config.transliterate = trans
+ fakeresponsemsg = re.sub(r"(session|Token)=..........", r"session=XXXXXXXXXX", data)
+ wikipedia.output(u"%s/%s\n%s" % (response.code, response.msg, fakeresponsemsg))
wikipedia.cj.save(wikipedia.COOKIEFILE)
- return "Ok"
- else:
- if api:
- response, data = query.GetData(predata, self.site, back_response = True)
- if data['login']['result'] != "Success":
- faildInfo = data['login']['result']
- #if faildInfo == "NotExists":
- #
- #elif faildInfo == "WrongPass":
- #
- #elif faildInfo == "Throttled":
- #
- return False
- else:
- response, data = self.site.postData(address, self.site.urlEncode(predata))
- if self.verbose:
- fakepredata = predata
- fakepredata['wpPassword'] = fakepredata['lgpassword'] = u'XXXXX'
- wikipedia.output(u"self.site.postData(%s, %s)" % (address, self.site.urlEncode(fakepredata)))
- fakeresponsemsg = re.sub(r"(session|Token)=..........", r"session=XXXXXXXXXX", response.msg.__str__())
- wikipedia.output(u"%s/%s\n%s" % (response.status, response.reason, fakeresponsemsg))
- wikipedia.output(u"%s" % data)
- Reat=re.compile(': (.*?);')
- L = []
+
+ Reat=re.compile(': (.*?);')
- for eat in response.msg.getallmatchingheaders('set-cookie'):
- m = Reat.search(eat)
- if m:
- L.append(m.group(1))
+ L = []
+ for eat in response.info().getallmatchingheaders('set-cookie'):
+ m = Reat.search(eat)
+ if m:
+ L.append(m.group(1))
- got_token = got_user = False
- for Ldata in L:
- if 'Token=' in Ldata:
- got_token = True
- if 'User=' in Ldata or 'UserName=' in Ldata:
- got_user = True
+ got_token = got_user = False
+ for Ldata in L:
+ if 'Token=' in Ldata:
+ got_token = True
+ if 'User=' in Ldata or 'UserName=' in Ldata:
+ got_user = True
- if got_token and got_user:
- return "\n".join(L)
- elif not captcha:
- solve = self.site.solveCaptcha(data)
- if solve:
- return self.getCookie(api = api, remember = remember, captcha = solve)
- return None
+ if got_token and got_user:
+ return "\n".join(L)
+ elif not captcha:
+ solve = self.site.solveCaptcha(data)
+ if solve:
+ return self.getCookie(api = api, remember = remember, captcha = solve)
+ return None
def storecookiedata(self, data):
"""
Modified: trunk/pywikipedia/query.py
===================================================================
--- trunk/pywikipedia/query.py 2009-11-05 18:14:37 UTC (rev 7598)
+++ trunk/pywikipedia/query.py 2009-11-05 18:16:45 UTC (rev 7599)
@@ -80,7 +80,7 @@
postAC = [
- 'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect',
+ 'edit', 'login', 'purge', 'rollback', 'delete', 'undelete', 'protect', 'parse',
'block', 'unblock', 'move', 'emailuser','import', 'userrights', 'upload',
]
if useAPI:
Modified: trunk/pywikipedia/upload.py
===================================================================
--- trunk/pywikipedia/upload.py 2009-11-05 18:14:37 UTC (rev 7598)
+++ trunk/pywikipedia/upload.py 2009-11-05 18:16:45 UTC (rev 7599)
@@ -340,14 +340,14 @@
# ATTENTION: if you changed your Wikimedia Commons account not to show
# an English interface, this detection will fail!
success_msg = self.targetSite.mediawiki_message('successfulupload')
- if success_msg in returned_html or response.status == 302:
+ if success_msg in returned_html or response.code == 302:
wikipedia.output(u"Upload successful.")
# The following is not a good idea, because the server also gives a 200 when
# something went wrong.
- #if response.status in [200, 302]:
+ #if response.code in [200, 302]:
# wikipedia.output(u"Upload successful.")
- elif response.status == 301:
+ elif response.code == 301:
wikipedia.output(u"Following redirect...")
address = response.getheader('Location')
wikipedia.output(u"Changed upload address to %s. Please update %s.py" % (address, self.targetSite.family.__module__))
Modified: trunk/pywikipedia/userlib.py
===================================================================
--- trunk/pywikipedia/userlib.py 2009-11-05 18:14:37 UTC (rev 7598)
+++ trunk/pywikipedia/userlib.py 2009-11-05 18:16:45 UTC (rev 7599)
@@ -554,7 +554,7 @@
address = self.site().unblock_address()
response, data = self.site().postForm(address, predata, sysop = True)
- if response.status != 302:
+ if response.code != 302:
if self.site().mediawiki_message('ipb_cant_unblock').replace('$1',blockID) in data:
raise AlreadyUnblockedError
raise UnblockError, data
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2009-11-05 18:14:37 UTC (rev 7598)
+++ trunk/pywikipedia/weblinkchecker.py 2009-11-05 18:16:45 UTC (rev 7599)
@@ -394,7 +394,7 @@
return self.resolveRedirect(useHEAD = False)
else:
raise
- if response.status >= 300 and response.status <= 399:
+ if response.code >= 300 and response.code <= 399:
#print response.getheaders()
redirTarget = response.getheader('Location')
if redirTarget:
@@ -493,10 +493,10 @@
# read the server's encoding, in case we need it later
self.readEncodingFromResponse(response)
# site down if the server status is between 400 and 499
- alive = response.status not in range(400, 500)
- if response.status in self.HTTPignore:
+ alive = response.code not in range(400, 500)
+ if response.code in self.HTTPignore:
alive = False
- return alive, '%s %s' % (response.status, response.reason)
+ return alive, '%s %s' % (response.code, response.msg)
class LinkCheckThread(threading.Thread):
'''
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-11-05 18:14:37 UTC (rev 7598)
+++ trunk/pywikipedia/wikipedia.py 2009-11-05 18:16:45 UTC (rev 7599)
@@ -116,7 +116,7 @@
"""
from __future__ import generators
#
-# (C) Pywikipedia bot team, 2003-2007
+# (C) Pywikipedia bot team, 2003-2009
#
# Distributed under the terms of the MIT license.
#
@@ -1673,7 +1673,7 @@
# Check blocks
self.site().checkBlocks(sysop = sysop)
# A second text area means that an edit conflict has occured.
- if response.status == 500:
+ if response.code == 500:
output(u"Server error encountered; will retry in %i minute%s."
% (retry_delay, retry_delay != 1 and "s" or ""))
time.sleep(60 * retry_delay)
@@ -1699,7 +1699,7 @@
#------------------------
if verbose:
output("error occured, code:%s\ninfo:%s\nstatus:%s\nresponse:%s" % (
- data['error']['code'], data['error']['info'], response.status, response.reason))
+ data['error']['code'], data['error']['info'], response.code, response.msg))
faked = params
if 'text' in faked:
del faked['text']
@@ -1708,7 +1708,7 @@
#------------------------
errorCode = data['error']['code']
#cannot handle longpageerror and PageNoSave yet
- if errorCode == 'maxlag' or response.status == 503:
+ if errorCode == 'maxlag' or response.code == 503:
# server lag; wait for the lag time and retry
m = re.search('Waiting for (.+?): (.+?) seconds lagged', data['error']['info'])
timelag = int(m.group(2))
@@ -1793,13 +1793,13 @@
# if the page update is successed, we need to return code 302 for cheat script who
# using status code
#
- return 302, response.reason, data['edit']
+ return 302, response.msg, data['edit']
solve = self.site().solveCaptcha(data)
if solve:
return self._putPage(text, comment, watchArticle, minorEdit, newPage, token, newToken, sysop, captcha=solve)
- return response.status, response.reason, data
+ return response.code, response.msg, data
def _putPageOld(self, text, comment=None, watchArticle=False, minorEdit=True,
@@ -1886,7 +1886,7 @@
return None
try:
response, data = self.site().postForm(address, predata, sysop)
- if response.status == 503:
+ if response.code == 503:
if 'x-database-lag' in response.msg.keys():
# server lag; Mediawiki recommends waiting 5 seconds
# and retrying
@@ -1898,7 +1898,7 @@
wait = min(wait*2, 300)
continue
# Squid error 503
- raise ServerError(response.status)
+ raise ServerError(response.code)
except httplib.BadStatusLine, line:
raise PageNotSaved('Bad status line: %s' % line.line)
except ServerError:
@@ -1992,7 +1992,7 @@
# to "Wikipedia has a problem", but I'm not sure. Maybe we could
# just check for HTTP Status 500 (Internal Server Error)?
if ("<title>Wikimedia Error</title>" in data or "has a problem</title>" in data) \
- or response.status == 500:
+ or response.code == 500:
output(u"Server error encountered; will retry in %i minute%s."
% (retry_delay, retry_delay != 1 and "s" or ""))
time.sleep(60 * retry_delay)
@@ -2053,7 +2053,7 @@
# Something went wrong, and we don't know what. Show the
# HTML code that hopefully includes some error message.
output(u"ERROR: Unexpected response from wiki server.")
- output(u" %s (%s) " % (response.status, response.reason))
+ output(u" %s (%s) " % (response.code, response.msg))
output(data)
# Unexpected responses should raise an error and not pass,
# be it silently or loudly. This should raise an error
@@ -2062,7 +2062,7 @@
# We are on the preview page, so the page was not saved
raise PageNotSaved
- return response.status, response.reason, data
+ return response.code, response.msg, data
def canBeEdited(self):
"""Return bool indicating whether this page can be edited.
@@ -3287,7 +3287,7 @@
else:
response, data = self.site().postForm(address, predata, sysop=True)
- if response.status == 302 and not data:
+ if response.code == 302 and not data:
output(u'Changed protection level of page %s.' % self.aslink())
return True
else:
@@ -3295,7 +3295,7 @@
self.site().checkBlocks(sysop = True)
output(u'Failed to change protection level of page %s:'
% self.aslink())
- output(u"HTTP response code %s" % response.status)
+ output(u"HTTP response code %s" % response.code)
output(data)
return False
@@ -5382,71 +5382,93 @@
body of the response.
"""
- # TODO: add the authenticate stuff here
-
- if config.proxy['host']:
- conn = httplib.HTTPConnection(config.proxy['host'])
- proxyPutAddr = '%s://%s%s' % (self.protocol(), self.hostname(), address)
- conn.putrequest('POST', proxyPutAddr)
- if type(config.proxy['auth']) == tuple:
- import base64
- authcode = base64.b64encode("%s:%s" % (config.proxy['auth'][0], config.proxy['auth'][1]) )
- conn.putheader('Proxy-Authorization', "Basic %s" % authcode )
-
- else:
- if self.protocol() == 'http':
- conn = httplib.HTTPConnection(self.hostname())
- elif self.protocol() == 'https':
- conn = httplib.HTTPSConnection(self.hostname())
-
- conn.putrequest('POST', address)
-
- # Encode all of this into a HTTP request
- # otherwise, it will crash, as other protocols are not supported
-
if address[-1] == "?":
address = address[:-1]
- if self.hostname() in config.authenticate.keys():
- import base64
- authcode = base64.b64encode("%s:%s" % (config.authenticate[self.hostname()][0], config.authenticate[self.hostname()][1]) )
- conn.putheader("Authorization", "Basic %s" % authcode )
-
- conn.putheader('Content-Length', str(len(data)))
- conn.putheader('Content-type', contentType)
- conn.putheader('User-agent', useragent)
+
+ headers = {
+ 'User-agent': useragent,
+ 'Content-Length': str(len(data)),
+ 'Content-type':contentType,
+ }
if cookies:
- conn.putheader('Cookie', cookies)
+ headers['Cookie'] = cookies
if compress:
- conn.putheader('Accept-encoding', 'gzip')
- conn.endheaders()
- conn.send(data)
+ headers['Accept-encoding'] = 'gzip'
+
+ url = '%s://%s%s' % (self.protocol(), self.hostname(), address)
+ # Try to retrieve the page until it was successfully loaded (just in
+ # case the server is down or overloaded).
+ # Wait for retry_idle_time minutes (growing!) between retries.
+ retry_idle_time = 1
+ while True:
+ try:
+ request = urllib2.Request(url, data, headers)
+ f = MyURLopener.open(request)
- # Prepare the return values
- # Note that this can raise network exceptions which are not
- # caught here.
- try:
- response = conn.getresponse()
- except httplib.BadStatusLine:
- # Blub.
- conn.close()
- conn.connect()
+ # read & info can raise socket.error
+ text = f.read()
+ headers = f.info()
+ break
+ except KeyboardInterrupt:
+ raise
+ except urllib2.HTTPError, e:
+ if e.code in [401, 404]:
+ raise PageNotFound(u'Page %s could not be retrieved. Check your family file ?' % url)
+ output(u"Result:%s %s" % (e.code, e.msg))
+ raise
+ except Exception, e:
+ output(u'%s' %e)
+ if retry:
+ output(u"""WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."""
+ % (url, retry_idle_time))
+ time.sleep(retry_idle_time * 60)
+ # Next time wait longer, but not longer than half an hour
+ retry_idle_time *= 2
+ if retry_idle_time > 30:
+ retry_idle_time = 30
+ continue
+
+ raise
+ resContentType = headers.get('content-type', '')
+ contentEncoding = headers.get('content-encoding', '')
+
+ # Ensure that all sent data is received
+ if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers:
+ output(u'Warning! len(text) does not match content-length: %s != %s' % \
+ (len(text), headers.get('content-length')))
return self.postData(address, data, contentType, sysop, compress, cookies)
- data = response.read()
+ if compress and contentEncoding == 'gzip':
+ text = decompress_gzip(text)
- if compress and response.getheader('Content-Encoding') == 'gzip':
- data = decompress_gzip(data)
+ R = re.compile('charset=([^\'\";]+)')
+ m = R.search(resContentType)
+ if m:
+ charset = m.group(1)
+ else:
+ if verbose:
+ output(u"WARNING: No character set found.")
+ # UTF-8 as default
+ charset = 'utf-8'
+ # Check if this is the charset we expected
+ self.checkCharset(charset)
+ # Convert HTML to Unicode
+ try:
+ text = unicode(text, charset, errors = 'strict')
+ except UnicodeDecodeError, e:
+ print e
+ if no_hostname:
+ output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % path)
+ else:
+ output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \\ufffd.' % (self.protocol(), self.hostname(), path))
+ # We use error='replace' in case of bad encoding.
+ text = unicode(text, charset, errors = 'replace')
- data = data.decode(self.encoding())
- response.close()
-
- conn.close()
-
# If a wiki page, get user data
- self._getUserDataOld(data, sysop = sysop)
+ self._getUserDataOld(text, sysop = sysop)
- return response, data
+ return f, text
def getUrl(self, path, retry = None, sysop = False, data = None, compress = True,
no_hostname = False, cookie_only=False, refer=None, back_response=False):