Revision: 7591
Author: alexsh
Date: 2009-11-04 13:22:17 +0000 (Wed, 04 Nov 2009)
Log Message:
-----------
* site().getUrl(): change all HTTP processing to use urllib2.
* Handle and combine site authentication, proxy handling, and proxy authentication at the bottom of the module.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-11-04 09:05:42 UTC (rev 7590)
+++ trunk/pywikipedia/wikipedia.py 2009-11-04 13:22:17 UTC (rev 7591)
@@ -123,7 +123,7 @@
__version__ = '$Id$'
import os, sys
-import httplib, socket, urllib, urllib2
+import httplib, socket, urllib, urllib2, cookielib
import traceback
import time, threading, Queue
import math
@@ -5464,69 +5464,59 @@
Returns the HTML text of the page converted to unicode.
"""
+
if retry is None:
retry = config.retry_on_fail
- if self.hostname() in config.authenticate.keys():
- uo = authenticateURLopener
+ headers = {'User-agent': useragent,}
+
+ if self.cookies(sysop = sysop):
+ headers['Cookie'] = self.cookies(sysop = sysop)
+ if compress:
+ headers['Accept-encoding'] = 'gzip'
+
+ if refer:
+ headers['Refer'] = refer
+
+ if no_hostname: # This allow users to parse also toolserver's script
+ url = path # and other useful pages without using some other functions.
else:
- if config.proxy['host'] and type(config.proxy['auth']) == tuple:
- proxyHandle = {'http':'http://%s:%s@%s' % (config.proxy['auth'][0], config.proxy['auth'][1], config.proxy['host'] )}
- elif config.proxy['host']:
- proxyHandle = {'http':'http://%s' % config.proxy['host'] }
- else:
- proxyHandle = None
-
- uo = MyURLopener(proxies = proxyHandle)
- if self.cookies(sysop = sysop):
- uo.addheader('Cookie', self.cookies(sysop = sysop))
- if compress:
- uo.addheader('Accept-encoding', 'gzip')
- if no_hostname == True: # This allow users to parse also toolserver's script
- url = path # and other useful pages without using some other functions.
- else:
url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
- if refer:
- uo.addheader('Refer', refer)
data = self.urlEncode(data)
-
+
# Try to retrieve the page until it was successfully loaded (just in
# case the server is down or overloaded).
# Wait for retry_idle_time minutes (growing!) between retries.
retry_idle_time = 1
while True:
try:
- if self.hostname() in config.authenticate.keys():
- request = urllib2.Request(url, data)
- request.add_header('User-agent', useragent)
- opener = urllib2.build_opener()
- f = opener.open(request)
- else:
- f = uo.open(url, data)
+ request = urllib2.Request(url, data, headers)
+ f = MyURLopener.open(request)
# read & info can raise socket.error
text = f.read()
headers = f.info()
-
break
except KeyboardInterrupt:
raise
+ except urllib2.HTTPError, e:
+ if e.code in [401, 404]:
+ raise PageNotFound(u'Page %s could not be retrieved. Check your family file ?' % url)
+ output(u"Result:%s %s" % (e.code, e.msg))
+ raise
except Exception, e:
+ output(u'%s' %e)
if retry:
- # We assume that the server is down. Wait some time, then try again.
- output(u"%s" % e)
- output(u"""\
-WARNING: Could not open '%s'. Maybe the server or
-your connection is down. Retrying in %i minutes..."""
+ output(u"""WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..."""
% (url, retry_idle_time))
time.sleep(retry_idle_time * 60)
# Next time wait longer, but not longer than half an hour
retry_idle_time *= 2
if retry_idle_time > 30:
retry_idle_time = 30
- else:
- raise
-
+ continue
+
+ raise
if cookie_only:
return headers.get('set-cookie', '')
contentType = headers.get('content-type', '')
@@ -5569,8 +5559,8 @@
if back_response:
return f, text
- else:
- return text
+
+ return text
def _getUserData(self, text, sysop = False, force = True):
"""
@@ -8082,48 +8072,37 @@
s = time.strptime(tz, "%Y-%m-%dT%H:%M:%SZ")
return int(time.strftime("%Y%m%d%H%M%S", s))
-class MyURLopener(urllib.FancyURLopener):
- version="PythonWikipediaBot/1.0"
+# Site Cookies handler
+COOKIEFILE = config.datafilepath('login-data', 'cookies.lwp')
+cj = cookielib.LWPCookieJar()
+if os.path.isfile(COOKIEFILE):
+ cj.load(COOKIEFILE)
- def http_error_default(self, url, fp, errcode, errmsg, headers):
- if errcode == 401 or errcode == 404:
- raise PageNotFound(u'Page %s could not be retrieved. Check your family file ?' % url)
- else:
- return urllib.FancyURLopener.http_error_default(self, url, fp, errcode, errmsg, headers)
+cookieProcessor = urllib2.HTTPCookieProcessor(cj)
+MyURLopener = urllib2.build_opener(cookieProcessor)
+
+if config.proxy['host']:
+ proxyHandler = urllib2.ProxyHandler({'http':'http://%s/' % config.proxy['host'] })
+
+ MyURLopener = urllib2.build_opener(cookieProcessor, proxyHandler)
+ if config.proxy['auth']:
+ proxyAuth = urllib2.HTTPPasswordMgrWithDefaultRealm()
+ proxyAuth.add_password(None, config.proxy['host'], config.proxy['auth'][0], config.proxy['auth'][1])
+ proxyAuthHandler = urllib2.ProxyBasicAuthHandler(proxyAuth)
- def open_http(self, url, data=None):
- ret = urllib.FancyURLopener.open_http(self, url, data)
- if hasattr(self, 'http_code'):
- ret.status = self.http_code
- del self.http_code
- else:
- ret.status = 200
- return ret
+ MyURLopener = urllib2.build_opener(cookieProcessor, proxyHandler, proxyAuthHandler)
-
-
-# Special opener in case we are using a site with authentication
if config.authenticate:
- import urllib2, cookielib
- COOKIEFILE = config.datafilepath('login-data', 'cookies.lwp')
- cj = cookielib.LWPCookieJar()
- if os.path.isfile(COOKIEFILE):
- cj.load(COOKIEFILE)
passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
for site in config.authenticate:
passman.add_password(None, site, config.authenticate[site][0], config.authenticate[site][1])
authhandler = urllib2.HTTPBasicAuthHandler(passman)
+
+ MyURLopener = urllib2.build_opener(cookieProcessor, authhandler)
if config.proxy['host']:
- proxyHandle = urllib2.ProxyHandler({'http':'http://%s' % config.proxy['host'] })
+ MyURLopener = urllib2.build_opener(cookieProcessor, authhandler, proxyHandler)
if config.proxy['auth']:
- proxyAuth = urllib2.HTTPPasswordMgr()
- proxyAuth.add_password(None, config.proxy['host'], config.proxy['auth'][0], config.proxy['auth'][1])
- proxyAuthHandle = urllib2.ProxyBasicAuthHandler(proxyAuth)
- else:
- proxyHandle = None
- proxyAuthHandle = None
- authenticateURLopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), proxyHandle, proxyAuthHandle, authhandler)
- urllib2.install_opener(authenticateURLopener)
+ MyURLopener = urllib2.build_opener(cookieProcessor, authhandler, proxyHandler, proxyAuthHandler)
if __name__ == '__main__':
import doctest