Revision: 3938
Author: wikipedian
Date: 2007-08-01 22:11:21 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
decode HTTP redirect targets
use Latin-1 as default server encoding instead of UTF-8
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-08-01 21:53:08 UTC (rev 3937)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-01 22:11:21 UTC (rev 3938)
@@ -198,6 +198,7 @@
return httplib.HTTPSConnection(self.host)
def getEncodingUsedByServer(self):
+ # TODO: We could maybe save a few accesses here by caching.
try:
conn = self.getConnection()
conn.request('HEAD', '/', None, self.header)
@@ -208,8 +209,9 @@
charset = charsetR.search(ct).group(1)
return charset
except:
- wikipedia.output(u'Error retrieving server\'s default charset. Using UTF-8.')
- return 'utf-8'
+ wikipedia.output(u'Error retrieving server\'s default charset. Using ISO 8859-1.')
+ # most browsers use ISO 8859-1 (Latin-1) as the default.
+ return 'iso8859-1'
def changeUrl(self, url):
@@ -256,13 +258,14 @@
if response.status >= 300 and response.status <= 399:
#print response.getheaders()
redirTarget = response.getheader('Location')
+ redirTarget = unicode(redirTarget, self.getEncodingUsedByServer())
#print "redirTarget:", redirTarget
if redirTarget:
if redirTarget.startswith('http://') or redirTarget.startswith('https://'):
self.changeUrl(redirTarget)
return True
elif redirTarget.startswith('/'):
- self.changeUrl('%s://%s%s' % (self.protocol, self.host, redirTarget))
+ self.changeUrl(u'%s://%s%s' % (self.protocol, self.host, redirTarget))
return True
else: # redirect to relative position
# cut off filename