Revision: 3938 Author: wikipedian Date: 2007-08-01 22:11:21 +0000 (Wed, 01 Aug 2007)
Log Message: ----------- Decode HTTP redirect targets; use Latin-1 as the default server encoding instead of UTF-8.
Modified Paths: -------------- trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py =================================================================== --- trunk/pywikipedia/weblinkchecker.py 2007-08-01 21:53:08 UTC (rev 3937) +++ trunk/pywikipedia/weblinkchecker.py 2007-08-01 22:11:21 UTC (rev 3938) @@ -198,6 +198,7 @@ return httplib.HTTPSConnection(self.host)
def getEncodingUsedByServer(self): + # TODO: We could maybe save a few accesses here by caching. try: conn = self.getConnection() conn.request('HEAD', '/', None, self.header) @@ -208,8 +209,9 @@ charset = charsetR.search(ct).group(1) return charset except: - wikipedia.output(u'Error retrieving server\'s default charset. Using UTF-8.') - return 'utf-8' + wikipedia.output(u'Error retrieving server\'s default charset. Using ISO 8859-1.') + # most browsers use ISO 8859-1 (Latin-1) as the default. + return 'iso8859-1'
def changeUrl(self, url): @@ -256,13 +258,14 @@ if response.status >= 300 and response.status <= 399: #print response.getheaders() redirTarget = response.getheader('Location') + redirTarget = unicode(redirTarget, self.getEncodingUsedByServer()) #print "redirTarget:", redirTarget if redirTarget: if redirTarget.startswith('http://') or redirTarget.startswith('https://'): self.changeUrl(redirTarget) return True elif redirTarget.startswith('/'): - self.changeUrl('%s://%s%s' % (self.protocol, self.host, redirTarget)) + self.changeUrl(u'%s://%s%s' % (self.protocol, self.host, redirTarget)) return True else: # redirect to relative position # cut off filename