Revision: 6010 Author: wikipedian Date: 2008-10-21 21:48:05 +0000 (Tue, 21 Oct 2008)
Log Message: ----------- fixed bug [ 2158249 ] weblinkchecker.py doesn't report archive.org links anymore
Modified Paths: -------------- trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py	2008-10-21 15:10:05 UTC (rev 6009)
+++ trunk/pywikipedia/weblinkchecker.py	2008-10-21 21:48:05 UTC (rev 6010)
@@ -262,8 +262,14 @@
             return None
         except UnicodeEncodeError:
             return None
-        text = f.read()
-        if text.find("Search Results for ") != -1:
+        data = f.read()
+        if f.headers.get('content-encoding', None) == 'gzip':
+            # Since 2008, the Internet Archive returns pages in GZIPed
+            # compression format. Unfortunatelly urllib2 doesn't handle
+            # the decompression for us, so we have to do it ourselves.
+            import gzip, StringIO
+            data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+        if data.find("Search Results for ") != -1:
             return archiveURL
         else:
             return None
pywikipedia-l@lists.wikimedia.org