Revision: 6010
Author: wikipedian
Date: 2008-10-21 21:48:05 +0000 (Tue, 21 Oct 2008)
Log Message:
-----------
fixed bug [ 2158249 ] weblinkchecker.py doesn't report
archive.org links anymore
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2008-10-21 15:10:05 UTC (rev 6009)
+++ trunk/pywikipedia/weblinkchecker.py 2008-10-21 21:48:05 UTC (rev 6010)
@@ -262,8 +262,14 @@
return None
except UnicodeEncodeError:
return None
- text = f.read()
- if text.find("Search Results for ") != -1:
+ data = f.read()
+ if f.headers.get('content-encoding', None) == 'gzip':
+ # Since 2008, the Internet Archive returns pages in gzip-compressed
+ # format. Unfortunately, urllib2 doesn't handle
+ # the decompression for us, so we have to do it ourselves.
+ import gzip, StringIO
+ data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+ if data.find("Search Results for ") != -1:
return archiveURL
else:
return None