Revision: 7282 Author: nicdumz Date: 2009-09-21 10:37:31 +0000 (Mon, 21 Sep 2009)
Log Message: ----------- Patch originally from Dispenser: uncompress gzipped content when necessary
Modified Paths: -------------- trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py =================================================================== --- trunk/pywikipedia/reflinks.py 2009-09-19 19:41:50 UTC (rev 7281) +++ trunk/pywikipedia/reflinks.py 2009-09-21 10:37:31 UTC (rev 7282) @@ -41,7 +41,7 @@ from BeautifulSoup import UnicodeDammit import sys, re, urllib2, httplib, socket, codecs, ftplib import wikipedia, pagegenerators, noreferences -import subprocess, tempfile, os +import subprocess, tempfile, os, gzip, StringIO
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper', 'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe', @@ -519,6 +519,15 @@ wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link) continue
+ # uncompress if necessary + if headers.get('Content-Encoding') in ('gzip', 'x-gzip'): + # XXX: small issue here: the whole page is downloaded + # through f.read(). It might fetch big files/pages. + # However, truncating an encoded gzipped stream is not + # an option, for unzipping will fail. + compressed = StringIO.StringIO(f.read()) + f = gzip.GzipFile(fileobj=compressed) + # Read the first 1,000,000 bytes (0.95 MB) linkedpagetext = f.read(1000000) socket.setdefaulttimeout(None)