[Pywikipedia-svn] SVN: [7282] trunk/pywikipedia/reflinks.py
nicdumz at svn.wikimedia.org
Mon Sep 21 10:37:33 UTC 2009
Revision: 7282
Author: nicdumz
Date: 2009-09-21 10:37:31 +0000 (Mon, 21 Sep 2009)
Log Message:
-----------
Patch originally from Dispenser: uncompress gzipped content when necessary
Modified Paths:
--------------
trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py 2009-09-19 19:41:50 UTC (rev 7281)
+++ trunk/pywikipedia/reflinks.py 2009-09-21 10:37:31 UTC (rev 7282)
@@ -41,7 +41,7 @@
from BeautifulSoup import UnicodeDammit
import sys, re, urllib2, httplib, socket, codecs, ftplib
import wikipedia, pagegenerators, noreferences
-import subprocess, tempfile, os
+import subprocess, tempfile, os, gzip, StringIO
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
@@ -519,6 +519,15 @@
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link)
continue
+ # uncompress if necessary
+ if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
+ # XXX: small issue here: the whole page is downloaded
+ # through f.read(). It might fetch big files/pages.
+ # However, truncating an encoded gzipped stream is not
+ # an option, for unzipping will fail.
+ compressed = StringIO.StringIO(f.read())
+ f = gzip.GzipFile(fileobj=compressed)
+
# Read the first 1,000,000 bytes (0.95 MB)
linkedpagetext = f.read(1000000)
socket.setdefaulttimeout(None)
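
As the XXX note in the patch points out, f.read() buffers the entire compressed payload before GzipFile can decode it. If that ever becomes a problem, one possible alternative (a sketch only, not part of this commit) is to decompress incrementally with zlib.decompressobj, which tolerates a truncated stream; the helper name, chunk size, and limit below are illustrative:

    import zlib

    def read_gzipped(f, limit=1000000):
        # Decompress a gzip-encoded response incrementally, stopping once
        # `limit` decompressed bytes have been produced. zlib tolerates a
        # truncated stream, so the full payload never has to be buffered.
        # (16 + MAX_WBITS tells zlib to expect a gzip header and trailer.)
        d = zlib.decompressobj(16 + zlib.MAX_WBITS)
        chunks, total = [], 0
        while total < limit:
            raw = f.read(65536)
            if not raw:
                break
            data = d.decompress(raw)
            chunks.append(data)
            total += len(data)
        return ''.join(chunks)[:limit]

With such a helper, linkedpagetext = read_gzipped(f) would replace both the StringIO/GzipFile pair and the following f.read(1000000), reading compressed data only until the decompressed limit is reached.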