[Pywikipedia-svn] SVN: [7282] trunk/pywikipedia/reflinks.py

nicdumz at svn.wikimedia.org
Mon Sep 21 10:37:33 UTC 2009


Revision: 7282
Author:   nicdumz
Date:     2009-09-21 10:37:31 +0000 (Mon, 21 Sep 2009)

Log Message:
-----------
Patch originally from Dispenser: uncompress gzipped content when necessary

Modified Paths:
--------------
    trunk/pywikipedia/reflinks.py

Modified: trunk/pywikipedia/reflinks.py
===================================================================
--- trunk/pywikipedia/reflinks.py	2009-09-19 19:41:50 UTC (rev 7281)
+++ trunk/pywikipedia/reflinks.py	2009-09-21 10:37:31 UTC (rev 7282)
@@ -41,7 +41,7 @@
 from BeautifulSoup import UnicodeDammit
 import sys, re, urllib2, httplib, socket, codecs, ftplib
 import wikipedia, pagegenerators, noreferences
-import subprocess, tempfile, os
+import subprocess, tempfile, os, gzip, StringIO
 
 stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
             'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe',
@@ -519,6 +519,15 @@
                             wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link)
                             continue
 
+                    # Uncompress the response if necessary
+                    if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
+                        # XXX: small issue here: the whole page is downloaded
+                        # through f.read(), which might fetch big files/pages.
+                        # However, truncating a gzipped stream is not an
+                        # option, because unzipping it would then fail.
+                        compressed = StringIO.StringIO(f.read())
+                        f = gzip.GzipFile(fileobj=compressed)
+
                     # Read the first 1,000,000 bytes (0.95 MB)
                     linkedpagetext = f.read(1000000)
                     socket.setdefaulttimeout(None)
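
The XXX comment above notes that gzip.GzipFile needs the complete compressed stream, so the patch must read the whole body via f.read() before the 1,000,000-byte truncation. For illustration only (this is not part of the commit), incremental decompression with zlib does tolerate a truncated input; a minimal Python 2 sketch, where read_limited is a hypothetical helper name:

    import urllib2, zlib

    MAXBYTES = 1000000  # same cap reflinks.py uses for linkedpagetext

    def read_limited(url, maxbytes=MAXBYTES):
        # Hypothetical helper, not part of the patch: fetch at most
        # maxbytes of *decoded* text, decompressing gzip incrementally
        # so stopping early does not break unzipping.
        f = urllib2.urlopen(url)
        if f.headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
            # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer
            d = zlib.decompressobj(16 + zlib.MAX_WBITS)
            text = ''
            while len(text) < maxbytes:
                chunk = f.read(65536)
                if not chunk:
                    break
                text += d.decompress(chunk)
            return text[:maxbytes]
        return f.read(maxbytes)

Another way to sidestep the problem entirely would be to request an uncompressed response (e.g. by not advertising gzip in Accept-Encoding), trading bandwidth for simplicity.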
