Revision: 7282 Author: nicdumz Date: 2009-09-21 10:37:31 +0000 (Mon, 21 Sep 2009)
Log Message: ----------- Patch originally from Dispenser: uncompress gzipped content when necessary
Modified Paths: -------------- trunk/pywikipedia/reflinks.py
Modified: trunk/pywikipedia/reflinks.py =================================================================== --- trunk/pywikipedia/reflinks.py 2009-09-19 19:41:50 UTC (rev 7281) +++ trunk/pywikipedia/reflinks.py 2009-09-21 10:37:31 UTC (rev 7282) @@ -41,7 +41,7 @@ from BeautifulSoup import UnicodeDammit import sys, re, urllib2, httplib, socket, codecs, ftplib import wikipedia, pagegenerators, noreferences -import subprocess, tempfile, os +import subprocess, tempfile, os, gzip, StringIO
stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper', 'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe', @@ -519,6 +519,15 @@ wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link) continue
+ # uncompress if necessary + if headers.get('Content-Encoding') in ('gzip', 'x-gzip'): + # XXX: small issue here: the whole page is downloaded + # through f.read(). It might fetch big files/pages. + # However, truncating an encoded gzipped stream is not + # an option, for unzipping will fail. + compressed = StringIO.StringIO(f.read()) + f = gzip.GzipFile(fileobj=compressed) + # Read the first 1,000,000 bytes (0.95 MB) linkedpagetext = f.read(1000000) socket.setdefaulttimeout(None)