http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11020
Revision: 11020
Author:   drtrigon
Date:     2013-01-31 17:36:03 +0000 (Thu, 31 Jan 2013)

Log Message:
-----------
catch AssertionError (if charset mismatches) but print a warning
suppress output (if back_response=True) but still print errors in verbose mode
warn if/before attempting to download big content (>10MB)
PEP 8 and docs
Modified Paths:
--------------
    trunk/pywikipedia/pywikibot/comms/http.py
Modified: trunk/pywikipedia/pywikibot/comms/http.py
===================================================================
--- trunk/pywikipedia/pywikibot/comms/http.py	2013-01-30 23:56:18 UTC (rev 11019)
+++ trunk/pywikipedia/pywikibot/comms/http.py	2013-01-31 17:36:03 UTC (rev 11020)
@@ -58,20 +58,25 @@
         return self._buffer[name]
-def request(site, uri, retry = None, sysop = False, data = None, compress = True,
-            no_hostname = False, cookie_only=False, refer=None, back_response=False):
+def request(site, uri, retry=None, sysop=False, data=None, compress=True,
+            no_hostname=False, cookie_only=False, refer=None,
+            back_response=False):
     """ Low-level routine to get a URL from any source (may be the wiki).
         Parameters:
-    @param site - The Site to connect to.
-    @param uri - The absolute uri, without the hostname.
-    @param retry - If True, retries loading the page when a network error
-        occurs.
-    @param sysop - If True, the sysop account's cookie will be used.
-    @param data - An optional dict providing extra post request
-        parameters.
-    @param cookie_only - Only return the cookie the server sent us back
+    @param site          - The Site to connect to.
+    @param uri           - The absolute uri, without the hostname.
+    @param retry         - If True, retries loading the page when a network
+                           error occurs.
+    @param sysop         - If True, the sysop account's cookie will be used.
+    @param data          - An optional dict providing extra post request
+                           parameters.
+    @param compress      - Accept compressed page content transfer also.
+    @param no_hostname   - Do query to foreign host (any kind of web-server).
+    @param cookie_only   - Only return the cookie the server sent us back
+    @param refer         - ...
+    @param back_response - Return the addinfourl object from request too.
     @return: Returns the HTML text of the page converted to unicode.
     """
@@ -114,20 +119,24 @@
         f = buffered_addinfourl(MyURLopener.open(req))
# read & info can raise socket.error + headers = f.info() + if (int(headers.get('content-length', '-1')) > 1E7): + pywikibot.output(u'WARNING: Target is of huge size (>10MB) is ' + u'that correct? Downloading will take some ' + u'time, please be patient.') text = f.read() - headers = f.info() break except KeyboardInterrupt: raise except urllib2.HTTPError, e: if e.code in [401, 404]: raise PageNotFound( -u'Page %s could not be retrieved. Check your family file.' - % url) + u'Page %s could not be retrieved. Check your family file.' + % url) elif e.code in [403]: raise PageNotFound( -u'Page %s could not be retrieved. Check your virus wall.' - % url) + u'Page %s could not be retrieved. Check your virus wall.' + % url) elif e.code == 504: pywikibot.output(u'HTTPError: %s %s' % (e.code, e.msg)) if retry: @@ -135,8 +144,9 @@ if retry_attempt > config.maxretries: raise MaxTriesExceededError() pywikibot.output( -u"WARNING: Could not open '%s'.Maybe the server or\n your connection is down. Retrying in %i minutes..." - % (url, retry_idle_time)) + u"WARNING: Could not open '%s'.Maybe the server or\n " + u"your connection is down. Retrying in %i minutes..." + % (url, retry_idle_time)) time.sleep(retry_idle_time * 60) # Next time wait longer, # but not longer than half an hour @@ -155,8 +165,9 @@ if retry_attempt > config.maxretries: raise MaxTriesExceededError() pywikibot.output( -u"WARNING: Could not open '%s'. Maybe the server or\n your connection is down. Retrying in %i minutes..." - % (url, retry_idle_time)) + u"WARNING: Could not open '%s'. Maybe the server or\n your " + u"connection is down. Retrying in %i minutes..." + % (url, retry_idle_time)) time.sleep(retry_idle_time * 60) retry_idle_time *= 2 if retry_idle_time > 30: @@ -206,17 +217,29 @@ # UTF-8 as default charset = 'utf-8' # Check if this is the charset we expected - site.checkCharset(charset) + try: + site.checkCharset(charset) + except AssertionError, e: + if (not back_response) or verbose: + pywikibot.output(u'%s' %e) + if no_hostname: + pywikibot.output(u'ERROR: Invalid charset found on %s.' % uri) + else: + pywikibot.output(u'ERROR: Invalid charset found on %s://%s%s.' + % (site.protocol(), site.hostname(), uri)) # Convert HTML to Unicode try: text = unicode(text, charset, errors = 'strict') except UnicodeDecodeError, e: - if verbose: + if (not back_response) or verbose: pywikibot.output(u'%s' %e) - if no_hostname: - pywikibot.output(u'ERROR: Invalid characters found on %s, replaced by \ufffd.' % uri) - else: - pywikibot.output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \ufffd.' % (site.protocol(), site.hostname(), uri)) + if no_hostname: + pywikibot.output(u'ERROR: Invalid characters found on %s, ' + u'replaced by \ufffd.' % uri) + else: + pywikibot.output(u'ERROR: Invalid characters found on %s://%s%s, ' + u'replaced by \ufffd.' + % (site.protocol(), site.hostname(), uri)) # We use error='replace' in case of bad encoding. text = unicode(text, charset, errors = 'replace')