jenkins-bot has submitted this change and it was merged.
Change subject: [FIX] Http: Allow custom encoding
......................................................................
[FIX] Http: Allow custom encoding
This adds the ability to use a specific encoding and warns if the
encoding differs.
Change-Id: I03609e3d6ec9d8b7f72819358c11d62a792bf4c0
---
M pywikibot/comms/http.py
M pywikibot/comms/threadedhttp.py
M tests/http_tests.py
3 files changed, 133 insertions(+), 14 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 444f3c7..82f5ca3 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -222,7 +222,7 @@
@deprecate_arg('ssl', None)
-def request(site=None, uri=None, *args, **kwargs):
+def request(site=None, uri=None, charset=None, *args, **kwargs):
"""
Request to Site with default error handling and response decoding.
@@ -237,8 +237,12 @@
@type site: L{pywikibot.site.BaseSite}
@param uri: the URI to retrieve
@type uri: str
+ @param charset: Either a valid charset (usable for str.decode()) or None
+    to automatically choose the charset from the returned header (defaults
+ to latin-1)
+ @type charset: CodecInfo, str, None
@return: The received data
- @rtype: unicode
+ @rtype: a unicode string
"""
assert(site or uri)
if not site:
@@ -260,6 +264,7 @@
format_string = kwargs.setdefault("headers",
{}).get("user-agent")
kwargs["headers"]["user-agent"] = user_agent(site,
format_string)
+ kwargs['charset'] = charset
r = fetch(baseuri, *args, **kwargs)
return r.content
diff --git a/pywikibot/comms/threadedhttp.py b/pywikibot/comms/threadedhttp.py
index 8de4213..b7a7422 100644
--- a/pywikibot/comms/threadedhttp.py
+++ b/pywikibot/comms/threadedhttp.py
@@ -21,6 +21,7 @@
__docformat__ = 'epytext'
# standard python libraries
+import codecs
import re
import sys
import threading
@@ -28,7 +29,6 @@
if sys.version_info[0] > 2:
from http import cookiejar as cookielib
from urllib.parse import splittype, splithost, unquote, urlparse, urljoin
- unicode = str
else:
import cookielib
from urlparse import urlparse, urljoin
@@ -337,7 +337,7 @@
"""
def __init__(self, uri, method="GET", body=None, headers=None,
- callbacks=None, **kwargs):
+ callbacks=None, charset=None, **kwargs):
"""
Constructor.
@@ -347,6 +347,14 @@
self.method = method
self.body = body
self.headers = headers
+ if isinstance(charset, codecs.CodecInfo):
+ self.charset = charset.name
+ elif charset:
+ self.charset = charset
+ elif headers and 'accept-charset' in headers:
+ self.charset = headers['accept-charset']
+ else:
+ self.charset = None
self.callbacks = callbacks
@@ -418,22 +426,60 @@
return self.response_headers.status
@property
+ def header_encoding(self):
+ """Return charset given by the response header."""
+ if not hasattr(self, '_header_encoding'):
+ pos = self.response_headers['content-type'].find('charset=')
+ if pos >= 0:
+ pos += len('charset=')
+ encoding = self.response_headers['content-type'][pos:]
+ self._header_encoding = encoding
+ else:
+ self._header_encoding = None
+ return self._header_encoding
+
+ @property
def encoding(self):
"""Detect the response encoding."""
- pos = self.response_headers['content-type'].find('charset=')
- if pos >= 0:
- pos += len('charset=')
- encoding = self.response_headers['content-type'][pos:]
- else:
- encoding = 'ascii'
- # Don't warn, many pages don't contain one
- pywikibot.log(u"Http response doesn't contain a charset.")
+ if not hasattr(self, '_encoding'):
+ if not self.charset and not self.header_encoding:
+ pywikibot.log(u"Http response doesn't contain a charset.")
+ charset = 'latin1'
+ else:
+ charset = self.charset
+ if (self.header_encoding and codecs.lookup(self.header_encoding) !=
+ (codecs.lookup(charset) if charset else None)):
+ if charset:
+ pywikibot.warning(u'Encoding "{0}" requested but
"{1}" '
+ 'received in the header.'.format(
+ charset, self.header_encoding))
+ try:
+ # TODO: Buffer decoded content, weakref does remove it too
+ # early (directly after this method)
+ self.raw.decode(self.header_encoding)
+ except UnicodeError as e:
+ self._encoding = e
+ else:
+ self._encoding = self.header_encoding
+ else:
+ self._encoding = None
- return encoding
+ if charset and (isinstance(self._encoding, Exception) or
+ not self._encoding):
+ try:
+ self.raw.decode(charset)
+ except UnicodeError as e:
+ self._encoding = e
+ else:
+ self._encoding = charset
+
+ if isinstance(self._encoding, Exception):
+ raise self._encoding
+ return self._encoding
def decode(self, encoding):
"""Return the decoded response."""
- return self.raw.decode(encoding)
+ return self.raw.decode(self.encoding)
@property
def content(self):
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 4b1a885..722687f 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -310,6 +310,74 @@
self.assertIn('Python/' + str(sys.version_info[0]), http.user_agent())
+class CharsetTestCase(TestCase):
+
+    """Test that HttpRequest correctly handles the charsets
given."""
+
+ net = False
+
+ STR = u'äöü'
+ LATIN1_BYTES = STR.encode('latin1')
+ UTF8_BYTES = STR.encode('utf8')
+
+ @staticmethod
+ def _create_request(charset=None, data=UTF8_BYTES):
+ req = threadedhttp.HttpRequest(None, charset=charset)
+ req._data = ({'content-type': 'charset=utf-8'}, data[:])
+ return req
+
+ def test_no_charset(self):
+ """Test decoding without explicit charset."""
+ req = threadedhttp.HttpRequest(None)
+ req._data = ({'content-type': ''},
CharsetTestCase.LATIN1_BYTES[:])
+ self.assertIsNone(req.charset)
+ self.assertEqual('latin1', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_server_charset(self):
+ """Test decoding with server explicit charset."""
+ req = CharsetTestCase._create_request()
+ self.assertIsNone(req.charset)
+ self.assertEqual('utf-8', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.UTF8_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_same_charset(self):
+ """Test decoding with explicit and equal
charsets."""
+ req = CharsetTestCase._create_request('utf-8')
+ self.assertEqual('utf-8', req.charset)
+ self.assertEqual('utf-8', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.UTF8_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_header_charset(self):
+ """Test decoding with different charsets and valid header
charset."""
+ req = CharsetTestCase._create_request('latin1')
+ self.assertEqual('latin1', req.charset)
+ self.assertEqual('utf-8', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.UTF8_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_code_charset(self):
+ """Test decoding with different charsets and invalid header
charset."""
+ req = CharsetTestCase._create_request('latin1',
+ CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual('latin1', req.charset)
+ self.assertEqual('latin1', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_invalid_charset(self):
+ """Test decoding with different and invalid
charsets."""
+ req = CharsetTestCase._create_request('utf16',
+ CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual('utf16', req.charset)
+ self.assertRaises(UnicodeDecodeError, lambda: req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
+ self.assertRaises(UnicodeDecodeError, lambda: req.content)
+
+
if __name__ == '__main__':
try:
unittest.main()
--
To view, visit
https://gerrit.wikimedia.org/r/158848
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I03609e3d6ec9d8b7f72819358c11d62a792bf4c0
Gerrit-PatchSet: 18
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>