jenkins-bot has submitted this change and it was merged.
Change subject: [FIX] Http: Allow custom encoding
......................................................................
[FIX] Http: Allow custom encoding
This adds the ability to use a specific encoding and warns if the
encoding differs.
Change-Id: I03609e3d6ec9d8b7f72819358c11d62a792bf4c0
---
M pywikibot/comms/http.py
M pywikibot/comms/threadedhttp.py
M tests/http_tests.py
3 files changed, 133 insertions(+), 14 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 444f3c7..82f5ca3 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -222,7 +222,7 @@
@deprecate_arg('ssl', None)
-def request(site=None, uri=None, *args, **kwargs):
+def request(site=None, uri=None, charset=None, *args, **kwargs):
"""
Request to Site with default error handling and response decoding.
@@ -237,8 +237,12 @@
@type site: L{pywikibot.site.BaseSite}
@param uri: the URI to retrieve
@type uri: str
+ @param charset: Either a valid charset (usable for str.decode()) or None
+    to automatically choose the charset from the returned header (defaults
+ to latin-1)
+ @type charset: CodecInfo, str, None
@return: The received data
- @rtype: unicode
+ @rtype: a unicode string
"""
assert(site or uri)
if not site:
@@ -260,6 +264,7 @@
format_string = kwargs.setdefault("headers",
{}).get("user-agent")
kwargs["headers"]["user-agent"] = user_agent(site,
format_string)
+ kwargs['charset'] = charset
r = fetch(baseuri, *args, **kwargs)
return r.content
diff --git a/pywikibot/comms/threadedhttp.py b/pywikibot/comms/threadedhttp.py
index 8de4213..b7a7422 100644
--- a/pywikibot/comms/threadedhttp.py
+++ b/pywikibot/comms/threadedhttp.py
@@ -21,6 +21,7 @@
__docformat__ = 'epytext'
# standard python libraries
+import codecs
import re
import sys
import threading
@@ -28,7 +29,6 @@
if sys.version_info[0] > 2:
from http import cookiejar as cookielib
from urllib.parse import splittype, splithost, unquote, urlparse, urljoin
- unicode = str
else:
import cookielib
from urlparse import urlparse, urljoin
@@ -337,7 +337,7 @@
"""
def __init__(self, uri, method="GET", body=None, headers=None,
- callbacks=None, **kwargs):
+ callbacks=None, charset=None, **kwargs):
"""
Constructor.
@@ -347,6 +347,14 @@
self.method = method
self.body = body
self.headers = headers
+ if isinstance(charset, codecs.CodecInfo):
+ self.charset = charset.name
+ elif charset:
+ self.charset = charset
+ elif headers and 'accept-charset' in headers:
+ self.charset = headers['accept-charset']
+ else:
+ self.charset = None
self.callbacks = callbacks
@@ -418,22 +426,60 @@
return self.response_headers.status
@property
+ def header_encoding(self):
+ """Return charset given by the response header."""
+ if not hasattr(self, '_header_encoding'):
+ pos = self.response_headers['content-type'].find('charset=')
+ if pos >= 0:
+ pos += len('charset=')
+ encoding = self.response_headers['content-type'][pos:]
+ self._header_encoding = encoding
+ else:
+ self._header_encoding = None
+ return self._header_encoding
+
+ @property
def encoding(self):
"""Detect the response encoding."""
- pos = self.response_headers['content-type'].find('charset=')
- if pos >= 0:
- pos += len('charset=')
- encoding = self.response_headers['content-type'][pos:]
- else:
- encoding = 'ascii'
- # Don't warn, many pages don't contain one
- pywikibot.log(u"Http response doesn't contain a charset.")
+ if not hasattr(self, '_encoding'):
+ if not self.charset and not self.header_encoding:
+ pywikibot.log(u"Http response doesn't contain a charset.")
+ charset = 'latin1'
+ else:
+ charset = self.charset
+ if (self.header_encoding and codecs.lookup(self.header_encoding) !=
+ (codecs.lookup(charset) if charset else None)):
+ if charset:
+ pywikibot.warning(u'Encoding "{0}" requested but
"{1}" '
+ 'received in the header.'.format(
+ charset, self.header_encoding))
+ try:
+ # TODO: Buffer decoded content, weakref does remove it too
+ # early (directly after this method)
+ self.raw.decode(self.header_encoding)
+ except UnicodeError as e:
+ self._encoding = e
+ else:
+ self._encoding = self.header_encoding
+ else:
+ self._encoding = None
- return encoding
+ if charset and (isinstance(self._encoding, Exception) or
+ not self._encoding):
+ try:
+ self.raw.decode(charset)
+ except UnicodeError as e:
+ self._encoding = e
+ else:
+ self._encoding = charset
+
+ if isinstance(self._encoding, Exception):
+ raise self._encoding
+ return self._encoding
def decode(self, encoding):
"""Return the decoded response."""
- return self.raw.decode(encoding)
+ return self.raw.decode(self.encoding)
@property
def content(self):
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 4b1a885..722687f 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -310,6 +310,74 @@
self.assertIn('Python/' + str(sys.version_info[0]), http.user_agent())
+class CharsetTestCase(TestCase):
+
+    """Test that HttpRequest correctly handles the charsets
given."""
+
+ net = False
+
+ STR = u'äöü'
+ LATIN1_BYTES = STR.encode('latin1')
+ UTF8_BYTES = STR.encode('utf8')
+
+ @staticmethod
+ def _create_request(charset=None, data=UTF8_BYTES):
+ req = threadedhttp.HttpRequest(None, charset=charset)
+ req._data = ({'content-type': 'charset=utf-8'}, data[:])
+ return req
+
+ def test_no_charset(self):
+ """Test decoding without explicit charset."""
+ req = threadedhttp.HttpRequest(None)
+ req._data = ({'content-type': ''},
CharsetTestCase.LATIN1_BYTES[:])
+ self.assertIsNone(req.charset)
+ self.assertEqual('latin1', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_server_charset(self):
+ """Test decoding with server explicit charset."""
+ req = CharsetTestCase._create_request()
+ self.assertIsNone(req.charset)
+ self.assertEqual('utf-8', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.UTF8_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_same_charset(self):
+ """Test decoding with explicit and equal
charsets."""
+ req = CharsetTestCase._create_request('utf-8')
+ self.assertEqual('utf-8', req.charset)
+ self.assertEqual('utf-8', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.UTF8_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_header_charset(self):
+ """Test decoding with different charsets and valid header
charset."""
+ req = CharsetTestCase._create_request('latin1')
+ self.assertEqual('latin1', req.charset)
+ self.assertEqual('utf-8', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.UTF8_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_code_charset(self):
+ """Test decoding with different charsets and invalid header
charset."""
+ req = CharsetTestCase._create_request('latin1',
+ CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual('latin1', req.charset)
+ self.assertEqual('latin1', req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual(req.content, CharsetTestCase.STR)
+
+ def test_invalid_charset(self):
+ """Test decoding with different and invalid
charsets."""
+ req = CharsetTestCase._create_request('utf16',
+ CharsetTestCase.LATIN1_BYTES)
+ self.assertEqual('utf16', req.charset)
+ self.assertRaises(UnicodeDecodeError, lambda: req.encoding)
+ self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
+ self.assertRaises(UnicodeDecodeError, lambda: req.content)
+
+
if __name__ == '__main__':
try:
unittest.main()
--
To view, visit
https://gerrit.wikimedia.org/r/158848
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I03609e3d6ec9d8b7f72819358c11d62a792bf4c0
Gerrit-PatchSet: 18
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>