jenkins-bot has submitted this change and it was merged.
Change subject: Assume utf-8 encoding for JSON
......................................................................
Assume utf-8 encoding for JSON
The Wikidata SPARQL endpoint returns
'application/sparql-results+json' without specifying
a charset.
For JSON-based content, UTF-8 should be a good default
(better than the current latin1 default, which could
alternatively be changed).
Change-Id: Icabc98c183e28d013947f27e59e1459361cc4b9f
---
M pywikibot/comms/threadedhttp.py
M tests/http_tests.py
2 files changed, 25 insertions(+), 1 deletion(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/threadedhttp.py b/pywikibot/comms/threadedhttp.py
index 2c3be33..2ae6ed5 100644
--- a/pywikibot/comms/threadedhttp.py
+++ b/pywikibot/comms/threadedhttp.py
@@ -117,11 +117,15 @@
def header_encoding(self):
"""Return charset given by the response header."""
if not hasattr(self, '_header_encoding'):
- pos = self.response_headers['content-type'].find('charset=')
+ content_type = self.response_headers['content-type']
+ pos = content_type.find('charset=')
if pos >= 0:
pos += len('charset=')
encoding = self.response_headers['content-type'][pos:]
self._header_encoding = encoding
+ elif 'json' in content_type:
+ # application/json | application/sparql-results+json
+ self._header_encoding = 'utf-8'
else:
self._header_encoding = None
return self._header_encoding
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 2e51928..9a237a0 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -315,6 +315,26 @@
self.assertEqual(req.raw, CharsetTestCase.LATIN1_BYTES)
self.assertEqual(req.content, CharsetTestCase.STR)
+ def test_content_type_application_json_without_charset(self):
+ """Test decoding without explicit charset but JSON content."""
+ req = CharsetTestCase._create_request()
+ resp = requests.Response()
+ req._data = resp
+ resp._content = CharsetTestCase.UTF8_BYTES[:]
+ resp.headers = {'content-type': 'application/json'}
+ self.assertIsNone(req.charset)
+ self.assertEqual('utf-8', req.encoding)
+
+ def test_content_type_sparql_json_without_charset(self):
+ """Test decoding without explicit charset but JSON content."""
+ req = CharsetTestCase._create_request()
+ resp = requests.Response()
+ req._data = resp
+ resp._content = CharsetTestCase.UTF8_BYTES[:]
+ resp.headers = {'content-type': 'application/sparql-results+json'}
+ self.assertIsNone(req.charset)
+ self.assertEqual('utf-8', req.encoding)
+
def test_server_charset(self):
"""Test decoding with server explicit charset."""
req = CharsetTestCase._create_request()
--
To view, visit
https://gerrit.wikimedia.org/r/304680
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icabc98c183e28d013947f27e59e1459361cc4b9f
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Jberkel <jan.berkel(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>