jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/392314 )
Change subject: [bugfix] fix xml decoding
......................................................................
[bugfix] fix xml decoding
- read xml encoding from xml file if no charset is given
- use utf-8 as default
- tests added
Bug: T180915
Change-Id: I4091eb8428b2c0bffbda657ee59583857363e1d5
---
M pywikibot/comms/threadedhttp.py
M tests/http_tests.py
2 files changed, 53 insertions(+), 1 deletion(-)
Approvals:
  Merlijn van Deen: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/comms/threadedhttp.py b/pywikibot/comms/threadedhttp.py index 4e1f9e1..3602ce3 100644 --- a/pywikibot/comms/threadedhttp.py +++ b/pywikibot/comms/threadedhttp.py @@ -1,13 +1,14 @@ # -*- coding: utf-8 -*- """Http backend layer, formerly providing a httplib2 wrapper.""" from __future__ import absolute_import, unicode_literals -# (C) Pywikibot team, 2007-2015 +# (C) Pywikibot team, 2007-2017
__version__ = '$Id$' __docformat__ = 'epytext'
# standard python libraries import codecs +import re import sys
if sys.version_info[0] > 2: @@ -127,6 +128,14 @@ elif 'json' in content_type: # application/json | application/sparql-results+json self._header_encoding = 'utf-8' + elif 'xml' in content_type: + header = self.raw[:100].splitlines()[0] # bytestr in py3 + m = re.search(br'encoding=("|' + br"')(?P<encoding>.+?)\1", header) + if m: + self._header_encoding = m.group('encoding').decode('utf-8') + else: + self._header_encoding = 'utf-8' else: self._header_encoding = None return self._header_encoding diff --git a/tests/http_tests.py b/tests/http_tests.py index ac784bf..bd296c8 100644 --- a/tests/http_tests.py +++ b/tests/http_tests.py @@ -483,6 +483,49 @@ self.assertIsNone(req.charset) self.assertEqual('utf-8', req.encoding)
+ def test_content_type_xml_without_charset(self): + """Test decoding without explicit charset but xml content.""" + req = CharsetTestCase._create_request() + resp = requests.Response() + req._data = resp + resp._content = CharsetTestCase.UTF8_BYTES[:] + resp.headers = {'content-type': 'text/xml'} + self.assertIsNone(req.charset) + self.assertEqual('utf-8', req.encoding) + + def test_content_type_xml_with_charset(self): + """Test xml content with utf-8 encoding given in content.""" + req = CharsetTestCase._create_request() + resp = requests.Response() + req._data = resp + resp._content = '<?xml version="1.0" encoding="UTF-8"?>'.encode( + 'utf-8') + resp.headers = {'content-type': 'text/xml'} + self.assertIsNone(req.charset) + self.assertEqual('UTF-8', req.encoding) + + def test_content_type_xml_with_charset_and_more_data(self): + """Test xml content with utf-8 encoding given in content.""" + req = CharsetTestCase._create_request() + resp = requests.Response() + req._data = resp + resp._content = '<?xml version="1.0" encoding="UTF-8" someparam="ignored"?>'.encode( + 'utf-8') + resp.headers = {'content-type': 'text/xml'} + self.assertIsNone(req.charset) + self.assertEqual('UTF-8', req.encoding) + + def test_content_type_xml_with_variant_charset(self): + """Test xml content with latin1 encoding given in content.""" + req = CharsetTestCase._create_request() + resp = requests.Response() + req._data = resp + resp._content = "<?xml version='1.0' encoding='latin1'?>".encode( + 'latin1') + resp.headers = {'content-type': 'text/xml'} + self.assertIsNone(req.charset) + self.assertEqual('latin1', req.encoding) + def test_server_charset(self): """Test decoding with server explicit charset.""" req = CharsetTestCase._create_request()
pywikibot-commits@lists.wikimedia.org