jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/392314 )
Change subject: [bugfix] fix xml decoding
......................................................................
[bugfix] fix xml decoding
- read xml encoding from xml file if no charset is given
- use utf-8 as default
- tests added
Bug: T180915
Change-Id: I4091eb8428b2c0bffbda657ee59583857363e1d5
---
M pywikibot/comms/threadedhttp.py
M tests/http_tests.py
2 files changed, 53 insertions(+), 1 deletion(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/threadedhttp.py b/pywikibot/comms/threadedhttp.py
index 4e1f9e1..3602ce3 100644
--- a/pywikibot/comms/threadedhttp.py
+++ b/pywikibot/comms/threadedhttp.py
@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
"""Http backend layer, formerly providing a httplib2 wrapper."""
from __future__ import absolute_import, unicode_literals
-# (C) Pywikibot team, 2007-2015
+# (C) Pywikibot team, 2007-2017
__version__ = '$Id$'
__docformat__ = 'epytext'
# standard python libraries
import codecs
+import re
import sys
if sys.version_info[0] > 2:
@@ -127,6 +128,14 @@
elif 'json' in content_type:
# application/json | application/sparql-results+json
self._header_encoding = 'utf-8'
+ elif 'xml' in content_type:
+ header = self.raw[:100].splitlines()[0] # bytestr in py3
+ m = re.search(br'encoding=("|'
+ br"')(?P<encoding>.+?)\1", header)
+ if m:
+ self._header_encoding = m.group('encoding').decode('utf-8')
+ else:
+ self._header_encoding = 'utf-8'
else:
self._header_encoding = None
return self._header_encoding
diff --git a/tests/http_tests.py b/tests/http_tests.py
index ac784bf..bd296c8 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -483,6 +483,49 @@
self.assertIsNone(req.charset)
self.assertEqual('utf-8', req.encoding)
+ def test_content_type_xml_without_charset(self):
+ """Test decoding without explicit charset but xml content."""
+ req = CharsetTestCase._create_request()
+ resp = requests.Response()
+ req._data = resp
+ resp._content = CharsetTestCase.UTF8_BYTES[:]
+ resp.headers = {'content-type': 'text/xml'}
+ self.assertIsNone(req.charset)
+ self.assertEqual('utf-8', req.encoding)
+
+ def test_content_type_xml_with_charset(self):
+ """Test xml content with utf-8 encoding given in content."""
+ req = CharsetTestCase._create_request()
+ resp = requests.Response()
+ req._data = resp
+ resp._content = '<?xml version="1.0" encoding="UTF-8"?>'.encode(
+ 'utf-8')
+ resp.headers = {'content-type': 'text/xml'}
+ self.assertIsNone(req.charset)
+ self.assertEqual('UTF-8', req.encoding)
+
+ def test_content_type_xml_with_charset_and_more_data(self):
+ """Test xml content with utf-8 encoding given in content."""
+ req = CharsetTestCase._create_request()
+ resp = requests.Response()
+ req._data = resp
+ resp._content = '<?xml version="1.0" encoding="UTF-8" someparam="ignored"?>'.encode(
+ 'utf-8')
+ resp.headers = {'content-type': 'text/xml'}
+ self.assertIsNone(req.charset)
+ self.assertEqual('UTF-8', req.encoding)
+
+ def test_content_type_xml_with_variant_charset(self):
+ """Test xml content with latin1 encoding given in content."""
+ req = CharsetTestCase._create_request()
+ resp = requests.Response()
+ req._data = resp
+ resp._content = "<?xml version='1.0' encoding='latin1'?>".encode(
+ 'latin1')
+ resp.headers = {'content-type': 'text/xml'}
+ self.assertIsNone(req.charset)
+ self.assertEqual('latin1', req.encoding)
+
def test_server_charset(self):
"""Test decoding with server explicit charset."""
req = CharsetTestCase._create_request()
--
To view, visit https://gerrit.wikimedia.org/r/392314
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I4091eb8428b2c0bffbda657ee59583857363e1d5
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Russell Blau <russblau(a)imapmail.org>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>