jenkins-bot submitted this change.

View Change

Approvals: Rubin: Looks good to me, but someone else must approve; Xqt: Looks good to me, approved; jenkins-bot: Verified
[bugfix] check for LookupError exception in _try_decode

- check for LookupError or UnicodeDecodeError exception in
http._decide_encoding and use apparent_encoding logic of
requests.Response.
- _decide_encoding returns an encoding string or None.
If None is returned, requests.Response uses chardet to
detect the encoding. No UnicodeDecodeError is raised anymore.
Therefore remove the handling of this exception from error_handling_callback.
- Update tests accordingly

Bug: T276715
Change-Id: I76b690d986c61831587fd18e92919bcda4eeebe7
---
M pywikibot/comms/http.py
M tests/http_tests.py
2 files changed, 38 insertions(+), 46 deletions(-)

diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index a0f25fa..05f8bd2 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -310,11 +310,6 @@
if response.status_code not in (200, 207):
warning('Http response status {}'.format(response.status_code))

- if isinstance(response.encoding, UnicodeDecodeError):
- error('An error occurred for uri {}: '
- 'no encoding detected!'.format(response.request.url))
- raise response.encoding from None
-

@deprecated_args(callback=True, body='data')
def fetch(uri: str, method: str = 'GET', headers: Optional[dict] = None,
@@ -422,7 +417,7 @@
return response


-def _get_encoding_from_response_headers(response):
+def _get_encoding_from_response_headers(response) -> Optional[str]:
"""Return charset given by the response header."""
content_type = response.headers.get('content-type')

@@ -449,11 +444,19 @@
return header_encoding


-def _decide_encoding(response, charset):
+def _decide_encoding(response, charset) -> Optional[str]:
"""Detect the response encoding."""
def _try_decode(content, encoding):
"""Helper function to try decoding."""
- content.decode(encoding)
+ if encoding is None:
+ return None
+ try:
+ content.decode(encoding)
+ except (LookupError, UnicodeDecodeError):
+ pywikibot.warning('Unknown or invalid encoding {!r}'
+ .format(encoding))
+ # let chardet do the job
+ return None
return encoding

header_encoding = _get_encoding_from_response_headers(response)
@@ -475,14 +478,22 @@
return _try_decode(response.content, charset)

# Both charset and header_encoding are available.
- if codecs.lookup(header_encoding) != codecs.lookup(charset):
+ try:
+ header_codecs = codecs.lookup(header_encoding)
+ except LookupError:
+ header_codecs = None
+
+ try:
+ charset_codecs = codecs.lookup(charset)
+ except LookupError:
+ charset_codecs = None
+
+ if header_codecs and charset_codecs and header_codecs != charset_codecs:
pywikibot.warning(
'Encoding "{}" requested but "{}" received in the '
'response header.'.format(charset, header_encoding))

- try:
- _encoding = _try_decode(response.content, header_encoding)
- except UnicodeDecodeError:
- _encoding = _try_decode(response.content, charset)
+ _encoding = _try_decode(response.content, header_encoding) \
+ or _try_decode(response.content, charset)

return _encoding
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 09f2e61..b23f890 100644
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -488,41 +488,22 @@

def test_invalid_charset(self):
"""Test decoding with different and invalid charsets."""
- charset = 'utf16'
- resp = CharsetTestCase._create_response(
- data=CharsetTestCase.LATIN1_BYTES)
- # Ignore WARNING: Encoding "utf16" requested but "utf-8" received
- with patch('pywikibot.warning'):
- with self.assertRaisesRegex(
- UnicodeDecodeError,
- self.CODEC_CANT_DECODE_RE):
- http._decide_encoding(resp, charset)
- self.assertEqual(resp.content, CharsetTestCase.LATIN1_BYTES)
+ invalid_charsets = ('utf16', 'win-1251')
+ for charset in invalid_charsets:
+ with self.subTest(charset=charset):
+ resp = CharsetTestCase._create_response(
+ data=CharsetTestCase.LATIN1_BYTES)

- try:
- resp.encoding = http._decide_encoding(resp, charset)
- except UnicodeDecodeError as e:
- resp.encoding = e
+ with patch('pywikibot.warning'): # Ignore WARNING:
+ resp.encoding = http._decide_encoding(resp, charset)
+ self.assertIsNone(resp.encoding)
+ self.assertIsNotNone(resp.apparent_encoding)
+ self.assertEqual(resp.content, CharsetTestCase.LATIN1_BYTES)

- with patch('pywikibot.error'):
- with self.assertRaisesRegex(
- UnicodeDecodeError,
- self.CODEC_CANT_DECODE_RE):
- http.error_handling_callback(resp)
-
- # TODO: this is a breaking change
- # self.assertRaisesRegex(
- # UnicodeDecodeError, self.CODEC_CANT_DECODE_RE, lambda: resp.text)
-
- # Response() would do:
- # encoding = UnicodeDecodeError -> str(self.content, errors='replace')
- self.assertEqual(
- resp.text, str(resp.content, errors='replace'))
- # encoding = None -> str(resp.content, resp.encoding, errors='replace')
- resp.encoding = None
- self.assertEqual(
- resp.text,
- str(resp.content, resp.apparent_encoding, errors='replace'))
+ # test Response.apparent_encoding
+ self.assertEqual(resp.text, str(resp.content,
+ resp.apparent_encoding,
+ errors='replace'))


class BinaryTestCase(TestCase):

To view, visit change 669725. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I76b690d986c61831587fd18e92919bcda4eeebe7
Gerrit-Change-Number: 669725
Gerrit-PatchSet: 6
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Merlijn van Deen <valhallasw@arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Rubin <rubin.happy@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged