jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669725 )
Change subject: [bugfix] check for LookupError exception in _try_decode ......................................................................
[bugfix] check for LookupError exception in _try_decode
- check for LookupError or UnicodeDecodeError exception in http._decide_encoding and use apparent_encoding logic of requests.Response. - _decide_encoding returns an encoding string or None. If None is given, requests.Response uses chardet to detect encoding. No UnicodeDecodeError is raised anymore. Therefore remove this Exception from error_handling_callback. - Update tests accordingly
Bug: T276715 Change-Id: I76b690d986c61831587fd18e92919bcda4eeebe7 --- M pywikibot/comms/http.py M tests/http_tests.py 2 files changed, 38 insertions(+), 46 deletions(-)
Approvals: Rubin: Looks good to me, but someone else must approve Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py index a0f25fa..05f8bd2 100644 --- a/pywikibot/comms/http.py +++ b/pywikibot/comms/http.py @@ -310,11 +310,6 @@ if response.status_code not in (200, 207): warning('Http response status {}'.format(response.status_code))
- if isinstance(response.encoding, UnicodeDecodeError): - error('An error occurred for uri {}: ' - 'no encoding detected!'.format(response.request.url)) - raise response.encoding from None -
@deprecated_args(callback=True, body='data') def fetch(uri: str, method: str = 'GET', headers: Optional[dict] = None, @@ -422,7 +417,7 @@ return response
-def _get_encoding_from_response_headers(response): +def _get_encoding_from_response_headers(response) -> Optional[str]: """Return charset given by the response header.""" content_type = response.headers.get('content-type')
@@ -449,11 +444,19 @@ return header_encoding
-def _decide_encoding(response, charset): +def _decide_encoding(response, charset) -> Optional[str]: """Detect the response encoding.""" def _try_decode(content, encoding): """Helper function to try decoding.""" - content.decode(encoding) + if encoding is None: + return None + try: + content.decode(encoding) + except (LookupError, UnicodeDecodeError): + pywikibot.warning('Unknown or invalid encoding {!r}' + .format(encoding)) + # let chardet do the job + return None return encoding
header_encoding = _get_encoding_from_response_headers(response) @@ -475,14 +478,22 @@ return _try_decode(response.content, charset)
# Both charset and header_encoding are available. - if codecs.lookup(header_encoding) != codecs.lookup(charset): + try: + header_codecs = codecs.lookup(header_encoding) + except LookupError: + header_codecs = None + + try: + charset_codecs = codecs.lookup(charset) + except LookupError: + charset_codecs = None + + if header_codecs and charset_codecs and header_codecs != charset_codecs: pywikibot.warning( 'Encoding "{}" requested but "{}" received in the ' 'response header.'.format(charset, header_encoding))
- try: - _encoding = _try_decode(response.content, header_encoding) - except UnicodeDecodeError: - _encoding = _try_decode(response.content, charset) + _encoding = _try_decode(response.content, header_encoding) \ + or _try_decode(response.content, charset)
return _encoding diff --git a/tests/http_tests.py b/tests/http_tests.py index 09f2e61..b23f890 100644 --- a/tests/http_tests.py +++ b/tests/http_tests.py @@ -488,41 +488,22 @@
def test_invalid_charset(self): """Test decoding with different and invalid charsets.""" - charset = 'utf16' - resp = CharsetTestCase._create_response( - data=CharsetTestCase.LATIN1_BYTES) - # Ignore WARNING: Encoding "utf16" requested but "utf-8" received - with patch('pywikibot.warning'): - with self.assertRaisesRegex( - UnicodeDecodeError, - self.CODEC_CANT_DECODE_RE): - http._decide_encoding(resp, charset) - self.assertEqual(resp.content, CharsetTestCase.LATIN1_BYTES) + invalid_charsets = ('utf16', 'win-1251') + for charset in invalid_charsets: + with self.subTest(charset=charset): + resp = CharsetTestCase._create_response( + data=CharsetTestCase.LATIN1_BYTES)
- try: - resp.encoding = http._decide_encoding(resp, charset) - except UnicodeDecodeError as e: - resp.encoding = e + with patch('pywikibot.warning'): # Ignore WARNING: + resp.encoding = http._decide_encoding(resp, charset) + self.assertIsNone(resp.encoding) + self.assertIsNotNone(resp.apparent_encoding) + self.assertEqual(resp.content, CharsetTestCase.LATIN1_BYTES)
- with patch('pywikibot.error'): - with self.assertRaisesRegex( - UnicodeDecodeError, - self.CODEC_CANT_DECODE_RE): - http.error_handling_callback(resp) - - # TODO: this is a breaking change - # self.assertRaisesRegex( - # UnicodeDecodeError, self.CODEC_CANT_DECODE_RE, lambda: resp.text) - - # Response() would do: - # encoding = UnicodeDecodeError -> str(self.content, errors='replace') - self.assertEqual( - resp.text, str(resp.content, errors='replace')) - # encoding = None -> str(resp.content, resp.encoding, errors='replace') - resp.encoding = None - self.assertEqual( - resp.text, - str(resp.content, resp.apparent_encoding, errors='replace')) + # test Response.apparent_encoding + self.assertEqual(resp.text, str(resp.content, + resp.apparent_encoding, + errors='replace'))
class BinaryTestCase(TestCase):
pywikibot-commits@lists.wikimedia.org