jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789774 )
Change subject: [bugfix] Improve get_charset_from_content_type function ......................................................................
[bugfix] Improve get_charset_from_content_type function
- remove delimiter in front of the code number - replace win/windows with cp - remove language code in front of win/windows like in sr-win1250
Bug: T307760 Change-Id: I4b13dee432b947dbd4db4846ef435d8b41d7a2b1 --- M pywikibot/comms/http.py M tests/http_tests.py 2 files changed, 15 insertions(+), 2 deletions(-)
Approvals: Rubin: Looks good to me, but someone else must approve Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py index 792a44d..a828e41 100644 --- a/pywikibot/comms/http.py +++ b/pywikibot/comms/http.py @@ -415,8 +415,13 @@ if re.sub(r'[ _-]', '', charset) == 'xeucjp': charset = 'euc_jp' else: - # fix cp encodings (T304830) - charset = re.sub(r'\Acp[ _-](\d{3,4})', r'cp\1', charset) + # fix cp encodings (T304830, T307760) + # remove delimiter in front of the code number + # replace win/windows with cp + # remove language code in font of win/windows + charset = re.sub( + r'\A(?:cp[ _-]|(?:[a-z]+[_-]?)?win(?:dows[_-]?)?)(\d{3,4})', + r'cp\1', charset) return charset
diff --git a/tests/http_tests.py b/tests/http_tests.py index 491eea5..fddad67 100755 --- a/tests/http_tests.py +++ b/tests/http_tests.py @@ -477,6 +477,14 @@ resp.apparent_encoding, errors='replace'))
+ def test_get_charset_from_content_type(self): + """Test get_charset_from_content_type function.""" + self.assertEqual( + http.get_charset_from_content_type('charset="cp-1251"'), 'cp1251') + self.assertEqual( + http.get_charset_from_content_type('charset="ru-win1251"'), + 'cp1251') +
class BinaryTestCase(TestCase):
pywikibot-commits@lists.wikimedia.org