jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/783432 )
Change subject: Add get_charset_from_content_type
......................................................................
Add get_charset_from_content_type
used to extract the charset from the content-type response header
Change-Id: I7216488c9582f6de92034378b1d588e8dbfbc717
---
M pywikibot/comms/http.py
M scripts/reflinks.py
M tests/http_tests.py
3 files changed, 49 insertions(+), 32 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
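A quick illustration of what the new helper returns for a few representative content-type values. The header strings here are invented for this sketch, and the expected results (shown as comments) follow from the CHARSET_RE pattern and normalisation added in the diff below; running it assumes pywikibot 7.3 or later:

from pywikibot.comms.http import get_charset_from_content_type

# charset as the only parameter
get_charset_from_content_type('text/html; charset=UTF-8')        # 'utf-8'
# charset followed by further parameters, the case this change fixes
get_charset_from_content_type(
    'text/html; charset=utf-8; profile='
    '"https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"')          # 'utf-8'
# Windows code page spellings are normalised (T304830)
get_charset_from_content_type('text/plain; charset=cp-1252')     # 'cp1252'
# x-euc-jp is mapped to Python's euc_jp codec name
get_charset_from_content_type('text/html; charset=x-euc-jp')     # 'euc_jp'
# no charset parameter at all
get_charset_from_content_type('application/json')                # None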
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index d683dfe..792a44d 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -395,6 +395,31 @@
     return response


+# Extract charset (from content-type header)
+CHARSET_RE = re.compile(
+    r'charset\s*=\s*(?P<q>[\'"]?)(?P<charset>[^\'",;>/]+)(?P=q)',
+    flags=re.I,
+)
+
+
+def get_charset_from_content_type(content_type: str) -> Optional[str]:
+    """Get charset from the content-type header.
+
+    .. versionadded:: 7.3
+    """
+    m = CHARSET_RE.search(content_type)
+    if not m:
+        return None
+    charset = m.group('charset').strip('"\' ').lower()
+    # Convert to python correct encoding names
+    if re.sub(r'[ _-]', '', charset) == 'xeucjp':
+        charset = 'euc_jp'
+    else:
+        # fix cp encodings (T304830)
+        charset = re.sub(r'\Acp[ _-](\d{3,4})', r'cp\1', charset)
+    return charset
+
+
 def _get_encoding_from_response_headers(response) -> Optional[str]:
     """Return charset given by the response header."""
     content_type = response.headers.get('content-type')
@@ -402,9 +427,9 @@
     if not content_type:
         return None

-    m = re.search('charset=(?P<charset>.*?$)', content_type)
-    if m:
-        header_encoding = m.group('charset')
+    charset = get_charset_from_content_type(content_type)
+    if charset:
+        header_encoding = charset
     elif 'json' in content_type:
         # application/json | application/sparql-results+json
         header_encoding = 'utf-8'
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 67283a7..d8e571c 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -58,12 +58,12 @@
 from functools import partial
 from http import HTTPStatus
 from textwrap import shorten
-from typing import Optional
 import pywikibot
 from pywikibot import comms, config, i18n, pagegenerators, textlib
-from pywikibot.backports import Match, removeprefix
+from pywikibot.backports import removeprefix
 from pywikibot.bot import ConfigParserBot, ExistingPageBot, SingleSiteBot
+from pywikibot.comms.http import get_charset_from_content_type
 from pywikibot.exceptions import (
     FatalServerError,
     Server414Error,
@@ -474,9 +474,6 @@
         # Regex to grasp content-type meta HTML tag in HTML source
         self.META_CONTENT = re.compile(
             br'(?i)<meta[^>]*(?:content-type|charset)[^>]*>')
-        # Extract the encoding from a charset property (from content-type !)
-        self.CHARSET = re.compile(
-            r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
         # Extract html title from page
         self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
         # Matches content inside <script>/<style>/HTML comments
@@ -549,21 +546,6 @@
             return True
         return super().skip_page(page)

-    @staticmethod
-    def charset(enc: Match) -> Optional[str]:
-        """Find an encoding type."""
-        if enc:
-            # Use encoding if found. Else use chardet apparent encoding
-            encoding = enc.group('enc').strip('"\' ').lower()
-            # Convert to python correct encoding names
-            if re.sub(r'[ _-]', '', encoding) == 'xeucjp':
-                encoding = 'euc_jp'
-            else:
-                # fix cp encodings (T304830)
-                encoding = re.sub(r'\Acp[ _-](\d{3,4})', r'cp\1', encoding)
-            return encoding
-        return None
-
     def treat(self, page) -> None:
         """Process one page."""
         # Load the page's text from the wiki
@@ -664,14 +646,12 @@
             linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)

             meta_content = self.META_CONTENT.search(linkedpagetext)
-            s = None
+            encoding = None
             if content_type:
-                # use charset from http header
-                s = self.CHARSET.search(content_type)
+                encoding = get_charset_from_content_type(content_type)

             if meta_content:
                 tag = None
-                encoding = self.charset(s)
                 encodings = [encoding] if encoding else []
                 encodings += list(page.site.encodings())
                 for enc in encodings:
@@ -679,14 +659,12 @@
                         tag = meta_content.group().decode(enc)
                         break

-                # Prefer the contentType from the HTTP header :
+                # Prefer the content-type from the HTTP header
                 if not content_type and tag:
                     content_type = tag
-                if not s:
-                    # use charset from html
-                    s = self.CHARSET.search(tag)
+                if not encoding:
+                    encoding = get_charset_from_content_type(tag)

-            encoding = self.charset(s)
             if encoding:
                 r.encoding = encoding
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 384bd91..491eea5 100755
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -403,6 +403,20 @@
         resp.encoding = http._decide_encoding(resp, charset)
         self.assertEqual('latin1', resp.encoding)

+    def test_charset_not_last(self):
+        """Test charset not last part of content-type header."""
+        charset = None
+        resp = CharsetTestCase._create_response(
+            headers={
+                'content-type': (
+                    'text/html; charset=utf-8; profile='
+                    '"https://www.mediawiki.org/wiki/Specs/HTML/2.4.0"'
+                )
+            },
+            data=CharsetTestCase.UTF8_BYTES)
+        resp.encoding = http._decide_encoding(resp, charset)
+        self.assertEqual('utf-8', resp.encoding)
+
     def test_server_charset(self):
         """Test decoding with server explicit charset."""
         charset = None
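Taken together, the reflinks changes above reduce the script's charset handling to a simple fallback order: use the charset from the HTTP content-type header when one is present, otherwise decode the HTML meta tag with the wiki's known encodings and read the charset from there. A condensed sketch of that order, assuming pywikibot 7.3 or later; the function name pick_encoding and its parameters are invented here and do not exist in the script:

import re
from typing import Iterable, Optional

from pywikibot.comms.http import get_charset_from_content_type

# same pattern reflinks uses to find the content-type/charset meta tag
META_CONTENT = re.compile(br'(?i)<meta[^>]*(?:content-type|charset)[^>]*>')


def pick_encoding(content_type: Optional[str], html: bytes,
                  site_encodings: Iterable[str]) -> Optional[str]:
    """Charset from the HTTP header, else from the HTML meta tag."""
    # 1. prefer the charset given in the HTTP content-type header
    if content_type:
        encoding = get_charset_from_content_type(content_type)
        if encoding:
            return encoding

    # 2. otherwise find the meta tag; it has to be decoded with one of
    #    the wiki's known encodings before its charset can be read
    meta = META_CONTENT.search(html)
    if meta:
        for enc in site_encodings:
            try:
                tag = meta.group().decode(enc)
            except UnicodeDecodeError:
                continue
            return get_charset_from_content_type(tag)
    return None

For example, pick_encoding('text/html; charset=ISO-8859-1', b'', ['utf-8']) would give 'iso-8859-1', while pick_encoding(None, b'<meta charset="utf-8">', ['utf-8']) would give 'utf-8'.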