jenkins-bot submitted this change.

View Change

Approvals: Rubin: Looks good to me, but someone else must approve; Xqt: Looks good to me, approved; jenkins-bot: Verified
[bugfix] Solve UnicodeDecodeError in reflinks.py

- add a new staticmethod 'charset' to extract a charset
- try to extract a charset from meta content
- try any charset including site.encodings to decode the meta content

Bug: T304288
Change-Id: I3d9b534197fedb4f0a4fa1c0980be5d9369e319e
---
M scripts/reflinks.py
1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 6e7aba8..b15f2e5 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -42,7 +42,7 @@

&params;
"""
-# (C) Pywikibot team, 2008-2021
+# (C) Pywikibot team, 2008-2022
#
# Distributed under the terms of the MIT license.
#
@@ -58,10 +58,11 @@
from functools import partial
from http import HTTPStatus
from textwrap import shorten
+from typing import Optional

import pywikibot
from pywikibot import comms, config, i18n, pagegenerators, textlib
-from pywikibot.backports import removeprefix
+from pywikibot.backports import Match, removeprefix
from pywikibot.bot import (
ConfigParserBot,
ExistingPageBot,
@@ -553,6 +554,19 @@
return True
return super().skip_page(page)

+ @staticmethod
+ def charset(enc: Match) -> Optional[str]:
+ """Find an encoding type."""
+ if enc:
+ # Use encoding if found. Else use chardet apparent encoding
+ encoding = enc.group('enc').strip('"\' ').lower()
+ naked = re.sub(r'[ _\-]', '', encoding)
+ # Convert to python correct encoding names
+ if naked == 'xeucjp':
+ encoding = 'euc_jp'
+ return encoding
+ return None
+
def treat(self, page) -> None:
"""Process one page."""
# Load the page's text from the wiki
@@ -659,21 +673,26 @@
if content_type:
# use charset from http header
s = self.CHARSET.search(content_type)
+
if meta_content:
- tag = meta_content.group().decode()
+ tag = None
+ encoding = self.charset(s)
+ encodings = [encoding] if encoding else []
+ encodings += list(page.site.encodings())
+ for enc in encodings:
+ with suppress(UnicodeDecodeError):
+ tag = meta_content.group().decode(enc)
+ break
+
# Prefer the contentType from the HTTP header :
- if not content_type:
+ if not content_type and tag:
content_type = tag
if not s:
# use charset from html
s = self.CHARSET.search(tag)
- if s:
- # Use encoding if found. Else use chardet apparent encoding
- encoding = s.group('enc').strip('"\' ').lower()
- naked = re.sub(r'[ _\-]', '', encoding)
- # Convert to python correct encoding names
- if naked == 'xeucjp':
- encoding = 'euc_jp'
+
+ encoding = self.charset(s)
+ if encoding:
r.encoding = encoding

if not content_type:

To view, visit change 772400. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I3d9b534197fedb4f0a4fa1c0980be5d9369e319e
Gerrit-Change-Number: 772400
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Rubin <rubin.happy@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged