jenkins-bot submitted this change.

View Change

Approvals:
  Rubin: Looks good to me, but someone else must approve
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

[bugfix] Solve UnicodeDecodeError in reflinks.py

- add a new staticmethod 'charset' to extract a charset
- try to extract a charset from meta content
- try any charset including site.encodings to decode  the meta content

Bug: T304288
Change-Id: I3d9b534197fedb4f0a4fa1c0980be5d9369e319e
---
M scripts/reflinks.py
1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 6e7aba8..b15f2e5 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -42,7 +42,7 @@
 
 &params;
 """
-# (C) Pywikibot team, 2008-2021
+# (C) Pywikibot team, 2008-2022
 #
 # Distributed under the terms of the MIT license.
 #
@@ -58,10 +58,11 @@
 from functools import partial
 from http import HTTPStatus
 from textwrap import shorten
+from typing import Optional
 
 import pywikibot
 from pywikibot import comms, config, i18n, pagegenerators, textlib
-from pywikibot.backports import removeprefix
+from pywikibot.backports import Match, removeprefix
 from pywikibot.bot import (
     ConfigParserBot,
     ExistingPageBot,
@@ -553,6 +554,19 @@
             return True
         return super().skip_page(page)
 
+    @staticmethod
+    def charset(enc: Match) -> Optional[str]:
+        """Find an encoding type."""
+        if enc:
+            # Use encoding if found. Else use chardet apparent encoding
+            encoding = enc.group('enc').strip('"\' ').lower()
+            naked = re.sub(r'[ _\-]', '', encoding)
+            # Convert to python correct encoding names
+            if naked == 'xeucjp':
+                encoding = 'euc_jp'
+            return encoding
+        return None
+
     def treat(self, page) -> None:
         """Process one page."""
         # Load the page's text from the wiki
@@ -659,21 +673,26 @@
             if content_type:
                 # use charset from http header
                 s = self.CHARSET.search(content_type)
+
             if meta_content:
-                tag = meta_content.group().decode()
+                tag = None
+                encoding = self.charset(s)
+                encodings = [encoding] if encoding else []
+                encodings += list(page.site.encodings())
+                for enc in encodings:
+                    with suppress(UnicodeDecodeError):
+                        tag = meta_content.group().decode(enc)
+                        break
+
                 # Prefer the contentType from the HTTP header :
-                if not content_type:
+                if not content_type and tag:
                     content_type = tag
                 if not s:
                     # use charset from html
                     s = self.CHARSET.search(tag)
-            if s:
-                # Use encoding if found. Else use chardet apparent encoding
-                encoding = s.group('enc').strip('"\' ').lower()
-                naked = re.sub(r'[ _\-]', '', encoding)
-                # Convert to python correct encoding names
-                if naked == 'xeucjp':
-                    encoding = 'euc_jp'
+
+            encoding = self.charset(s)
+            if encoding:
                 r.encoding = encoding
 
             if not content_type:

To view, visit change 772400. To unsubscribe, or for help writing mail filters, visit settings.