jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/637696 )
Change subject: [bugfix] use chardet to find a valid encoding
......................................................................
[bugfix] use chardet to find a valid encoding
- set requests.Response.encoding if an encoding was found;
set it to None otherwise so that the chardet package is used
- update encoding conversion
- remove all ugly hacks
Bug: T266862
Change-Id: I6fa147308cc12a7d4e6c04e1535f74114a5ea8c1
---
M scripts/reflinks.py
1 file changed, 7 insertions(+), 44 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 5cc0914..66666b4 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -624,7 +624,6 @@
linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)
meta_content = self.META_CONTENT.search(linkedpagetext)
- enc = []
s = None
if content_type:
# use charset from http header
@@ -638,20 +637,15 @@
# use charset from html
s = self.CHARSET.search(tag)
if s:
- tmp = s.group('enc').strip('"\' ').lower()
- naked = re.sub(r'[ _\-]', '', tmp)
+ encoding = s.group('enc').strip('"\' ').lower()
+ naked = re.sub(r'[ _\-]', '', encoding)
# Convert to python correct encoding names
- if naked == 'gb2312':
- enc.append('gbk')
- elif naked == 'shiftjis':
- enc.append('shift jis 2004')
- enc.append('cp932')
- elif naked == 'xeucjp':
- enc.append('euc-jp')
- else:
- enc.append(tmp)
+ if naked == 'xeucjp':
+ encoding = 'euc_jp'
+ f.data.encoding = encoding
else:
pywikibot.output('No charset found for ' + ref.link)
+ f.data.encoding = None
if not content_type:
pywikibot.output('No content-type found for ' + ref.link)
@@ -665,30 +659,7 @@
new_text = new_text.replace(match.group(), repl)
return
- # Ugly hacks to try to survive when both server and page
- # return no encoding.
- # Uses most used encodings for each national suffix
- if '.ru' in ref.link or '.su' in ref.link:
- # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
- # encoding, no page encoding
- enc = enc + ['koi8-r', 'windows-1251']
- elif '.jp' in ref.link:
- enc.append('shift jis 2004')
- enc.append('cp932')
- elif '.kr' in ref.link:
- enc.append('euc-kr')
- enc.append('cp949')
- elif '.zh' in ref.link:
- enc.append('gbk')
-
- if 'utf-8' not in enc:
- enc.append('utf-8')
- try:
- u = linkedpagetext.decode(enc[0]) # Bug T69410
- except (UnicodeDecodeError, LookupError) as e:
- pywikibot.output('{} : Decoding error - {}'
- .format(ref.link, e))
- return
+ u = f.data.text
# Retrieves the first non empty string inside <title> tags
for m in self.TITLE.finditer(u):
@@ -705,14 +676,6 @@
pywikibot.output('{} : No title found...'.format(ref.link))
return
- # XXX Ugly hack
- if 'é' in ref.title:
- repl = ref.refLink()
- new_text = new_text.replace(match.group(), repl)
- pywikibot.output('{} : Hybrid encoding...'
- .format(ref.link))
- return
-
if self.titleBlackList.match(ref.title):
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/637696
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I6fa147308cc12a7d4e6c04e1535f74114a5ea8c1
Gerrit-Change-Number: 637696
Gerrit-PatchSet: 5
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Rubin <rubin(a)wikimedia.ru>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged