jenkins-bot has submitted this change and it was merged.
Change subject: Support list of encodings for url2unicode.
......................................................................
Support list of encodings for url2unicode.
Decouples url2unicode from Site objects, so it
may be used in code where a Site is not relevant.
Change-Id: I9ca2a933d227afa79de8ce402304592682785d17
---
M pywikibot/page.py
M scripts/cosmetic_changes.py
2 files changed, 31 insertions(+), 18 deletions(-)
Approvals:
John Vandenberg: Looks good to me, but someone else must approve
Legoktm: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py
index 7a0d1bd..623fb29 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -3546,12 +3546,14 @@
else:
self._anchor = None
+ # Convert URL-encoded characters to unicode
+ encodings = [self._source.encoding()] + list(self._source.encodings())
+
+ self._text = url2unicode(self._text, encodings=encodings)
+
# Clean up the name, it can come from anywhere.
# Convert HTML entities to unicode
t = html2unicode(self._text)
-
- # Convert URL-encoded characters to unicode
- t = url2unicode(t, site=self._source)
# Normalize unicode string to a NFC (composed) format to allow
# proper string comparisons. According to
@@ -4040,21 +4042,31 @@
return x
-def url2unicode(title, site, site2=None):
- """Convert URL-encoded text to unicode using site's encoding.
-
- If site2 is provided, try its encodings as well. Uses the first encoding
- that doesn't cause an error.
-
+@deprecate_arg('site2', None)
+@deprecate_arg('site', 'encodings')
+def url2unicode(title, encodings='utf-8'):
"""
- # create a list of all possible encodings for both hint sites
- encList = [site.encoding()] + list(site.encodings())
- if site2 and site2 != site:
- encList.append(site2.encoding())
- encList += list(site2.encodings())
+ Convert URL-encoded text to unicode using several encodings.
+
+ Uses the first encoding that doesn't cause an error.
+
+ @param title: URL-encoded character data to convert
+ @type title: str
+ @param encodings: Encodings to attempt to use during conversion.
+ @type encodings: str, list or Site
+ @return: unicode
+
+ @exception UnicodeError: Could not convert using any encoding.
+ """
+ if isinstance(encodings, basestring):
+ encodings = [encodings]
+ elif isinstance(encodings, pywikibot.site.BaseSite):
+ # create a list of all possible encodings for the given site
+ site = encodings
+ encodings = [site.encoding()] + list(site.encodings())
+
firstException = None
- # try to handle all encodings (will probably retry utf-8)
- for enc in encList:
+ for enc in encodings:
try:
t = title.encode(enc)
t = unquote_to_bytes(t)
diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py
index 34a2d4f..dcac72c 100755
--- a/scripts/cosmetic_changes.py
+++ b/scripts/cosmetic_changes.py
@@ -78,6 +78,7 @@
import pywikibot
import isbn
from pywikibot import config, i18n, pagegenerators, Bot
+from pywikibot.page import url2unicode
warning = """
ATTENTION: You can run this script as a stand-alone for testing purposes.
@@ -427,8 +428,8 @@
titleLength)
# Convert URL-encoded characters to unicode
- titleWithSection = pywikibot.url2unicode(titleWithSection,
- site=self.site)
+ titleWithSection = url2unicode(titleWithSection,
+ encodings=self.site)
if titleWithSection == '':
# just skip empty links.
--
To view, visit
https://gerrit.wikimedia.org/r/150869
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9ca2a933d227afa79de8ce402304592682785d17
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot <>