jenkins-bot has submitted this change and it was merged.
Change subject: Support list of encodings for url2unicode.
......................................................................
Support list of encodings for url2unicode.
Decouples url2unicode from Site objects, so it may be used in code where a Site is not relevant.
Change-Id: I9ca2a933d227afa79de8ce402304592682785d17
---
M pywikibot/page.py
M scripts/cosmetic_changes.py
2 files changed, 31 insertions(+), 18 deletions(-)
Approvals:
  John Vandenberg: Looks good to me, but someone else must approve
  Legoktm: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py index 7a0d1bd..623fb29 100644 --- a/pywikibot/page.py +++ b/pywikibot/page.py @@ -3546,12 +3546,14 @@ else: self._anchor = None
+ # Convert URL-encoded characters to unicode + encodings = [self._source.encoding()] + list(self._source.encodings()) + + self._text = url2unicode(self._text, encodings=encodings) + # Clean up the name, it can come from anywhere. # Convert HTML entities to unicode t = html2unicode(self._text) - - # Convert URL-encoded characters to unicode - t = url2unicode(t, site=self._source)
# Normalize unicode string to a NFC (composed) format to allow # proper string comparisons. According to @@ -4040,21 +4042,31 @@ return x
-def url2unicode(title, site, site2=None): - """Convert URL-encoded text to unicode using site's encoding. - - If site2 is provided, try its encodings as well. Uses the first encoding - that doesn't cause an error. - +@deprecate_arg('site2', None) +@deprecate_arg('site', 'encodings') +def url2unicode(title, encodings='utf-8'): """ - # create a list of all possible encodings for both hint sites - encList = [site.encoding()] + list(site.encodings()) - if site2 and site2 != site: - encList.append(site2.encoding()) - encList += list(site2.encodings()) + Convert URL-encoded text to unicode using several encoding. + + Uses the first encoding that doesn't cause an error. + + @param data: URL-encoded character data to convert + @type data: str + @param encodings: Encodings to attempt to use during conversion. + @type encodings: str, list or Site + @return: unicode + + @exception UnicodeError: Could not convert using any encoding. + """ + if isinstance(encodings, basestring): + encodings = [encodings] + elif isinstance(encodings, pywikibot.site.BaseSite): + # create a list of all possible encodings for both hint sites + site = encodings + encodings = [site.encoding()] + list(site.encodings()) + firstException = None - # try to handle all encodings (will probably retry utf-8) - for enc in encList: + for enc in encodings: try: t = title.encode(enc) t = unquote_to_bytes(t) diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py index 34a2d4f..dcac72c 100755 --- a/scripts/cosmetic_changes.py +++ b/scripts/cosmetic_changes.py @@ -78,6 +78,7 @@ import pywikibot import isbn from pywikibot import config, i18n, pagegenerators, Bot +from pywikibot.page import url2unicode
warning = """ ATTENTION: You can run this script as a stand-alone for testing purposes. @@ -427,8 +428,8 @@ titleLength)
# Convert URL-encoded characters to unicode - titleWithSection = pywikibot.url2unicode(titleWithSection, - site=self.site) + titleWithSection = url2unicode(titleWithSection, + encodings=self.site)
if titleWithSection == '': # just skip empty links.