[Gerrit] Support list of encodings for url2unicode. - change (pywikibot/core) - Pywikibot-commits

9 Aug 2014

jenkins-bot has submitted this change and it was merged.
Change subject: Support list of encodings for url2unicode.
......................................................................
Support list of encodings for url2unicode.
Decouples url2unicode from Site objects, so it
may be used in code where a Site is not relevant.
Change-Id: I9ca2a933d227afa79de8ce402304592682785d17
---
M pywikibot/page.py
M scripts/cosmetic_changes.py
2 files changed, 31 insertions(+), 18 deletions(-)
Approvals:
  John Vandenberg: Looks good to me, but someone else must approve
  Legoktm: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/pywikibot/page.py b/pywikibot/page.py
index 7a0d1bd..623fb29 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -3546,12 +3546,14 @@
         else:
             self._anchor = None
+        # Convert URL-encoded characters to unicode
+        encodings = [self._source.encoding()] + list(self._source.encodings())
+
+        self._text = url2unicode(self._text, encodings=encodings)
+
         # Clean up the name, it can come from anywhere.
         # Convert HTML entities to unicode
         t = html2unicode(self._text)
-
-        # Convert URL-encoded characters to unicode
-        t = url2unicode(t, site=self._source)
         # Normalize unicode string to a NFC (composed) format to allow
         # proper string comparisons. According to
@@ -4040,21 +4042,31 @@
     return x
-def url2unicode(title, site, site2=None):
-    """Convert URL-encoded text to unicode using site's encoding.
-
-    If site2 is provided, try its encodings as well.  Uses the first encoding
-    that doesn't cause an error.
-
+@deprecate_arg('site2', None)
+@deprecate_arg('site', 'encodings')
+def url2unicode(title, encodings='utf-8'):
     """
-    # create a list of all possible encodings for both hint sites
-    encList = [site.encoding()] + list(site.encodings())
-    if site2 and site2 != site:
-        encList.append(site2.encoding())
-        encList += list(site2.encodings())
+    Convert URL-encoded text to unicode using several encodings.
+
+    Uses the first encoding that doesn't cause an error.
+
+    @param title: URL-encoded character data to convert
+    @type title: str
+    @param encodings: Encodings to attempt to use during conversion.
+    @type encodings: str, list or Site
+    @return: unicode
+
+    @exception UnicodeError: Could not convert using any encoding.
+    """
+    if isinstance(encodings, basestring):
+        encodings = [encodings]
+    elif isinstance(encodings, pywikibot.site.BaseSite):
+        # create a list of all possible encodings for both hint sites
+        site = encodings
+        encodings = [site.encoding()] + list(site.encodings())
+
     firstException = None
-    # try to handle all encodings (will probably retry utf-8)
-    for enc in encList:
+    for enc in encodings:
         try:
             t = title.encode(enc)
             t = unquote_to_bytes(t)
diff --git a/scripts/cosmetic_changes.py b/scripts/cosmetic_changes.py
index 34a2d4f..dcac72c 100755
--- a/scripts/cosmetic_changes.py
+++ b/scripts/cosmetic_changes.py
@@ -78,6 +78,7 @@
 import pywikibot
 import isbn
 from pywikibot import config, i18n, pagegenerators, Bot
+from pywikibot.page import url2unicode
 warning = """
 ATTENTION: You can run this script as a stand-alone for testing purposes.
@@ -427,8 +428,8 @@
                                              titleLength)
                     # Convert URL-encoded characters to unicode
-                    titleWithSection = pywikibot.url2unicode(titleWithSection,
-                                                             site=self.site)
+                    titleWithSection = url2unicode(titleWithSection,
+                                                   encodings=self.site)
                     if titleWithSection == '':
                         # just skip empty links.
-- 
To view, visit https://gerrit.wikimedia.org/r/150869
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I9ca2a933d227afa79de8ce402304592682785d17
Gerrit-PatchSet: 2
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: Ladsgroup ladsgroup@gmail.com
Gerrit-Reviewer: Legoktm legoktm.wikipedia@gmail.com
Gerrit-Reviewer: Merlijn van Deen valhallasw@arctus.nl
Gerrit-Reviewer: jenkins-bot <>