[Gerrit] [IMPROV] page: re.sub instead of manual iteration - change (pywikibot/core) - Pywikibot-commits

18 Mar 2015

jenkins-bot has submitted this change and it was merged.
Change subject: [IMPROV] page: re.sub instead of manual iteration
......................................................................
[IMPROV] page: re.sub instead of manual iteration
The re.sub method accepts a callable as the replacement, which lets it
substitute each match automatically without manually stitching the text
together.
Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a
---
M pywikibot/page.py
M tests/page_tests.py
2 files changed, 33 insertions(+), 34 deletions(-)
Approvals:
  Mpaa: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/pywikibot/page.py b/pywikibot/page.py
index f38dc5c..8f9781b 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -4814,42 +4814,33 @@
     # ensuring that illegal &#129; &#141; and &#157, which have no known values,
     # don't get converted to chr(129), chr(141) or chr(157)
     ignore = set(ignore) | set([129, 141, 157])
-    result = u''
-    i = 0
-    found = True
-    while found:
-        text = text[i:]
-        match = entityR.search(text)
-        if match:
-            unicodeCodepoint = None
-            if match.group('decimal'):
-                unicodeCodepoint = int(match.group('decimal'))
-            elif match.group('hex'):
-                unicodeCodepoint = int(match.group('hex'), 16)
-            elif match.group('name'):
-                name = match.group('name')
-                if name in htmlentitydefs.name2codepoint:
-                    # We found a known HTML entity.
-                    unicodeCodepoint = htmlentitydefs.name2codepoint[name]
-            result += text[:match.start()]
-            try:
-                unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
-            except KeyError:
-                pass
-            if unicodeCodepoint and unicodeCodepoint not in ignore:
-                if unicodeCodepoint > sys.maxunicode:
-                    # solve narrow Python 2 build exception (UTF-16)
-                    result += eval(r"u'\U{:08x}'".format(unicodeCodepoint))
-                else:
-                    result += chr(unicodeCodepoint)
+
+    def handle_entity(match):
+        if match.group('decimal'):
+            unicodeCodepoint = int(match.group('decimal'))
+        elif match.group('hex'):
+            unicodeCodepoint = int(match.group('hex'), 16)
+        elif match.group('name'):
+            name = match.group('name')
+            if name in htmlentitydefs.name2codepoint:
+                # We found a known HTML entity.
+                unicodeCodepoint = htmlentitydefs.name2codepoint[name]
             else:
-                # Leave the entity unchanged
-                result += text[match.start():match.end()]
-            i = match.end()
+                unicodeCodepoint = False
+        try:
+            unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
+        except KeyError:
+            pass
+        if unicodeCodepoint and unicodeCodepoint not in ignore:
+            if unicodeCodepoint > sys.maxunicode:
+                # solve narrow Python 2 build exception (UTF-16)
+                return eval(r"u'\U{:08x}'".format(unicodeCodepoint))
+            else:
+                return chr(unicodeCodepoint)
         else:
-            result += text
-            found = False
-    return result
+            # Leave the entity unchanged
+            return match.group(0)
+    return entityR.sub(handle_entity, text)
def UnicodeToAsciiHtml(s):
diff --git a/tests/page_tests.py b/tests/page_tests.py
index bc77765..f2ab434 100644
--- a/tests/page_tests.py
+++ b/tests/page_tests.py
@@ -796,12 +796,20 @@
         self.assertEqual(pywikibot.page.html2unicode('A&amp;O'), 'A&O')
         self.assertEqual(pywikibot.page.html2unicode('&#x70;&#x79;'), 'py')
         self.assertEqual(pywikibot.page.html2unicode('&#x10000;'), u'\U00010000')
+        self.assertEqual(pywikibot.page.html2unicode('&#x70;&amp;&#x79;'), 'p&y')
@unittest.expectedFailure
     def test_recursive_entities(self):
         """Test recursive entities."""
         self.assertEqual(pywikibot.page.html2unicode('A&amp;amp;O'), 'A&amp;O')
+    def test_invalid_entities(self):
+        """Test texts with invalid entities."""
+        self.assertEqual(pywikibot.page.html2unicode('A&notaname;O'), 'A&notaname;O')
+        self.assertEqual(pywikibot.page.html2unicode('A&#7f;O'), 'A&#7f;O')
+        self.assertEqual(pywikibot.page.html2unicode('&#7f'), '&#7f')
+        self.assertEqual(pywikibot.page.html2unicode('&#x70&#x79;'), '&#x70y')
+
if __name__ == '__main__':
     try:
-- 
To view, visit https://gerrit.wikimedia.org/r/196424
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise CommodoreFabianus@gmx.de
Gerrit-Reviewer: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: Ladsgroup ladsgroup@gmail.com
Gerrit-Reviewer: Merlijn van Deen valhallasw@arctus.nl
Gerrit-Reviewer: Mpaa mpaa.wiki@gmail.com
Gerrit-Reviewer: Ricordisamoa ricordisamoa@openmailbox.org
Gerrit-Reviewer: jenkins-bot <>