[Gerrit] pywikibot/core[master]: [IMPR] Fix html2unicode - Pywikibot-commits

13 Feb 2017

jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/337290 )
Change subject: [IMPR] Fix html2unicode
......................................................................
[IMPR] Fix html2unicode
- Fix ignoring of convertIllegalHtmlEntities. Previously, ignoring any
  entity inside range(128, 160) would not work.
- Do not attempt to resolve protected entities (note that this didn't
  work when '&' was protected in a different way).
- Add a lot of tests.
Bug: T130925
Change-Id: Ibb115606920cf945b4bdf366fc0b415028aff050
---
M pywikibot/page.py
M tests/page_tests.py
2 files changed, 23 insertions(+), 6 deletions(-)
Approvals:
  jenkins-bot: Verified
  Xqt: Looks good to me, approved

diff --git a/pywikibot/page.py b/pywikibot/page.py
index b77ba4a..c401000 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -5535,7 +5535,7 @@
     # This regular expression will match any decimal and hexadecimal entity and
     # also entities that might be named entities.
     entityR = re.compile(
-        r'&(?:amp;)?(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
+        r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
     # These characters are Html-illegal, but sadly you *can* find some of
     # these and converting them to chr(decimal) is unsuitable
     convertIllegalHtmlEntities = {
@@ -5569,7 +5569,8 @@
     }
     # ensuring that illegal &#129; &#141; and &#157, which have no known values,
     # don't get converted to chr(129), chr(141) or chr(157)
-    ignore = set(ignore) | set([129, 141, 157])
+    ignore = (set(map(lambda x: convertIllegalHtmlEntities.get(x, x), ignore)) |
+              set([129, 141, 157]))
def handle_entity(match):
         if match.group('decimal'):
@@ -5583,10 +5584,10 @@
                 unicodeCodepoint = htmlentitydefs.name2codepoint[name]
             else:
                 unicodeCodepoint = False
-        try:
+
+        if unicodeCodepoint in convertIllegalHtmlEntities:
             unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
-        except KeyError:
-            pass
+
         if unicodeCodepoint and unicodeCodepoint not in ignore:
             if unicodeCodepoint > sys.maxunicode:
                 # solve narrow Python 2 build exception (UTF-16)
diff --git a/tests/page_tests.py b/tests/page_tests.py
index deb0274..fd7b63f 100644
--- a/tests/page_tests.py
+++ b/tests/page_tests.py
@@ -1020,14 +1020,30 @@
net = False
+    def test_no_entities(self):
+        """Test that text is left unchanged."""
+        self.assertEqual(pywikibot.page.html2unicode('foobar'), 'foobar')
+        self.assertEqual(pywikibot.page.html2unicode(' '), ' ')
+
     def test_valid_entities(self):
         """Test valid entities."""
         self.assertEqual(pywikibot.page.html2unicode('A&amp;O'), 'A&O')
         self.assertEqual(pywikibot.page.html2unicode('&#x70;&#x79;'), 'py')
         self.assertEqual(pywikibot.page.html2unicode('&#x10000;'), u'\U00010000')
         self.assertEqual(pywikibot.page.html2unicode('&#x70;&amp;&#x79;'), 'p&y')
+        self.assertEqual(pywikibot.page.html2unicode('&#128;'), '€')
-    @unittest.expectedFailure
+    def test_ignore_entities(self):
+        """Test ignore entities."""
+        self.assertEqual(pywikibot.page.html2unicode('A&amp;O', [38]), 'A&amp;O')
+        self.assertEqual(pywikibot.page.html2unicode('A&#38;O', [38]), 'A&#38;O')
+        self.assertEqual(pywikibot.page.html2unicode('A&#x26;O', [38]), 'A&#x26;O')
+        self.assertEqual(pywikibot.page.html2unicode('A&amp;O', [37]), 'A&O')
+        self.assertEqual(pywikibot.page.html2unicode('&#128;', [128]), '&#128;')
+        self.assertEqual(pywikibot.page.html2unicode('&#128;', [8364]), '&#128;')
+        self.assertEqual(pywikibot.page.html2unicode('&#129;&#141;&#157'),
+                         '&#129;&#141;&#157')
+
     def test_recursive_entities(self):
         """Test recursive entities."""
         self.assertEqual(pywikibot.page.html2unicode('A&amp;amp;O'), 'A&amp;O')
-- 
To view, visit https://gerrit.wikimedia.org/r/337290
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ibb115606920cf945b4bdf366fc0b415028aff050
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Matěj Suchánek matejsuchanek97@gmail.com
Gerrit-Reviewer: Dalba dalba.wiki@gmail.com
Gerrit-Reviewer: Magul tomasz.magulski@gmail.com
Gerrit-Reviewer: Mpaa mpaa.wiki@gmail.com
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: Zhuyifei1999 zhuyifei1999@gmail.com
Gerrit-Reviewer: jenkins-bot <>