jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/337290 )
Change subject: [IMPR] Fix html2unicode ......................................................................
[IMPR] Fix html2unicode
- Fix ignoring of convertIllegalHtmlEntities. Previously, ignoring any entity inside range(128, 160) would not work. - Do not attempt to resolve protected entities (note that this didn't work when '&' was protected in a different way). - Add a lot of tests.
Bug: T130925 Change-Id: Ibb115606920cf945b4bdf366fc0b415028aff050 --- M pywikibot/page.py M tests/page_tests.py 2 files changed, 23 insertions(+), 6 deletions(-)
Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
diff --git a/pywikibot/page.py b/pywikibot/page.py index b77ba4a..c401000 100644 --- a/pywikibot/page.py +++ b/pywikibot/page.py @@ -5535,7 +5535,7 @@ # This regular expression will match any decimal and hexadecimal entity and # also entities that might be named entities. entityR = re.compile( - r'&(?:amp;)?(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));') + r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));') # These characters are Html-illegal, but sadly you *can* find some of # these and converting them to chr(decimal) is unsuitable convertIllegalHtmlEntities = { @@ -5569,7 +5569,8 @@ } # ensuring that illegal &#129;, &#141; and &#157;, which have no known values, # don't get converted to chr(129), chr(141) or chr(157) - ignore = set(ignore) | set([129, 141, 157]) + ignore = (set(map(lambda x: convertIllegalHtmlEntities.get(x, x), ignore)) | + set([129, 141, 157]))
def handle_entity(match): if match.group('decimal'): @@ -5583,10 +5584,10 @@ unicodeCodepoint = htmlentitydefs.name2codepoint[name] else: unicodeCodepoint = False - try: + + if unicodeCodepoint in convertIllegalHtmlEntities: unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint] - except KeyError: - pass + if unicodeCodepoint and unicodeCodepoint not in ignore: if unicodeCodepoint > sys.maxunicode: # solve narrow Python 2 build exception (UTF-16) diff --git a/tests/page_tests.py b/tests/page_tests.py index deb0274..fd7b63f 100644 --- a/tests/page_tests.py +++ b/tests/page_tests.py @@ -1020,14 +1020,30 @@
net = False
+ def test_no_entities(self): + """Test that text is left unchanged.""" + self.assertEqual(pywikibot.page.html2unicode('foobar'), 'foobar') + self.assertEqual(pywikibot.page.html2unicode(' '), ' ') + def test_valid_entities(self): """Test valid entities.""" self.assertEqual(pywikibot.page.html2unicode('A&O'), 'A&O') self.assertEqual(pywikibot.page.html2unicode('py'), 'py') self.assertEqual(pywikibot.page.html2unicode('𐀀'), u'\U00010000') self.assertEqual(pywikibot.page.html2unicode('p&y'), 'p&y') + self.assertEqual(pywikibot.page.html2unicode('€'), '€')
- @unittest.expectedFailure + def test_ignore_entities(self): + """Test ignore entities.""" + self.assertEqual(pywikibot.page.html2unicode('A&O', [38]), 'A&O') + self.assertEqual(pywikibot.page.html2unicode('A&O', [38]), 'A&O') + self.assertEqual(pywikibot.page.html2unicode('A&O', [38]), 'A&O') + self.assertEqual(pywikibot.page.html2unicode('A&O', [37]), 'A&O') + self.assertEqual(pywikibot.page.html2unicode('€', [128]), '€') + self.assertEqual(pywikibot.page.html2unicode('€', [8364]), '€') + self.assertEqual(pywikibot.page.html2unicode(''), + '') + def test_recursive_entities(self): """Test recursive entities.""" self.assertEqual(pywikibot.page.html2unicode('A&amp;O'), 'A&O')