jenkins-bot has submitted this change and it was merged.
Change subject: [IMPROV] page: re.sub instead of manual iteration ......................................................................
[IMPROV] page: re.sub instead of manual iteration
The re.sub method accepts a callable as the replacement, which lets it substitute every match automatically without manually stitching the text together.
Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a --- M pywikibot/page.py M tests/page_tests.py 2 files changed, 33 insertions(+), 34 deletions(-)
Approvals: Mpaa: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py index f38dc5c..8f9781b 100644 --- a/pywikibot/page.py +++ b/pywikibot/page.py @@ -4814,42 +4814,33 @@ # ensuring that illegal &#129; &#141; and &#157;, which have no known values, # don't get converted to chr(129), chr(141) or chr(157) ignore = set(ignore) | set([129, 141, 157]) - result = u'' - i = 0 - found = True - while found: - text = text[i:] - match = entityR.search(text) - if match: - unicodeCodepoint = None - if match.group('decimal'): - unicodeCodepoint = int(match.group('decimal')) - elif match.group('hex'): - unicodeCodepoint = int(match.group('hex'), 16) - elif match.group('name'): - name = match.group('name') - if name in htmlentitydefs.name2codepoint: - # We found a known HTML entity. - unicodeCodepoint = htmlentitydefs.name2codepoint[name] - result += text[:match.start()] - try: - unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint] - except KeyError: - pass - if unicodeCodepoint and unicodeCodepoint not in ignore: - if unicodeCodepoint > sys.maxunicode: - # solve narrow Python 2 build exception (UTF-16) - result += eval(r"u'\U{:08x}'".format(unicodeCodepoint)) - else: - result += chr(unicodeCodepoint) + + def handle_entity(match): + if match.group('decimal'): + unicodeCodepoint = int(match.group('decimal')) + elif match.group('hex'): + unicodeCodepoint = int(match.group('hex'), 16) + elif match.group('name'): + name = match.group('name') + if name in htmlentitydefs.name2codepoint: + # We found a known HTML entity. 
+ unicodeCodepoint = htmlentitydefs.name2codepoint[name] else: - # Leave the entity unchanged - result += text[match.start():match.end()] - i = match.end() + unicodeCodepoint = False + try: + unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint] + except KeyError: + pass + if unicodeCodepoint and unicodeCodepoint not in ignore: + if unicodeCodepoint > sys.maxunicode: + # solve narrow Python 2 build exception (UTF-16) + return eval(r"u'\U{:08x}'".format(unicodeCodepoint)) + else: + return chr(unicodeCodepoint) else: - result += text - found = False - return result + # Leave the entity unchanged + return match.group(0) + return entityR.sub(handle_entity, text)
def UnicodeToAsciiHtml(s): diff --git a/tests/page_tests.py b/tests/page_tests.py index bc77765..f2ab434 100644 --- a/tests/page_tests.py +++ b/tests/page_tests.py @@ -796,12 +796,20 @@ self.assertEqual(pywikibot.page.html2unicode('A&O'), 'A&O') self.assertEqual(pywikibot.page.html2unicode('py'), 'py') self.assertEqual(pywikibot.page.html2unicode('𐀀'), u'\U00010000') + self.assertEqual(pywikibot.page.html2unicode('p&y'), 'p&y')
@unittest.expectedFailure def test_recursive_entities(self): """Test recursive entities.""" self.assertEqual(pywikibot.page.html2unicode('A&amp;amp;O'), 'A&O')
+ def test_invalid_entities(self): + """Test texts with invalid entities.""" + self.assertEqual(pywikibot.page.html2unicode('A¬aname;O'), 'A¬aname;O') + self.assertEqual(pywikibot.page.html2unicode('Af;O'), 'Af;O') + self.assertEqual(pywikibot.page.html2unicode('f'), 'f') + self.assertEqual(pywikibot.page.html2unicode('py'), 'py') +
if __name__ == '__main__': try:
pywikibot-commits@lists.wikimedia.org