jenkins-bot has submitted this change and it was merged.
Change subject: [IMPROV] page: re.sub instead of manual iteration
......................................................................
[IMPROV] page: re.sub instead of manual iteration
The re.sub method accepts a callable as the replacement, which lets it
perform the replacements automatically without manually stitching the
text together.
Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a
---
M pywikibot/page.py
M tests/page_tests.py
2 files changed, 33 insertions(+), 34 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py
index f38dc5c..8f9781b 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -4814,42 +4814,33 @@
# ensuring that illegal &#129; &#141; and &#157;, which have no known
values,
# don't get converted to chr(129), chr(141) or chr(157)
ignore = set(ignore) | set([129, 141, 157])
- result = u''
- i = 0
- found = True
- while found:
- text = text[i:]
- match = entityR.search(text)
- if match:
- unicodeCodepoint = None
- if match.group('decimal'):
- unicodeCodepoint = int(match.group('decimal'))
- elif match.group('hex'):
- unicodeCodepoint = int(match.group('hex'), 16)
- elif match.group('name'):
- name = match.group('name')
- if name in htmlentitydefs.name2codepoint:
- # We found a known HTML entity.
- unicodeCodepoint = htmlentitydefs.name2codepoint[name]
- result += text[:match.start()]
- try:
- unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
- except KeyError:
- pass
- if unicodeCodepoint and unicodeCodepoint not in ignore:
- if unicodeCodepoint > sys.maxunicode:
- # solve narrow Python 2 build exception (UTF-16)
- result +=
eval(r"u'\U{:08x}'".format(unicodeCodepoint))
- else:
- result += chr(unicodeCodepoint)
+
+ def handle_entity(match):
+ if match.group('decimal'):
+ unicodeCodepoint = int(match.group('decimal'))
+ elif match.group('hex'):
+ unicodeCodepoint = int(match.group('hex'), 16)
+ elif match.group('name'):
+ name = match.group('name')
+ if name in htmlentitydefs.name2codepoint:
+ # We found a known HTML entity.
+ unicodeCodepoint = htmlentitydefs.name2codepoint[name]
else:
- # Leave the entity unchanged
- result += text[match.start():match.end()]
- i = match.end()
+ unicodeCodepoint = False
+ try:
+ unicodeCodepoint = convertIllegalHtmlEntities[unicodeCodepoint]
+ except KeyError:
+ pass
+ if unicodeCodepoint and unicodeCodepoint not in ignore:
+ if unicodeCodepoint > sys.maxunicode:
+ # solve narrow Python 2 build exception (UTF-16)
+ return eval(r"u'\U{:08x}'".format(unicodeCodepoint))
+ else:
+ return chr(unicodeCodepoint)
else:
- result += text
- found = False
- return result
+ # Leave the entity unchanged
+ return match.group(0)
+ return entityR.sub(handle_entity, text)
def UnicodeToAsciiHtml(s):
diff --git a/tests/page_tests.py b/tests/page_tests.py
index bc77765..f2ab434 100644
--- a/tests/page_tests.py
+++ b/tests/page_tests.py
@@ -796,12 +796,20 @@
self.assertEqual(pywikibot.page.html2unicode('A&amp;O'),
'A&O')
self.assertEqual(pywikibot.page.html2unicode('&#x70;&#x79;'),
'py')
self.assertEqual(pywikibot.page.html2unicode('&#x10000;'),
u'\U00010000')
+
self.assertEqual(pywikibot.page.html2unicode('&#x70;&amp;&#x79;'),
'p&y')
@unittest.expectedFailure
def test_recursive_entities(self):
"""Test recursive entities."""
self.assertEqual(pywikibot.page.html2unicode('A&amp;amp;O'),
'A&amp;O')
+ def test_invalid_entities(self):
+ """Test texts with invalid entities."""
+ self.assertEqual(pywikibot.page.html2unicode('A&notaname;O'),
'A&notaname;O')
+ self.assertEqual(pywikibot.page.html2unicode('A&#7f;O'),
'A&#7f;O')
+ self.assertEqual(pywikibot.page.html2unicode('&#7f'),
'&#7f')
+ self.assertEqual(pywikibot.page.html2unicode('&#x70&#x79;'),
'&#x70y')
+
if __name__ == '__main__':
try:
--
To view, visit
https://gerrit.wikimedia.org/r/196424
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I1f46f2285bdaf7749bfc454ac1d47a4f18b77d8a
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Ricordisamoa <ricordisamoa(a)openmailbox.org>
Gerrit-Reviewer: jenkins-bot <>