jenkins-bot has submitted this change and it was merged.
Change subject: [FEAT] chars: Generic module for char classes
[FEAT] chars: Generic module for char classes
Add the pywikibot.tools.chars module, which currently handles only invisible characters. It is now used by the replace script (instead of a script-specific implementation), and the PatchManager class also uses this module to replace invisible characters with placeholders.
Change-Id: I79c84f6aa5d980e5481e6b441dcd590f00f1a320 --- M pywikibot/diff.py A pywikibot/tools/chars.py M scripts/replace.py A tests/tools_chars_tests.py 4 files changed, 162 insertions(+), 12 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved; jenkins-bot: Verified
diff --git a/pywikibot/diff.py b/pywikibot/diff.py index 5acc741..08f939e 100644 --- a/pywikibot/diff.py +++ b/pywikibot/diff.py @@ -26,6 +26,8 @@ BeautifulSoup = False
import pywikibot +from pywikibot.tools import chars + from pywikibot.backports import format_range_unified # introduced in 2.7.2 from pywikibot.tools import deprecated_args
@@ -221,7 +223,8 @@ """
@deprecated_args(n='context') - def __init__(self, text_a, text_b, context=0, by_letter=False): + def __init__(self, text_a, text_b, context=0, by_letter=False, + replace_invisible=False): """Constructor.
@param text_a: base text @@ -233,6 +236,9 @@ @param by_letter: if text_a and text_b are single lines, comparison can be done letter by letter. @type by_letter: bool + @param replace_invisible: Replace invisible characters like U+200e with + the charnumber in brackets (e.g. <200e>). + @type replace_invisible: bool """ if '\n' in text_a or '\n' in text_b: self.a = text_a.splitlines(1) @@ -265,6 +271,7 @@ self.blocks = self.get_blocks() self.context = context self._super_hunks = self._generate_super_hunks() + self._replace_invisible = replace_invisible
def get_blocks(self): """Return list with blocks of indexes which compose a and, where applicable, b. @@ -352,7 +359,10 @@ output += extend_context(previous_hunk.a_rng[1], hunk.a_rng[0]) previous_hunk = hunk output += hunk.diff_text - return output + extend_context(hunks[-1].a_rng[1], context_range[0][1]) + output += extend_context(hunks[-1].a_rng[1], context_range[0][1]) + if self._replace_invisible: + output = chars.replace_invisible(output) + return output
def review_hunks(self): """Review hunks.""" diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py new file mode 100644 index 0000000..9a29e24 --- /dev/null +++ b/pywikibot/tools/chars.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +"""Character based helper functions(not wiki-dependent).""" +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. +# +from __future__ import unicode_literals + +__version__ = '$Id$' + +import sys + +from pywikibot.tools import LazyRegex + + +if sys.version_info[0] > 2: + unicode = str + + +# All characters in the Cf category in a static list. When testing each Unicode +# codepoint it takes longer especially when working with UCS2. The lists also +# differ between Python versions which can be avoided by this static list. +_category_cf = frozenset([ + '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603', + '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e', + '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f', + '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e', + '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064', + '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a', + '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f', + '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd', + '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177', + '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020', + '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025', + '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a', + '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f', + '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034', + '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039', + '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e', + '\U000e003f', 
'\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043', + '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048', + '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d', + '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052', + '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057', + '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c', + '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061', + '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066', + '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b', + '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070', + '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075', + '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a', + '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f', +]) +# This is a set of all invisible characters +# At the moment we've only added the characters from the Cf category +_invisible_chars = frozenset(_category_cf) + +# TODO: Is that complex and a lazy regex justified? 
+invisible_regex = LazyRegex() +invisible_regex.raw = '[' + ''.join(_invisible_chars) + ']' +invisible_regex.flags = 0 + + +def contains_invisible(text): + """Return True if the text contain any of the invisible characters.""" + return any(char in _invisible_chars for char in text) + + +def replace_invisible(text): + """Replace invisible characters by '<codepoint>'.""" + def replace(match): + match = match.group() + if sys.maxunicode < 0x10ffff and len(match) == 2: + mask = (1 << 10) - 1 + assert(ord(match[0]) & ~mask == 0xd800) + assert(ord(match[1]) & ~mask == 0xdc00) + codepoint = (ord(match[0]) & mask) << 10 | (ord(match[1]) & mask) + else: + codepoint = ord(match) + return '<{0:x}>'.format(codepoint) + return invisible_regex.sub(replace, text) diff --git a/scripts/replace.py b/scripts/replace.py index 7cb7764..7c19f3c 100755 --- a/scripts/replace.py +++ b/scripts/replace.py @@ -137,7 +137,6 @@ import re import time import sys -import unicodedata
import pywikibot from pywikibot import i18n, textlib, pagegenerators, Bot @@ -145,6 +144,8 @@
# Imports predefined replacements tasks from fixes.py from pywikibot import fixes + +from pywikibot.tools import chars
if sys.version_info[0] > 2: basestring = (str, ) @@ -667,11 +668,6 @@ return pattern
-def contains_format_characters(text): - """Return True when there are format characters (e.g. U+200E) in text.""" - return any(unicodedata.category(char) == 'Cf' for char in text) - - def main(*args): """ Process command line arguments and invoke bot. @@ -881,12 +877,14 @@ set_summary) for replacement in fix['replacements']: summary = None if len(replacement) < 3 else replacement[2] - if contains_format_characters(replacement[0]): + if chars.contains_invisible(replacement[0]): pywikibot.warning('The old string "{0}" contains formatting ' - 'characters like U+200E'.format(replacement[0])) - if contains_format_characters(replacement[1]): + 'characters like U+200E'.format( + chars.replace_invisible(replacement[0]))) + if chars.contains_invisible(replacement[1]): pywikibot.warning('The new string "{0}" contains formatting ' - 'characters like U+200E'.format(replacement[1])) + 'characters like U+200E'.format( + chars.replace_invisible(replacement[1]))) replacements.append(ReplacementListEntry( old=replacement[0], new=replacement[1], diff --git a/tests/tools_chars_tests.py b/tests/tools_chars_tests.py new file mode 100644 index 0000000..e7c45d1 --- /dev/null +++ b/tests/tools_chars_tests.py @@ -0,0 +1,60 @@ +#!/usr/bin/python +"""Test tools.chars package.""" +# -*- coding: utf-8 -*- +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. 
+from __future__ import unicode_literals + +__version__ = '$Id$' + +import sys +import unicodedata + +from pywikibot.tools import chars + +from tests.aspects import unittest, TestCase + + +class CharsTestCase(TestCase): + + """General test case testing the module.""" + + net = False + + def test_replace(self): + """Test replace_invisible.""" + self.assertEqual(chars.replace_invisible('Hello world!'), 'Hello world!') + self.assertEqual(chars.replace_invisible('\u200eRTL\u200f'), '<200e>RTL<200f>') + + def test_contains(self): + """Test contains_invisible.""" + self.assertFalse(chars.contains_invisible('Hello world!')) + self.assertTrue(chars.contains_invisible('\u200eRTL\u200f')) + + def test_category_cf(self): + """Test that all characters in _category_cf are actually in Cf.""" + invalid = {} + for char in chars._category_cf: + cat = unicodedata.category(char) + if cat != 'Cf': + invalid[char] = cat + if sys.version_info[0] == 2: + # These weren't defined in Unicode 5.2 (which is what Py2 is using) + self.assertEqual(invalid.pop('\u0604'), 'Cn') + self.assertEqual(invalid.pop('\u061c'), 'Cn') + self.assertEqual(invalid.pop('\u2066'), 'Cn') + self.assertEqual(invalid.pop('\u2067'), 'Cn') + self.assertEqual(invalid.pop('\u2068'), 'Cn') + self.assertEqual(invalid.pop('\u2069'), 'Cn') + # This category has changed between Unicode 6 and 7 to Cf + self.assertEqual(invalid.pop('\u180e'), 'Zs') + self.assertCountEqual(invalid.items(), []) + + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass