jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/453786 )
Change subject: Revert "pywikibot.tools.chars: Update and simplify the code"
Revert "pywikibot.tools.chars: Update and simplify the code"
Revert 02e3830b5826, but keep added characters.
Replace `_invisible_chars = frozenset(_category_cf)` with `_invisible_chars = _category_cf`: `_category_cf` is already a frozenset, so there is no need to create a copy of it.
Bug: T202238 Change-Id: Ie55d3ba5f100e691f901f6be15e61425aef70795 --- M pywikibot/tools/chars.py 1 file changed, 42 insertions(+), 30 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py index e0722f1..b92c478 100644 --- a/pywikibot/tools/chars.py +++ b/pywikibot/tools/chars.py @@ -1,52 +1,64 @@ # -*- coding: utf-8 -*- """Character based helper functions(not wiki-dependent).""" # -# (C) Pywikibot team, 2015 +# (C) Pywikibot team, 2015-2018 # # Distributed under the terms of the MIT license. # from __future__ import absolute_import, unicode_literals
-import re import sys
+from pywikibot.tools import LazyRegex +
if sys.version_info[0] > 2: unicode = str
-# All characters in the Cf category. When testing each Unicode codepoint it -# takes longer especially when working with UCS2. The codepoints also +# All characters in the Cf category in a static list. When testing each Unicode +# codepoint it takes longer especially when working with UCS2. The lists also # differ between Python versions which can be avoided by this static list. -_category_cf = ( - '\U000000AD\U00000600\U00000601\U00000602\U00000603\U00000604\U00000605' - '\U0000061C\U000006DD\U0000070F\U000008E2\U0000180E\U0000200B\U0000200C' - '\U0000200D\U0000200E\U0000200F\U0000202A\U0000202B\U0000202C\U0000202D' - '\U0000202E\U00002060\U00002061\U00002062\U00002063\U00002064\U00002066' - '\U00002067\U00002068\U00002069\U0000206A\U0000206B\U0000206C\U0000206D' - '\U0000206E\U0000206F\U0000FEFF\U0000FFF9\U0000FFFA\U0000FFFB\U000110BD' - '\U000110CD\U0001BCA0\U0001BCA1\U0001BCA2\U0001BCA3\U0001D173\U0001D174' - '\U0001D175\U0001D176\U0001D177\U0001D178\U0001D179\U0001D17A\U000E0001' - '\U000E0020\U000E0021\U000E0022\U000E0023\U000E0024\U000E0025\U000E0026' - '\U000E0027\U000E0028\U000E0029\U000E002A\U000E002B\U000E002C\U000E002D' - '\U000E002E\U000E002F\U000E0030\U000E0031\U000E0032\U000E0033\U000E0034' - '\U000E0035\U000E0036\U000E0037\U000E0038\U000E0039\U000E003A\U000E003B' - '\U000E003C\U000E003D\U000E003E\U000E003F\U000E0040\U000E0041\U000E0042' - '\U000E0043\U000E0044\U000E0045\U000E0046\U000E0047\U000E0048\U000E0049' - '\U000E004A\U000E004B\U000E004C\U000E004D\U000E004E\U000E004F\U000E0050' - '\U000E0051\U000E0052\U000E0053\U000E0054\U000E0055\U000E0056\U000E0057' - '\U000E0058\U000E0059\U000E005A\U000E005B\U000E005C\U000E005D\U000E005E' - '\U000E005F\U000E0060\U000E0061\U000E0062\U000E0063\U000E0064\U000E0065' - '\U000E0066\U000E0067\U000E0068\U000E0069\U000E006A\U000E006B\U000E006C' - '\U000E006D\U000E006E\U000E006F\U000E0070\U000E0071\U000E0072\U000E0073' - '\U000E0074\U000E0075\U000E0076\U000E0077\U000E0078\U000E0079\U000E007A' - 
'\U000E007B\U000E007C\U000E007D\U000E007E\U000E007F') - +_category_cf = frozenset([ + '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603', + '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f', + '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d', + '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c', + '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062', + '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068', + '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d', + '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa', + '\U0000fffb', '\U000110bd', '\U000110cd', '\U0001bca0', '\U0001bca1', + '\U0001bca2', '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175', + '\U0001d176', '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a', + '\U000e0001', '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023', + '\U000e0024', '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028', + '\U000e0029', '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d', + '\U000e002e', '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032', + '\U000e0033', '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037', + '\U000e0038', '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c', + '\U000e003d', '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041', + '\U000e0042', '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046', + '\U000e0047', '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b', + '\U000e004c', '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050', + '\U000e0051', '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055', + '\U000e0056', '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a', + '\U000e005b', '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f', + '\U000e0060', '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064', + '\U000e0065', '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069', + '\U000e006a', '\U000e006b', '\U000e006c', 
'\U000e006d', '\U000e006e', + '\U000e006f', '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073', + '\U000e0074', '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078', + '\U000e0079', '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d', + '\U000e007e', '\U000e007f', +]) # This is a set of all invisible characters # At the moment we've only added the characters from the Cf category -_invisible_chars = frozenset(_category_cf) +_invisible_chars = _category_cf
-invisible_regex = re.compile('[' + _category_cf + ']') +invisible_regex = LazyRegex( + lambda: '[' + ''.join(_invisible_chars) + ']' +)
def contains_invisible(text):