jenkins-bot merged this change.
Revert "pywikibot.tools.chars: Update and simplify the code"
Revert 02e3830b5826, but keep the added characters.
Replace `_invisible_chars = frozenset(_category_cf)` with
`_invisible_chars = _category_cf`: _category_cf is already a frozenset,
so there is no need to create a copy of it.
Bug: T202238
Change-Id: Ie55d3ba5f100e691f901f6be15e61425aef70795
---
M pywikibot/tools/chars.py
1 file changed, 42 insertions(+), 30 deletions(-)
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index e0722f1..b92c478 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -1,52 +1,64 @@
# -*- coding: utf-8 -*-
"""Character based helper functions(not wiki-dependent)."""
#
-# (C) Pywikibot team, 2015
+# (C) Pywikibot team, 2015-2018
#
# Distributed under the terms of the MIT license.
#
from __future__ import absolute_import, unicode_literals
-import re
import sys
+from pywikibot.tools import LazyRegex
+
if sys.version_info[0] > 2:
unicode = str
-# All characters in the Cf category. When testing each Unicode codepoint it
-# takes longer especially when working with UCS2. The codepoints also
+# All characters in the Cf category in a static list. When testing each Unicode
+# codepoint it takes longer especially when working with UCS2. The lists also
# differ between Python versions which can be avoided by this static list.
-_category_cf = (
- '\U000000AD\U00000600\U00000601\U00000602\U00000603\U00000604\U00000605'
- '\U0000061C\U000006DD\U0000070F\U000008E2\U0000180E\U0000200B\U0000200C'
- '\U0000200D\U0000200E\U0000200F\U0000202A\U0000202B\U0000202C\U0000202D'
- '\U0000202E\U00002060\U00002061\U00002062\U00002063\U00002064\U00002066'
- '\U00002067\U00002068\U00002069\U0000206A\U0000206B\U0000206C\U0000206D'
- '\U0000206E\U0000206F\U0000FEFF\U0000FFF9\U0000FFFA\U0000FFFB\U000110BD'
- '\U000110CD\U0001BCA0\U0001BCA1\U0001BCA2\U0001BCA3\U0001D173\U0001D174'
- '\U0001D175\U0001D176\U0001D177\U0001D178\U0001D179\U0001D17A\U000E0001'
- '\U000E0020\U000E0021\U000E0022\U000E0023\U000E0024\U000E0025\U000E0026'
- '\U000E0027\U000E0028\U000E0029\U000E002A\U000E002B\U000E002C\U000E002D'
- '\U000E002E\U000E002F\U000E0030\U000E0031\U000E0032\U000E0033\U000E0034'
- '\U000E0035\U000E0036\U000E0037\U000E0038\U000E0039\U000E003A\U000E003B'
- '\U000E003C\U000E003D\U000E003E\U000E003F\U000E0040\U000E0041\U000E0042'
- '\U000E0043\U000E0044\U000E0045\U000E0046\U000E0047\U000E0048\U000E0049'
- '\U000E004A\U000E004B\U000E004C\U000E004D\U000E004E\U000E004F\U000E0050'
- '\U000E0051\U000E0052\U000E0053\U000E0054\U000E0055\U000E0056\U000E0057'
- '\U000E0058\U000E0059\U000E005A\U000E005B\U000E005C\U000E005D\U000E005E'
- '\U000E005F\U000E0060\U000E0061\U000E0062\U000E0063\U000E0064\U000E0065'
- '\U000E0066\U000E0067\U000E0068\U000E0069\U000E006A\U000E006B\U000E006C'
- '\U000E006D\U000E006E\U000E006F\U000E0070\U000E0071\U000E0072\U000E0073'
- '\U000E0074\U000E0075\U000E0076\U000E0077\U000E0078\U000E0079\U000E007A'
- '\U000E007B\U000E007C\U000E007D\U000E007E\U000E007F')
-
+_category_cf = frozenset([
+ '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+ '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f',
+ '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d',
+ '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c',
+ '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062',
+ '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068',
+ '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d',
+ '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa',
+ '\U0000fffb', '\U000110bd', '\U000110cd', '\U0001bca0', '\U0001bca1',
+ '\U0001bca2', '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175',
+ '\U0001d176', '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a',
+ '\U000e0001', '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023',
+ '\U000e0024', '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028',
+ '\U000e0029', '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d',
+ '\U000e002e', '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032',
+ '\U000e0033', '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037',
+ '\U000e0038', '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c',
+ '\U000e003d', '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041',
+ '\U000e0042', '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046',
+ '\U000e0047', '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b',
+ '\U000e004c', '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050',
+ '\U000e0051', '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055',
+ '\U000e0056', '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a',
+ '\U000e005b', '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f',
+ '\U000e0060', '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064',
+ '\U000e0065', '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069',
+ '\U000e006a', '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e',
+ '\U000e006f', '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073',
+ '\U000e0074', '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078',
+ '\U000e0079', '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d',
+ '\U000e007e', '\U000e007f',
+])
# This is a set of all invisible characters
# At the moment we've only added the characters from the Cf category
-_invisible_chars = frozenset(_category_cf)
+_invisible_chars = _category_cf
-invisible_regex = re.compile('[' + _category_cf + ']')
+invisible_regex = LazyRegex(
+ lambda: '[' + ''.join(_invisible_chars) + ']'
+)
def contains_invisible(text):
To view, visit change 453786. To unsubscribe, or for help writing mail filters, visit settings.