jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/452288 )
Change subject: pywikibot.tools.chars: Update and simplify the code ......................................................................
pywikibot.tools.chars: Update and simplify the code
- Use a string instead of a frozenset to define _category_cf. The string form will fit into the `invisible_regex` constructor more easily. - Update the characters using Python 3.7's unicodedata v11.0.0. Seven new characters were added: {'\U0001bca3', '\U0001bca1', '\U000110cd', '\u0605', '\U0001bca2', '\U0001bca0', '\u08e2'} - Rewrite invisible_regex using _category_cf and re.compile.
Change-Id: I5adf1d1d9b714ec00bbd8854d36b3253d01824d7 --- M pywikibot/tools/chars.py 1 file changed, 28 insertions(+), 38 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py index 7a9b4d8..e0722f1 100644 --- a/pywikibot/tools/chars.py +++ b/pywikibot/tools/chars.py @@ -7,56 +7,46 @@ # from __future__ import absolute_import, unicode_literals
+import re import sys
-from pywikibot.tools import LazyRegex -
if sys.version_info[0] > 2: unicode = str
-# All characters in the Cf category in a static list. When testing each Unicode -# codepoint it takes longer especially when working with UCS2. The lists also +# All characters in the Cf category. When testing each Unicode codepoint it +# takes longer especially when working with UCS2. The codepoints also # differ between Python versions which can be avoided by this static list. -_category_cf = frozenset([ - '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603', - '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e', - '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f', - '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e', - '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064', - '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a', - '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f', - '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd', - '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177', - '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020', - '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025', - '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a', - '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f', - '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034', - '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039', - '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e', - '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043', - '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048', - '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d', - '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052', - '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057', - '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c', - 
'\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061', - '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066', - '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b', - '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070', - '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075', - '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a', - '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f', -]) +_category_cf = ( + '\U000000AD\U00000600\U00000601\U00000602\U00000603\U00000604\U00000605' + '\U0000061C\U000006DD\U0000070F\U000008E2\U0000180E\U0000200B\U0000200C' + '\U0000200D\U0000200E\U0000200F\U0000202A\U0000202B\U0000202C\U0000202D' + '\U0000202E\U00002060\U00002061\U00002062\U00002063\U00002064\U00002066' + '\U00002067\U00002068\U00002069\U0000206A\U0000206B\U0000206C\U0000206D' + '\U0000206E\U0000206F\U0000FEFF\U0000FFF9\U0000FFFA\U0000FFFB\U000110BD' + '\U000110CD\U0001BCA0\U0001BCA1\U0001BCA2\U0001BCA3\U0001D173\U0001D174' + '\U0001D175\U0001D176\U0001D177\U0001D178\U0001D179\U0001D17A\U000E0001' + '\U000E0020\U000E0021\U000E0022\U000E0023\U000E0024\U000E0025\U000E0026' + '\U000E0027\U000E0028\U000E0029\U000E002A\U000E002B\U000E002C\U000E002D' + '\U000E002E\U000E002F\U000E0030\U000E0031\U000E0032\U000E0033\U000E0034' + '\U000E0035\U000E0036\U000E0037\U000E0038\U000E0039\U000E003A\U000E003B' + '\U000E003C\U000E003D\U000E003E\U000E003F\U000E0040\U000E0041\U000E0042' + '\U000E0043\U000E0044\U000E0045\U000E0046\U000E0047\U000E0048\U000E0049' + '\U000E004A\U000E004B\U000E004C\U000E004D\U000E004E\U000E004F\U000E0050' + '\U000E0051\U000E0052\U000E0053\U000E0054\U000E0055\U000E0056\U000E0057' + '\U000E0058\U000E0059\U000E005A\U000E005B\U000E005C\U000E005D\U000E005E' + '\U000E005F\U000E0060\U000E0061\U000E0062\U000E0063\U000E0064\U000E0065' + '\U000E0066\U000E0067\U000E0068\U000E0069\U000E006A\U000E006B\U000E006C' + 
'\U000E006D\U000E006E\U000E006F\U000E0070\U000E0071\U000E0072\U000E0073' + '\U000E0074\U000E0075\U000E0076\U000E0077\U000E0078\U000E0079\U000E007A' + '\U000E007B\U000E007C\U000E007D\U000E007E\U000E007F') + # This is a set of all invisible characters # At the moment we've only added the characters from the Cf category _invisible_chars = frozenset(_category_cf)
-invisible_regex = LazyRegex( - lambda: '[' + ''.join(_invisible_chars) + ']' -) +invisible_regex = re.compile('[' + _category_cf + ']')
def contains_invisible(text):
pywikibot-commits@lists.wikimedia.org