jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/615717 )
Change subject: [unicode] Update characters in the Cf category to Unicode version 12.1.0 ......................................................................
[unicode] Update characters in the Cf category to Unicode version 12.1.0
Update the characters in the Cf category from Unicode version 11.0.0 to 12.1.0, and move the frozenset to _unidata.py, which holds other data derived from the unicodedata module.
Change-Id: Ieec0a10956d7a05203773e7cfe78bac70de32a28 --- M pywikibot/tools/_unidata.py M pywikibot/tools/chars.py 2 files changed, 48 insertions(+), 44 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/tools/_unidata.py b/pywikibot/tools/_unidata.py index f23ae25..53a76c4 100644 --- a/pywikibot/tools/_unidata.py +++ b/pywikibot/tools/_unidata.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""Helper function fo MediaWiki title-cased forms.""" +"""Helper function which holds data from unicodedata library.""" # # (C) Pywikibot team, 2018-2020 # @@ -266,3 +266,47 @@
_first_upper_exception = _first_upper_exception_dict.get + + +# All characters in the Cf category in a static list. When testing each Unicode +# codepoint it takes longer especially when working with UCS2. The lists also +# differ between Python versions which can be avoided by this static list. +# +# This frozenset was created using Python 3.8 (Unicode version 12.1.0): +# list(c for c in (chr(i) for i in range(sys.maxunicode)) +# if unicodedata.category(c) == 'Cf') +_category_cf = frozenset([ + '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603', + '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f', + '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d', + '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c', + '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062', + '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068', + '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d', + '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa', + '\U0000fffb', '\U000110bd', '\U000110cd', '\U00013430', '\U00013431', + '\U00013432', '\U00013433', '\U00013434', '\U00013435', '\U00013436', + '\U00013437', '\U00013438', '\U0001bca0', '\U0001bca1', '\U0001bca2', + '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', + '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', + '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', + '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', + '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', + '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', + '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', + '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', + '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', + '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046', 
'\U000e0047', + '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', + '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', + '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', + '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', + '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', + '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', + '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', + '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', + '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', + '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', + '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', + '\U000e007f', +]) diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py index 072a4d0..5d5ba35 100644 --- a/pywikibot/tools/chars.py +++ b/pywikibot/tools/chars.py @@ -1,60 +1,20 @@ # -*- coding: utf-8 -*- """Character based helper functions (not wiki-dependent).""" # -# (C) Pywikibot team, 2015-2019 +# (C) Pywikibot team, 2015-2020 # # Distributed under the terms of the MIT license. # -from __future__ import absolute_import, division, unicode_literals - import sys
+from pywikibot.tools._unidata import _category_cf from pywikibot.tools import LazyRegex
- -# All characters in the Cf category in a static list. When testing each Unicode -# codepoint it takes longer especially when working with UCS2. The lists also -# differ between Python versions which can be avoided by this static list. -_category_cf = frozenset([ - '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603', - '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f', - '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d', - '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c', - '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062', - '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068', - '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d', - '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa', - '\U0000fffb', '\U000110bd', '\U000110cd', '\U0001bca0', '\U0001bca1', - '\U0001bca2', '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175', - '\U0001d176', '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a', - '\U000e0001', '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023', - '\U000e0024', '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028', - '\U000e0029', '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d', - '\U000e002e', '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032', - '\U000e0033', '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037', - '\U000e0038', '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c', - '\U000e003d', '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041', - '\U000e0042', '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046', - '\U000e0047', '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b', - '\U000e004c', '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050', - '\U000e0051', '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055', - '\U000e0056', '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a', - '\U000e005b', '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f', - 
'\U000e0060', '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064', - '\U000e0065', '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069', - '\U000e006a', '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e', - '\U000e006f', '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073', - '\U000e0074', '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078', - '\U000e0079', '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d', - '\U000e007e', '\U000e007f', -]) # This is a set of all invisible characters # At the moment we've only added the characters from the Cf category _invisible_chars = _category_cf
-invisible_regex = LazyRegex( - lambda: '[' + ''.join(_invisible_chars) + ']' -) +invisible_regex = LazyRegex(lambda: '[{}]'.format(''.join(_invisible_chars)))
def contains_invisible(text):