jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[unicode] Update characters in the Cf category to Unicode version 12.1.0

Update characters in the Cf category from Unicode version 11.0.0 to
12.1.0 and move the frozenset to _unidata.py, which holds other data
derived from the unicodedata module.

Change-Id: Ieec0a10956d7a05203773e7cfe78bac70de32a28
---
M pywikibot/tools/_unidata.py
M pywikibot/tools/chars.py
2 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/pywikibot/tools/_unidata.py b/pywikibot/tools/_unidata.py
index f23ae25..53a76c4 100644
--- a/pywikibot/tools/_unidata.py
+++ b/pywikibot/tools/_unidata.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-"""Helper function fo MediaWiki title-cased forms."""
+"""Helper function which holds data from unicodedata library."""
#
# (C) Pywikibot team, 2018-2020
#
@@ -266,3 +266,47 @@


_first_upper_exception = _first_upper_exception_dict.get
+
+
+# All characters in the Cf category in a static list. When testing each Unicode
+# codepoint it takes longer especially when working with UCS2. The lists also
+# differ between Python versions which can be avoided by this static list.
+#
+# This frozenset was created using Python 3.8 (Unicode version 12.1.0):
+# list(c for c in (chr(i) for i in range(sys.maxunicode))
+# if unicodedata.category(c) == 'Cf')
+_category_cf = frozenset([
+ '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+ '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f',
+ '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d',
+ '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c',
+ '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062',
+ '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068',
+ '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d',
+ '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa',
+ '\U0000fffb', '\U000110bd', '\U000110cd', '\U00013430', '\U00013431',
+ '\U00013432', '\U00013433', '\U00013434', '\U00013435', '\U00013436',
+ '\U00013437', '\U00013438', '\U0001bca0', '\U0001bca1', '\U0001bca2',
+ '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176',
+ '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001',
+ '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024',
+ '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029',
+ '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e',
+ '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033',
+ '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038',
+ '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d',
+ '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042',
+ '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047',
+ '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c',
+ '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051',
+ '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056',
+ '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b',
+ '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060',
+ '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065',
+ '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a',
+ '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f',
+ '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074',
+ '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079',
+ '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e',
+ '\U000e007f',
+])
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index 072a4d0..5d5ba35 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -1,60 +1,20 @@
# -*- coding: utf-8 -*-
"""Character based helper functions (not wiki-dependent)."""
#
-# (C) Pywikibot team, 2015-2019
+# (C) Pywikibot team, 2015-2020
#
# Distributed under the terms of the MIT license.
#
-from __future__ import absolute_import, division, unicode_literals
-
import sys

+from pywikibot.tools._unidata import _category_cf
from pywikibot.tools import LazyRegex

-
-# All characters in the Cf category in a static list. When testing each Unicode
-# codepoint it takes longer especially when working with UCS2. The lists also
-# differ between Python versions which can be avoided by this static list.
-_category_cf = frozenset([
- '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
- '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f',
- '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d',
- '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c',
- '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062',
- '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068',
- '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d',
- '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa',
- '\U0000fffb', '\U000110bd', '\U000110cd', '\U0001bca0', '\U0001bca1',
- '\U0001bca2', '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175',
- '\U0001d176', '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a',
- '\U000e0001', '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023',
- '\U000e0024', '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028',
- '\U000e0029', '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d',
- '\U000e002e', '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032',
- '\U000e0033', '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037',
- '\U000e0038', '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c',
- '\U000e003d', '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041',
- '\U000e0042', '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046',
- '\U000e0047', '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b',
- '\U000e004c', '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050',
- '\U000e0051', '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055',
- '\U000e0056', '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a',
- '\U000e005b', '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f',
- '\U000e0060', '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064',
- '\U000e0065', '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069',
- '\U000e006a', '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e',
- '\U000e006f', '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073',
- '\U000e0074', '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078',
- '\U000e0079', '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d',
- '\U000e007e', '\U000e007f',
-])
# This is a set of all invisible characters
# At the moment we've only added the characters from the Cf category
_invisible_chars = _category_cf

-invisible_regex = LazyRegex(
- lambda: '[' + ''.join(_invisible_chars) + ']'
-)
+invisible_regex = LazyRegex(lambda: '[{}]'.format(''.join(_invisible_chars)))


def contains_invisible(text):

To view, visit change 615717. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ieec0a10956d7a05203773e7cfe78bac70de32a28
Gerrit-Change-Number: 615717
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-CC: Dalba <dalba.wiki@gmail.com>
Gerrit-CC: Huji <huji.huji@gmail.com>
Gerrit-MessageType: merged