jenkins-bot submitted this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

[unicode] Update characters in the Cf category to Unicode version 12.1.0

Update the characters in the Cf category from Unicode version 11.0.0 to
12.1.0 and move the frozenset to _unidata.py, which holds other data
derived from the unicodedata module.

Change-Id: Ieec0a10956d7a05203773e7cfe78bac70de32a28
---
M pywikibot/tools/_unidata.py
M pywikibot/tools/chars.py
2 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/pywikibot/tools/_unidata.py b/pywikibot/tools/_unidata.py
index f23ae25..53a76c4 100644
--- a/pywikibot/tools/_unidata.py
+++ b/pywikibot/tools/_unidata.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""Helper function fo MediaWiki title-cased forms."""
+"""Helper function which holds data from unicodedata library."""
 #
 # (C) Pywikibot team, 2018-2020
 #
@@ -266,3 +266,47 @@
 
 
 _first_upper_exception = _first_upper_exception_dict.get
+
+
+# All characters in the Cf category, kept as a static frozenset. Testing each
+# Unicode codepoint instead takes longer, especially when working with UCS2.
+# The result also differs between Python versions, which a static set avoids.
+#
+# This frozenset was created using Python 3.8 (Unicode version 12.1.0):
+# list(c for c in (chr(i) for i in range(sys.maxunicode))
+#      if unicodedata.category(c) == 'Cf')
+_category_cf = frozenset([
+    '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+    '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f',
+    '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d',
+    '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c',
+    '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062',
+    '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068',
+    '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d',
+    '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa',
+    '\U0000fffb', '\U000110bd', '\U000110cd', '\U00013430', '\U00013431',
+    '\U00013432', '\U00013433', '\U00013434', '\U00013435', '\U00013436',
+    '\U00013437', '\U00013438', '\U0001bca0', '\U0001bca1', '\U0001bca2',
+    '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176',
+    '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001',
+    '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024',
+    '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029',
+    '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e',
+    '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033',
+    '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038',
+    '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d',
+    '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042',
+    '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047',
+    '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c',
+    '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051',
+    '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056',
+    '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b',
+    '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060',
+    '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065',
+    '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a',
+    '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f',
+    '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074',
+    '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079',
+    '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e',
+    '\U000e007f',
+])
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index 072a4d0..5d5ba35 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -1,60 +1,20 @@
 # -*- coding: utf-8 -*-
 """Character based helper functions (not wiki-dependent)."""
 #
-# (C) Pywikibot team, 2015-2019
+# (C) Pywikibot team, 2015-2020
 #
 # Distributed under the terms of the MIT license.
 #
-from __future__ import absolute_import, division, unicode_literals
-
 import sys
 
+from pywikibot.tools._unidata import _category_cf
 from pywikibot.tools import LazyRegex
 
-
-# All characters in the Cf category in a static list. When testing each Unicode
-# codepoint it takes longer especially when working with UCS2. The lists also
-# differ between Python versions which can be avoided by this static list.
-_category_cf = frozenset([
-    '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
-    '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f',
-    '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d',
-    '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c',
-    '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062',
-    '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068',
-    '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d',
-    '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa',
-    '\U0000fffb', '\U000110bd', '\U000110cd', '\U0001bca0', '\U0001bca1',
-    '\U0001bca2', '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175',
-    '\U0001d176', '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a',
-    '\U000e0001', '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023',
-    '\U000e0024', '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028',
-    '\U000e0029', '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d',
-    '\U000e002e', '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032',
-    '\U000e0033', '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037',
-    '\U000e0038', '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c',
-    '\U000e003d', '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041',
-    '\U000e0042', '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046',
-    '\U000e0047', '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b',
-    '\U000e004c', '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050',
-    '\U000e0051', '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055',
-    '\U000e0056', '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a',
-    '\U000e005b', '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f',
-    '\U000e0060', '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064',
-    '\U000e0065', '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069',
-    '\U000e006a', '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e',
-    '\U000e006f', '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073',
-    '\U000e0074', '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078',
-    '\U000e0079', '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d',
-    '\U000e007e', '\U000e007f',
-])
 # This is a set of all invisible characters
 # At the moment we've only added the characters from the Cf category
 _invisible_chars = _category_cf
 
-invisible_regex = LazyRegex(
-    lambda: '[' + ''.join(_invisible_chars) + ']'
-)
+invisible_regex = LazyRegex(lambda: '[{}]'.format(''.join(_invisible_chars)))
 
 
 def contains_invisible(text):

To view, visit change 615717. To unsubscribe, or for help writing mail filters, visit settings.