jenkins-bot merged this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
pywikibot.tools.chars: Update and simplify the code

- Use a string instead of a frozenset to define _category_cf. The string
form will fit into the `invisible_regex` constructor more easily.
- Update the characters using Python 3.7's unicodedata v11.0.0.
Seven new characters were added: {'\U0001bca3', '\U0001bca1',
'\U000110cd', '\u0605', '\U0001bca2', '\U0001bca0', '\u08e2'}
- Rewrite invisible_regex using _category_cf and re.compile.

Change-Id: I5adf1d1d9b714ec00bbd8854d36b3253d01824d7
---
M pywikibot/tools/chars.py
1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index 7a9b4d8..e0722f1 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -7,56 +7,46 @@
#
from __future__ import absolute_import, unicode_literals

+import re
import sys

-from pywikibot.tools import LazyRegex
-

if sys.version_info[0] > 2:
unicode = str


-# All characters in the Cf category in a static list. When testing each Unicode
-# codepoint it takes longer especially when working with UCS2. The lists also
+# All characters in the Cf category. When testing each Unicode codepoint it
+# takes longer especially when working with UCS2. The codepoints also
# differ between Python versions which can be avoided by this static list.
-_category_cf = frozenset([
- '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
- '\U00000604', '\U0000061c', '\U000006dd', '\U0000070f', '\U0000180e',
- '\U0000200b', '\U0000200c', '\U0000200d', '\U0000200e', '\U0000200f',
- '\U0000202a', '\U0000202b', '\U0000202c', '\U0000202d', '\U0000202e',
- '\U00002060', '\U00002061', '\U00002062', '\U00002063', '\U00002064',
- '\U00002066', '\U00002067', '\U00002068', '\U00002069', '\U0000206a',
- '\U0000206b', '\U0000206c', '\U0000206d', '\U0000206e', '\U0000206f',
- '\U0000feff', '\U0000fff9', '\U0000fffa', '\U0000fffb', '\U000110bd',
- '\U0001d173', '\U0001d174', '\U0001d175', '\U0001d176', '\U0001d177',
- '\U0001d178', '\U0001d179', '\U0001d17a', '\U000e0001', '\U000e0020',
- '\U000e0021', '\U000e0022', '\U000e0023', '\U000e0024', '\U000e0025',
- '\U000e0026', '\U000e0027', '\U000e0028', '\U000e0029', '\U000e002a',
- '\U000e002b', '\U000e002c', '\U000e002d', '\U000e002e', '\U000e002f',
- '\U000e0030', '\U000e0031', '\U000e0032', '\U000e0033', '\U000e0034',
- '\U000e0035', '\U000e0036', '\U000e0037', '\U000e0038', '\U000e0039',
- '\U000e003a', '\U000e003b', '\U000e003c', '\U000e003d', '\U000e003e',
- '\U000e003f', '\U000e0040', '\U000e0041', '\U000e0042', '\U000e0043',
- '\U000e0044', '\U000e0045', '\U000e0046', '\U000e0047', '\U000e0048',
- '\U000e0049', '\U000e004a', '\U000e004b', '\U000e004c', '\U000e004d',
- '\U000e004e', '\U000e004f', '\U000e0050', '\U000e0051', '\U000e0052',
- '\U000e0053', '\U000e0054', '\U000e0055', '\U000e0056', '\U000e0057',
- '\U000e0058', '\U000e0059', '\U000e005a', '\U000e005b', '\U000e005c',
- '\U000e005d', '\U000e005e', '\U000e005f', '\U000e0060', '\U000e0061',
- '\U000e0062', '\U000e0063', '\U000e0064', '\U000e0065', '\U000e0066',
- '\U000e0067', '\U000e0068', '\U000e0069', '\U000e006a', '\U000e006b',
- '\U000e006c', '\U000e006d', '\U000e006e', '\U000e006f', '\U000e0070',
- '\U000e0071', '\U000e0072', '\U000e0073', '\U000e0074', '\U000e0075',
- '\U000e0076', '\U000e0077', '\U000e0078', '\U000e0079', '\U000e007a',
- '\U000e007b', '\U000e007c', '\U000e007d', '\U000e007e', '\U000e007f',
-])
+_category_cf = (
+ '\U000000AD\U00000600\U00000601\U00000602\U00000603\U00000604\U00000605'
+ '\U0000061C\U000006DD\U0000070F\U000008E2\U0000180E\U0000200B\U0000200C'
+ '\U0000200D\U0000200E\U0000200F\U0000202A\U0000202B\U0000202C\U0000202D'
+ '\U0000202E\U00002060\U00002061\U00002062\U00002063\U00002064\U00002066'
+ '\U00002067\U00002068\U00002069\U0000206A\U0000206B\U0000206C\U0000206D'
+ '\U0000206E\U0000206F\U0000FEFF\U0000FFF9\U0000FFFA\U0000FFFB\U000110BD'
+ '\U000110CD\U0001BCA0\U0001BCA1\U0001BCA2\U0001BCA3\U0001D173\U0001D174'
+ '\U0001D175\U0001D176\U0001D177\U0001D178\U0001D179\U0001D17A\U000E0001'
+ '\U000E0020\U000E0021\U000E0022\U000E0023\U000E0024\U000E0025\U000E0026'
+ '\U000E0027\U000E0028\U000E0029\U000E002A\U000E002B\U000E002C\U000E002D'
+ '\U000E002E\U000E002F\U000E0030\U000E0031\U000E0032\U000E0033\U000E0034'
+ '\U000E0035\U000E0036\U000E0037\U000E0038\U000E0039\U000E003A\U000E003B'
+ '\U000E003C\U000E003D\U000E003E\U000E003F\U000E0040\U000E0041\U000E0042'
+ '\U000E0043\U000E0044\U000E0045\U000E0046\U000E0047\U000E0048\U000E0049'
+ '\U000E004A\U000E004B\U000E004C\U000E004D\U000E004E\U000E004F\U000E0050'
+ '\U000E0051\U000E0052\U000E0053\U000E0054\U000E0055\U000E0056\U000E0057'
+ '\U000E0058\U000E0059\U000E005A\U000E005B\U000E005C\U000E005D\U000E005E'
+ '\U000E005F\U000E0060\U000E0061\U000E0062\U000E0063\U000E0064\U000E0065'
+ '\U000E0066\U000E0067\U000E0068\U000E0069\U000E006A\U000E006B\U000E006C'
+ '\U000E006D\U000E006E\U000E006F\U000E0070\U000E0071\U000E0072\U000E0073'
+ '\U000E0074\U000E0075\U000E0076\U000E0077\U000E0078\U000E0079\U000E007A'
+ '\U000E007B\U000E007C\U000E007D\U000E007E\U000E007F')
+
# This is a set of all invisible characters
# At the moment we've only added the characters from the Cf category
_invisible_chars = frozenset(_category_cf)

-invisible_regex = LazyRegex(
- lambda: '[' + ''.join(_invisible_chars) + ']'
-)
+invisible_regex = re.compile('[' + _category_cf + ']')


def contains_invisible(text):

To view, visit change 452288. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I5adf1d1d9b714ec00bbd8854d36b3253d01824d7
Gerrit-Change-Number: 452288
Gerrit-PatchSet: 12
Gerrit-Owner: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999@gmail.com>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444@gmail.com>
Gerrit-Reviewer: jenkins-bot (75)
Gerrit-CC: Mpaa <mpaa.wiki@gmail.com>