[Pywikibot-commits] [Gerrit] pywikibot/core[master]: Revert "pywikibot.tools.chars: Update and simplify the code"

19 Aug 2018

jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/453786 )
Change subject: Revert "pywikibot.tools.chars: Update and simplify the code"
......................................................................
Revert "pywikibot.tools.chars: Update and simplify the code"
Revert 02e3830b5826, but keep the added characters.
Replace `_invisible_chars = frozenset(_category_cf)` with
`_invisible_chars = _category_cf`: `_category_cf` is already a frozenset,
so there is no need to create a copy of it.
Bug: T202238
Change-Id: Ie55d3ba5f100e691f901f6be15e61425aef70795
---
M pywikibot/tools/chars.py
1 file changed, 42 insertions(+), 30 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index e0722f1..b92c478 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -1,52 +1,64 @@
 # -*- coding: utf-8 -*-
 """Character based helper functions(not wiki-dependent)."""
 #
-# (C) Pywikibot team, 2015
+# (C) Pywikibot team, 2015-2018
 #
 # Distributed under the terms of the MIT license.
 #
 from __future__ import absolute_import, unicode_literals
-import re
 import sys
+from pywikibot.tools import LazyRegex
+
 if sys.version_info[0] > 2:
     unicode = str
-# All characters in the Cf category. When testing each Unicode codepoint it
-# takes longer especially when working with UCS2. The codepoints also
+# All characters in the Cf category in a static list. When testing each Unicode
+# codepoint it takes longer especially when working with UCS2. The lists also
 # differ between Python versions which can be avoided by this static list.
-_category_cf = (
-    '\U000000AD\U00000600\U00000601\U00000602\U00000603\U00000604\U00000605'
-    '\U0000061C\U000006DD\U0000070F\U000008E2\U0000180E\U0000200B\U0000200C'
-    '\U0000200D\U0000200E\U0000200F\U0000202A\U0000202B\U0000202C\U0000202D'
-    '\U0000202E\U00002060\U00002061\U00002062\U00002063\U00002064\U00002066'
-    '\U00002067\U00002068\U00002069\U0000206A\U0000206B\U0000206C\U0000206D'
-    '\U0000206E\U0000206F\U0000FEFF\U0000FFF9\U0000FFFA\U0000FFFB\U000110BD'
-    '\U000110CD\U0001BCA0\U0001BCA1\U0001BCA2\U0001BCA3\U0001D173\U0001D174'
-    '\U0001D175\U0001D176\U0001D177\U0001D178\U0001D179\U0001D17A\U000E0001'
-    '\U000E0020\U000E0021\U000E0022\U000E0023\U000E0024\U000E0025\U000E0026'
-    '\U000E0027\U000E0028\U000E0029\U000E002A\U000E002B\U000E002C\U000E002D'
-    '\U000E002E\U000E002F\U000E0030\U000E0031\U000E0032\U000E0033\U000E0034'
-    '\U000E0035\U000E0036\U000E0037\U000E0038\U000E0039\U000E003A\U000E003B'
-    '\U000E003C\U000E003D\U000E003E\U000E003F\U000E0040\U000E0041\U000E0042'
-    '\U000E0043\U000E0044\U000E0045\U000E0046\U000E0047\U000E0048\U000E0049'
-    '\U000E004A\U000E004B\U000E004C\U000E004D\U000E004E\U000E004F\U000E0050'
-    '\U000E0051\U000E0052\U000E0053\U000E0054\U000E0055\U000E0056\U000E0057'
-    '\U000E0058\U000E0059\U000E005A\U000E005B\U000E005C\U000E005D\U000E005E'
-    '\U000E005F\U000E0060\U000E0061\U000E0062\U000E0063\U000E0064\U000E0065'
-    '\U000E0066\U000E0067\U000E0068\U000E0069\U000E006A\U000E006B\U000E006C'
-    '\U000E006D\U000E006E\U000E006F\U000E0070\U000E0071\U000E0072\U000E0073'
-    '\U000E0074\U000E0075\U000E0076\U000E0077\U000E0078\U000E0079\U000E007A'
-    '\U000E007B\U000E007C\U000E007D\U000E007E\U000E007F')
-
+_category_cf = frozenset([
+    '\U000000ad', '\U00000600', '\U00000601', '\U00000602', '\U00000603',
+    '\U00000604', '\U00000605', '\U0000061c', '\U000006dd', '\U0000070f',
+    '\U000008e2', '\U0000180e', '\U0000200b', '\U0000200c', '\U0000200d',
+    '\U0000200e', '\U0000200f', '\U0000202a', '\U0000202b', '\U0000202c',
+    '\U0000202d', '\U0000202e', '\U00002060', '\U00002061', '\U00002062',
+    '\U00002063', '\U00002064', '\U00002066', '\U00002067', '\U00002068',
+    '\U00002069', '\U0000206a', '\U0000206b', '\U0000206c', '\U0000206d',
+    '\U0000206e', '\U0000206f', '\U0000feff', '\U0000fff9', '\U0000fffa',
+    '\U0000fffb', '\U000110bd', '\U000110cd', '\U0001bca0', '\U0001bca1',
+    '\U0001bca2', '\U0001bca3', '\U0001d173', '\U0001d174', '\U0001d175',
+    '\U0001d176', '\U0001d177', '\U0001d178', '\U0001d179', '\U0001d17a',
+    '\U000e0001', '\U000e0020', '\U000e0021', '\U000e0022', '\U000e0023',
+    '\U000e0024', '\U000e0025', '\U000e0026', '\U000e0027', '\U000e0028',
+    '\U000e0029', '\U000e002a', '\U000e002b', '\U000e002c', '\U000e002d',
+    '\U000e002e', '\U000e002f', '\U000e0030', '\U000e0031', '\U000e0032',
+    '\U000e0033', '\U000e0034', '\U000e0035', '\U000e0036', '\U000e0037',
+    '\U000e0038', '\U000e0039', '\U000e003a', '\U000e003b', '\U000e003c',
+    '\U000e003d', '\U000e003e', '\U000e003f', '\U000e0040', '\U000e0041',
+    '\U000e0042', '\U000e0043', '\U000e0044', '\U000e0045', '\U000e0046',
+    '\U000e0047', '\U000e0048', '\U000e0049', '\U000e004a', '\U000e004b',
+    '\U000e004c', '\U000e004d', '\U000e004e', '\U000e004f', '\U000e0050',
+    '\U000e0051', '\U000e0052', '\U000e0053', '\U000e0054', '\U000e0055',
+    '\U000e0056', '\U000e0057', '\U000e0058', '\U000e0059', '\U000e005a',
+    '\U000e005b', '\U000e005c', '\U000e005d', '\U000e005e', '\U000e005f',
+    '\U000e0060', '\U000e0061', '\U000e0062', '\U000e0063', '\U000e0064',
+    '\U000e0065', '\U000e0066', '\U000e0067', '\U000e0068', '\U000e0069',
+    '\U000e006a', '\U000e006b', '\U000e006c', '\U000e006d', '\U000e006e',
+    '\U000e006f', '\U000e0070', '\U000e0071', '\U000e0072', '\U000e0073',
+    '\U000e0074', '\U000e0075', '\U000e0076', '\U000e0077', '\U000e0078',
+    '\U000e0079', '\U000e007a', '\U000e007b', '\U000e007c', '\U000e007d',
+    '\U000e007e', '\U000e007f',
+])
 # This is a set of all invisible characters
 # At the moment we've only added the characters from the Cf category
-_invisible_chars = frozenset(_category_cf)
+_invisible_chars = _category_cf
-invisible_regex = re.compile('[' + _category_cf + ']')
+invisible_regex = LazyRegex(
+    lambda: '[' + ''.join(_invisible_chars) + ']'
+)
 def contains_invisible(text):
-- 
To view, visit https://gerrit.wikimedia.org/r/453786
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie55d3ba5f100e691f901f6be15e61425aef70795
Gerrit-Change-Number: 453786
Gerrit-PatchSet: 3
Gerrit-Owner: Dalba dalba.wiki@gmail.com
Gerrit-Reviewer: Dalba dalba.wiki@gmail.com
Gerrit-Reviewer: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: Zoranzoki21 zorandori4444@gmail.com
Gerrit-Reviewer: jenkins-bot (75)



    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

[Pywikibot-commits] [Gerrit] pywikibot/core[master]: Revert "pywikibot.tools.chars: Update and simplify the code"