jenkins-bot merged this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
textlib._create_default_regexes: Avoid using inline flags

- Replace dots with [\s\S] instead of using (?s).
- Replace ^ with (?:(?<=\n)|\A) and replace $ with (?=\n|\Z) instead of
using (?m).
- Replace characters with a character class containing their uppercase
and lowercase forms instead of using (?i).
- During the rewrite most of the E241 errors were resolved. Fix the few
remaining ones and remove E241 from tox.ini ignore codes.
- Sort _regex_cache entries.
- Remove score, source, ref, pre, and gallery entries. They will be
created on demand during _get_regexes call.

Bug: T195538
Change-Id: Ibe2c9a35da8f2fcf4fe960c6f8447e65bea3eaea
---
M pywikibot/textlib.py
M tox.ini
2 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index dc844c4..1e29666 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -247,53 +247,66 @@
return lambda text: any(predicate(text) for predicate in predicates)


+def _ignore_case(string):
+ """Return a case-insensitive pattern for the string."""
+ return ''.join('[' + c.upper() + c.lower() + ']' for c in string)
+
+
+def _tag_pattern(tag_name):
+ """Return a tag pattern for the given tag name."""
+ return r'<{0}[ >][\s\S]*?</{0}\s*>'.format(_ignore_case(tag_name))
+
+
+def _tag_regex(tag_name):
+ """Return a compiled tag regex for the given tag name."""
+ return re.compile(_tag_pattern(tag_name))
+
+
def _create_default_regexes():
"""Fill (and possibly overwrite) _regex_cache with default regexes."""
_regex_cache.update({
- 'comment': re.compile(r'(?s)<!--.*?-->'),
+ # categories
+ 'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
+ lambda site: '|'.join(site.namespaces[14])),
+ 'comment': re.compile(r'<!--[\s\S]*?-->'),
+ # files
+ 'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])),
# section headers
- 'header': re.compile(r'(?m)^=+.+=+ *$'),
- # preformatted text
- 'pre': re.compile(r'(?is)<pre[ >].*?</pre\s*>'),
- 'source': re.compile(r'(?is)<source[ >].*?</source\s*>'),
- 'score': re.compile(r'(?is)<score[ >].*?</score\s*>'),
- # inline references
- 'ref': re.compile(r'(?is)<ref[ >].*?</ref>'),
- 'template': NESTED_TEMPLATE_REGEX,
+ 'header': re.compile(r'(?:(?<=\n)|\A)=+.+=+ *(?=\n|\Z)'),
+ # external links
+ 'hyperlink': compileLinkR(),
+ # also finds links to foreign sites with preleading ":"
+ 'interwiki': (
+ r'\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
+ lambda site: '|'.join(
+ _ignore_case(i) for i in site.validLanguageLinks()
+ + list(site.family.obsolete.keys()))),
+ # Module invocations (currently only Lua)
+ 'invoke': (
+ r'\{\{\s*\#(?:%s):[\s\S]*?\}\}',
+ lambda site: '|'.join(
+ _ignore_case(mw) for mw in site.getmagicwords('invoke'))),
+ # this matches internal wikilinks, but also interwiki, categories, and
+ # images.
+ 'link': re.compile(r'\[\[[^\]|]*(\|[^\]]*)?\]\]'),
+ # pagelist tag (used in Proofread extension).
+ 'pagelist': re.compile(r'<%s[\s\S]*?/>' % _ignore_case('pagelist')),
+ # Wikibase property inclusions
+ 'property': (
+ r'\{\{\s*\#(?:%s):\s*[Pp]\d+.*?\}\}',
+ lambda site: '|'.join(
+ _ignore_case(mw) for mw in site.getmagicwords('property'))),
+ # lines that start with a colon or more will be indented
+ 'startcolon': re.compile(r'(?:(?<=\n)|\A):(.*?)(?=\n|\Z)'),
# lines that start with a space are shown in a monospace font and
# have whitespace preserved.
- 'startspace': re.compile(r'(?m)^ (.*?)$'),
- # lines that start with a colon or more will be indented
- 'startcolon': re.compile(r'(?m)^:(.*?)$'),
+ 'startspace': re.compile(r'(?:(?<=\n)|\A) (.*?)(?=\n|\Z)'),
# tables often have whitespace that is used to improve wiki
# source code readability.
# TODO: handle nested tables.
- 'table': re.compile(r'(?ims)'
- r'^{\|.*?^\|}|<table[ >].*?</table\s*>'),
- 'hyperlink': compileLinkR(),
- 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery\s*>'),
- # this matches internal wikilinks, but also interwiki, categories, and
- # images.
- 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
- # also finds links to foreign sites with preleading ":"
- 'interwiki': (r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
- lambda site: '|'.join(
- site.validLanguageLinks()
- + list(site.family.obsolete.keys()))),
- # Wikibase property inclusions
- 'property': (r'(?i)\{\{\s*\#(?:%s):\s*p\d+.*?\}\}',
- lambda site: '|'.join(site.getmagicwords('property'))),
- # Module invocations (currently only Lua)
- 'invoke': (r'(?is)\{\{\s*\#(?:%s):.*?\}\}',
- lambda site: '|'.join(site.getmagicwords('invoke'))),
- # categories
- 'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
- lambda site: '|'.join(site.namespaces[14])),
- # files
- 'file': (FILE_LINK_REGEX,
- lambda site: '|'.join(site.namespaces[6])),
- # pagelist tag (used in Proofread extension).
- 'pagelist': re.compile(r'(?is)<pagelist.*?/>'),
+ 'table': re.compile(
+ r'(?:(?<=\n)|\A){\|[\S\s]*?\n\|}|%s' % _tag_pattern('table')),
+ 'template': NESTED_TEMPLATE_REGEX,
})


@@ -330,13 +343,11 @@
else:
# nowiki, noinclude, includeonly, timeline, math and other
# extensions
- _regex_cache[exc] = re.compile(
- r'(?is)<{0}\s*>.*?</{0}\s*>'.format(exc))
+ _regex_cache[exc] = _tag_regex(exc)
result.append(_regex_cache[exc])
# handle alias
if exc == 'source':
- dontTouchRegexes.append(re.compile(
- r'(?is)<syntaxhighlight[ >].*?</syntaxhighlight\s*>'))
+ dontTouchRegexes.append(_tag_regex('syntaxhighlight'))
else:
# assume it's a regular expression
dontTouchRegexes.append(exc)
diff --git a/tox.ini b/tox.ini
index 3dcde5f..275faf1 100644
--- a/tox.ini
+++ b/tox.ini
@@ -180,7 +180,7 @@
pywikibot/logging.py : N803
pywikibot/pagegenerators.py : N803, N806
pywikibot/specialbots.py : N803, N806
- pywikibot/textlib.py : E241, N801, N803, N806
+ pywikibot/textlib.py : N801, N803, N806
pywikibot/tools/ip.py : N803
pywikibot/userinterfaces/cgi_interface.py : N803
pywikibot/userinterfaces/gui.py : N812, N803, N806

To view, visit change 445851. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ibe2c9a35da8f2fcf4fe960c6f8447e65bea3eaea
Gerrit-Change-Number: 445851
Gerrit-PatchSet: 4
Gerrit-Owner: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444@gmail.com>
Gerrit-Reviewer: jenkins-bot