jenkins-bot has submitted this change and it was merged.
Change subject: Fix textlib 'file' exception regex
......................................................................
Fix textlib 'file' exception regex
Also replace the custom 'file' exception in cc
with the standard fixed exception in textlib.
Change-Id: If2a0827df1a0f83afd3c23e57bb841929ed1399c
---
M pywikibot/cosmetic_changes.py
M pywikibot/textlib.py
M tests/textlib_tests.py
3 files changed, 132 insertions(+), 9 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index 8aabc09..e8f8972 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -833,6 +833,7 @@
return
exceptions = [
'gallery',
+ 'file',
'hyperlink',
'interwiki',
# FIXME: but changes letters inside wikilinks
@@ -856,16 +857,9 @@
new = digits.pop(self.site.code)
# This only works if there are only two items in digits dict
old = digits[digits.keys()[0]]
- # do not change inside file links
- namespaces = list(self.site.namespace(6, all=True))
- pattern = re.compile(
- u'\\[\\[(%s):.+?\\.\\w+? *(\\|((\\[\\[.*?\\]\\])|.)*)?\\]\\]'
- % u'|'.join(namespaces),
- re.UNICODE)
# not to let bot edits in latin content
exceptions.append(re.compile(u"[^%(fa)s] *?\"*? *?, *?[^%(fa)s]"
% {'fa': faChrs}))
- exceptions.append(pattern)
text = textlib.replaceExcept(text, u',', u'،', exceptions)
if self.site.code == 'ckb':
text = textlib.replaceExcept(text,
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 54d20eb..16f5222 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -99,6 +99,21 @@
(?P<unhandled_depth>{{\s*[^{\|#0-9][^{\|#]*?\s* [^{]* {{ .* }})
""", re.VERBOSE)
+# The following regex supports wikilinks anywhere after the first pipe
+# and correctly matches the end of the file link if the wikilink contains
+# [[ or ]].
+# The namespace names must be substituted into this regex.
+# e.g. FILE_LINK_REGEX % 'File' or FILE_LINK_REGEX % '|'.join(site.namespaces)
+FILE_LINK_REGEX = r"""
+\[\[\s*(?:%s)\s*:[^|]*?\s*
+ (\|
+ ( \[\[ [^[]*? \[\[ [^]]*? \]\] [^]]*? \]\] # capture invalid syntax
+ | ( \[\[ .*? \]\] )? [^[]*?
+ | \[ [^]]*? \]
+ )*
+ )?
+\]\]
+"""
NON_LATIN_DIGITS = {
'ckb': u'٠١٢٣٤٥٦٧٨٩',
@@ -180,7 +195,7 @@
'category': ('\[\[ *(?:%s)\s*:.*?\]\]',
lambda site: '|'.join(site.namespaces[14])),
# files
- 'file': ('\[\[ *(?:%s)\s*:.*?\]\]',
+ 'file': (FILE_LINK_REGEX,
lambda site: '|'.join(site.namespaces[6])),
})
@@ -208,7 +223,7 @@
if (exc, site) not in _regex_cache:
re_text, re_var = _regex_cache[exc]
_regex_cache[(exc, site)] = re.compile(
- re_text % re_var(site))
+ re_text % re_var(site), re.VERBOSE)
result.append(_regex_cache[(exc, site)])
else:
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 762f62d..a1e51be 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1134,17 +1134,131 @@
self.assertEqual(textlib.replaceExcept('{{#invoke:x}}', 'x',
'y',
['invoke'], site=self.site),
'{{#invoke:x}}')
+
+ def test_replace_tag_category(self):
+ """Test replacing not inside category links."""
for ns_name in self.site.namespaces[14]:
self.assertEqual(textlib.replaceExcept('[[%s:x]]' % ns_name,
'x', 'y',
['category'],
site=self.site),
'[[%s:x]]' % ns_name)
+
+ def test_replace_tag_file(self):
+ """Test replacing not inside file links."""
for ns_name in self.site.namespaces[6]:
self.assertEqual(textlib.replaceExcept('[[%s:x]]' % ns_name,
'x', 'y',
['file'],
site=self.site),
'[[%s:x]]' % ns_name)
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:x|foo]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:x|foo]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:x|]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:x|]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:x|foo|bar x]] x',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:x|foo|bar x]] y')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:x|]][[File:x|foo]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:x|]][[File:x|foo]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[NonFile:x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[NonFile:y]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:]]',
+ 'File:', 'NonFile:', ['file'], site=self.site),
+ '[[File:]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:x|[[foo]].]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:x|[[foo]].]]')
+
+ # ensure only links inside file are captured
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]].x]][[y]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]][[bar]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]][[bar]].x]][[y]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]][[bar]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]][[bar]].x]][[y]]')
+
+ # Correctly handle single brackets in the text.
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]] [bar].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]] [bar].x]][[y]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[bar] [[foo]] .x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[bar] [[foo]] .x]][[y]]')
+
+ def test_replace_tag_file_invalid(self):
+ """Test replacing not inside file links with invalid titles."""
+ # Correctly handle [ and ] inside wikilinks inside file link
+ # even though these are an invalid title.
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]] [[bar [invalid] ]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]] [[bar [invalid] ]].x]][[y]]')
+
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]] [[bar [invalid ]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]] [[bar [invalid ]].x]][[y]]')
+
+ # Even handle balanced [[ ]] inside the wikilink.
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]] [[bar [[invalid]] ]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]] [[bar [[invalid]] ]].x]][[y]]')
+
+ @unittest.expectedFailure
+ def test_replace_tag_file_failure(self):
+ """Test showing limits of the file link regex."""
+ # When the double brackets are unbalanced, the regex
+ # does not correctly detect the end of the file link.
+ self.assertEqual(
+ textlib.replaceExcept(
+ '[[File:a|[[foo]] [[bar [[invalid ]].x]][[x]]',
+ 'x', 'y', ['file'], site=self.site),
+ '[[File:a|[[foo]] [[bar [invalid] ]].x]][[y]]')
+
def test_replace_tags_interwiki(self):
"""Test replacing not inside interwiki links."""
if 'es' not in self.site.family.langs or 'ey' in self.site.family.langs:
--
To view, visit
https://gerrit.wikimedia.org/r/246802
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If2a0827df1a0f83afd3c23e57bb841929ed1399c
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>