jenkins-bot has submitted this change and it was merged.
Change subject: Fix textlib 'file' exception regex
Fix textlib 'file' exception regex
Also replace the custom 'file' exception regex in cosmetic_changes (cc) with the standard, now-fixed 'file' exception from textlib.
Change-Id: If2a0827df1a0f83afd3c23e57bb841929ed1399c --- M pywikibot/cosmetic_changes.py M pywikibot/textlib.py M tests/textlib_tests.py 3 files changed, 132 insertions(+), 9 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py index 8aabc09..e8f8972 100755 --- a/pywikibot/cosmetic_changes.py +++ b/pywikibot/cosmetic_changes.py @@ -833,6 +833,7 @@ return exceptions = [ 'gallery', + 'file', 'hyperlink', 'interwiki', # FIXME: but changes letters inside wikilinks @@ -856,16 +857,9 @@ new = digits.pop(self.site.code) # This only works if there are only two items in digits dict old = digits[digits.keys()[0]] - # do not change inside file links - namespaces = list(self.site.namespace(6, all=True)) - pattern = re.compile( - u'\[\[(%s):.+?\.\w+? *(\|((\[\[.*?\]\])|.)*)?\]\]' - % u'|'.join(namespaces), - re.UNICODE) # not to let bot edits in latin content exceptions.append(re.compile(u"[^%(fa)s] *?"*? *?, *?[^%(fa)s]" % {'fa': faChrs})) - exceptions.append(pattern) text = textlib.replaceExcept(text, u',', u'،', exceptions) if self.site.code == 'ckb': text = textlib.replaceExcept(text, diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 54d20eb..16f5222 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -99,6 +99,21 @@ (?P<unhandled_depth>{{\s*[^{|#0-9][^{|#]*?\s* [^{]* {{ .* }}) """, re.VERBOSE)
+# The following regex supports wikilinks anywhere after the first pipe +# and correctly matches the end of the file link if the wikilink contains +# [[ or ]]. +# The namespace names must be substituted into this regex. +# e.g. FILE_LINK_REGEX % 'File' or FILE_LINK_REGEX % '|'.join(site.namespaces) +FILE_LINK_REGEX = r""" +\[\[\s*(?:%s)\s*:[^|]*?\s* + (\| + ( \[\[ [^[]*? \[\[ [^]]*? \]\] [^]]*? \]\] # capture invalid syntax + | ( \[\[ .*? \]\] )? [^[]*? + | \[ [^]]*? \] + )* + )? +\]\] +"""
NON_LATIN_DIGITS = { 'ckb': u'٠١٢٣٤٥٦٧٨٩', @@ -180,7 +195,7 @@ 'category': ('\[\[ *(?:%s)\s*:.*?\]\]', lambda site: '|'.join(site.namespaces[14])), # files - 'file': ('\[\[ *(?:%s)\s*:.*?\]\]', + 'file': (FILE_LINK_REGEX, lambda site: '|'.join(site.namespaces[6])), })
@@ -208,7 +223,7 @@ if (exc, site) not in _regex_cache: re_text, re_var = _regex_cache[exc] _regex_cache[(exc, site)] = re.compile( - re_text % re_var(site)) + re_text % re_var(site), re.VERBOSE)
result.append(_regex_cache[(exc, site)]) else: diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index 762f62d..a1e51be 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -1134,17 +1134,131 @@ self.assertEqual(textlib.replaceExcept('{{#invoke:x}}', 'x', 'y', ['invoke'], site=self.site), '{{#invoke:x}}') + + def test_replace_tag_category(self): + """Test replacing not inside category links.""" for ns_name in self.site.namespaces[14]: self.assertEqual(textlib.replaceExcept('[[%s:x]]' % ns_name, 'x', 'y', ['category'], site=self.site), '[[%s:x]]' % ns_name) + + def test_replace_tag_file(self): + """Test replacing not inside file links.""" for ns_name in self.site.namespaces[6]: self.assertEqual(textlib.replaceExcept('[[%s:x]]' % ns_name, 'x', 'y', ['file'], site=self.site), '[[%s:x]]' % ns_name)
+ self.assertEqual( + textlib.replaceExcept( + '[[File:x|foo]]', + 'x', 'y', ['file'], site=self.site), + '[[File:x|foo]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:x|]]', + 'x', 'y', ['file'], site=self.site), + '[[File:x|]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:x|foo|bar x]] x', + 'x', 'y', ['file'], site=self.site), + '[[File:x|foo|bar x]] y') + + self.assertEqual( + textlib.replaceExcept( + '[[File:x|]][[File:x|foo]]', + 'x', 'y', ['file'], site=self.site), + '[[File:x|]][[File:x|foo]]') + + self.assertEqual( + textlib.replaceExcept( + '[[NonFile:x]]', + 'x', 'y', ['file'], site=self.site), + '[[NonFile:y]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:]]', + 'File:', 'NonFile:', ['file'], site=self.site), + '[[File:]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:x|[[foo]].]]', + 'x', 'y', ['file'], site=self.site), + '[[File:x|[[foo]].]]') + + # ensure only links inside file are captured + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]].x]][[y]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]][[bar]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]][[bar]].x]][[y]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]][[bar]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]][[bar]].x]][[y]]') + + # Correctly handle single brackets in the text. 
+ self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]] [bar].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]] [bar].x]][[y]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[bar] [[foo]] .x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[bar] [[foo]] .x]][[y]]') + + def test_replace_tag_file_invalid(self): + """Test replacing not inside file links with invalid titles.""" + # Correctly handle [ and ] inside wikilinks inside file link + # even though these are an invalid title. + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]] [[bar [invalid] ]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]] [[bar [invalid] ]].x]][[y]]') + + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]] [[bar [invalid ]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]] [[bar [invalid ]].x]][[y]]') + + # Even handle balanced [[ ]] inside the wikilink. + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]] [[bar [[invalid]] ]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]] [[bar [[invalid]] ]].x]][[y]]') + + @unittest.expectedFailure + def test_replace_tag_file_failure(self): + """Test showing limits of the file link regex.""" + # When the double brackets are unbalanced, the regex + # does not correctly detect the end of the file link. + self.assertEqual( + textlib.replaceExcept( + '[[File:a|[[foo]] [[bar [[invalid ]].x]][[x]]', + 'x', 'y', ['file'], site=self.site), + '[[File:a|[[foo]] [[bar [invalid] ]].x]][[y]]') + def test_replace_tags_interwiki(self): """Test replacing not inside interwiki links.""" if 'es' not in self.site.family.langs or 'ey' in self.site.family.langs: