jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/454853 )
Change subject: CosmeticChangesToolkit.removeEmptySections: Fix skipping of language links
CosmeticChangesToolkit.removeEmptySections: Fix skipping of language links
The 'interwiki' regex also matches the whitespace (including newlines) that follows a link, so stripping its matches can join the next section heading to the preceding line and break section detection. (T202629)
Instead of using the 'interwiki' regex, use the textlib.removeLanguageLinks function. Use marker='\n' argument to avoid the issue mentioned above.
Add a few tests for the fixed bug.
Do minor cleanups in textlib.py: - _ignore_case: Do not build a character class for a character that has no distinct upper/lower case form. - 'interwiki' regex pattern: Replace '[\s]*' with '\s*'.
Bug: T202629 Change-Id: I22b4ad92c7058ca65c6e3e6a680586dc1c2490a0 --- M pywikibot/cosmetic_changes.py M pywikibot/textlib.py M tests/cosmetic_changes_tests.py 3 files changed, 20 insertions(+), 8 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py index 3130081..ac4d548 100755 --- a/pywikibot/cosmetic_changes.py +++ b/pywikibot/cosmetic_changes.py @@ -663,12 +663,8 @@
def removeEmptySections(self, text): """Cleanup empty sections.""" - # comments, categories, and interwikis - skippings = ['comment', 'category', 'interwiki'] + skippings = ['comment', 'category'] skip_regexes = _get_regexes(skippings, self.site) - # we want only interwikis, not interlanguage links - skip_regexes[1] = re.compile( - skip_regexes[1].pattern.replace(':?', '')) # site defined templates skip_templates = { 'cs': ('Pahýl[ _]část',), # stub section @@ -681,7 +677,7 @@ skip_regexes.append(re.compile(r'(?m)^[*#] *$'))
# get stripped sections - stripped_text = text + stripped_text = textlib.removeLanguageLinks(text, self.site, '\n') for reg in skip_regexes: stripped_text = reg.sub(r'', stripped_text) strip_sections = textlib.extract_sections( diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 074ba72..c9d416b 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -251,7 +251,9 @@
def _ignore_case(string): """Return a case-insensitive pattern for the string.""" - return ''.join('[' + c.upper() + c.lower() + ']' for c in string) + return ''.join( + '[' + c + s + ']' if c != s else c + for s, c in zip(string, string.swapcase()))
def _tag_pattern(tag_name): @@ -282,7 +284,7 @@ 'hyperlink': compileLinkR(), # also finds links to foreign sites with preleading ":" 'interwiki': ( - r'[[:?(%s)\s?:[^]]*]][\s]*', + r'[[:?(%s)\s?:[^]]*]]\s*', lambda site: '|'.join( _ignore_case(i) for i in site.validLanguageLinks() + list(site.family.obsolete.keys()))), diff --git a/tests/cosmetic_changes_tests.py b/tests/cosmetic_changes_tests.py index d5354c0..a6024a2 100644 --- a/tests/cosmetic_changes_tests.py +++ b/tests/cosmetic_changes_tests.py @@ -332,6 +332,20 @@ self.cct.removeEmptySections('\n==Bar==\n[[cs:Foo]]' '\n[[Category:Baz]]'))
+ def test_remove_empty_sections_interlanguage_links(self): + """Test removeEmptySections with edge cases of language links.""" + # When removing language links, do not remove the \n after them, + # otherwise the sections won't be detected correctly. + text = 'text [[:en:link]]\n=== title1 ===\ncontent1' + self.assertEqual(text, self.cct.removeEmptySections(text)) + self.assertEqual( + 't [[en:link]]\n=== 1 ===\nc', + self.cct.removeEmptySections('t [[en:link]]\n=== 1 ===\nc')) + # Treat sections that only contain language links as empty sections. + self.assertEqual( + 't\n[[en:link]]', + self.cct.removeEmptySections('t\n=== 1 ===\n[[en:link]]')) + def test_remove_empty_sections_with_heading_comments(self): """Test removeEmptySections with comments in the section headings.""" self.assertEqual(
pywikibot-commits@lists.wikimedia.org