jenkins-bot merged this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
CosmeticChangesToolkit.removeEmptySections: Fix skipping of language links

The 'interwiki' regex also matches the trailing newlines and may therefore
mess up page sections. (T202629)

Instead of using the 'interwiki' regex, use the textlib.removeLanguageLinks
function. Use marker='\n' argument to avoid the issue mentioned above.

Add a few tests for the fixed bug.

Do minor cleanups in textlib.py:
- _ignore_case: No need to build a character class for a character that has no case variants.
- 'interwiki' regex pattern: replace '[\s]*' with '\s*'.

Bug: T202629
Change-Id: I22b4ad92c7058ca65c6e3e6a680586dc1c2490a0
---
M pywikibot/cosmetic_changes.py
M pywikibot/textlib.py
M tests/cosmetic_changes_tests.py
3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index 3130081..ac4d548 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -663,12 +663,8 @@

def removeEmptySections(self, text):
"""Cleanup empty sections."""
- # comments, categories, and interwikis
- skippings = ['comment', 'category', 'interwiki']
+ skippings = ['comment', 'category']
skip_regexes = _get_regexes(skippings, self.site)
- # we want only interwikis, not interlanguage links
- skip_regexes[1] = re.compile(
- skip_regexes[1].pattern.replace(':?', ''))
# site defined templates
skip_templates = {
'cs': ('Pahýl[ _]část',), # stub section
@@ -681,7 +677,7 @@
skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

# get stripped sections
- stripped_text = text
+ stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
for reg in skip_regexes:
stripped_text = reg.sub(r'', stripped_text)
strip_sections = textlib.extract_sections(
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 074ba72..c9d416b 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -251,7 +251,9 @@

def _ignore_case(string):
"""Return a case-insensitive pattern for the string."""
- return ''.join('[' + c.upper() + c.lower() + ']' for c in string)
+ return ''.join(
+ '[' + c + s + ']' if c != s else c
+ for s, c in zip(string, string.swapcase()))


def _tag_pattern(tag_name):
@@ -282,7 +284,7 @@
'hyperlink': compileLinkR(),
# also finds links to foreign sites with preleading ":"
'interwiki': (
- r'\[\[:?(%s)\s?:[^\]]*\]\][\s]*',
+ r'\[\[:?(%s)\s?:[^\]]*\]\]\s*',
lambda site: '|'.join(
_ignore_case(i) for i in site.validLanguageLinks()
+ list(site.family.obsolete.keys()))),
diff --git a/tests/cosmetic_changes_tests.py b/tests/cosmetic_changes_tests.py
index d5354c0..a6024a2 100644
--- a/tests/cosmetic_changes_tests.py
+++ b/tests/cosmetic_changes_tests.py
@@ -332,6 +332,20 @@
self.cct.removeEmptySections('\n==Bar==\n[[cs:Foo]]'
'\n[[Category:Baz]]'))

+ def test_remove_empty_sections_interlanguage_links(self):
+ """Test removeEmptySections with edge cases of language links."""
+ # When removing language links, do not remove the \n after them,
+ # otherwise the sections won't be detected correctly.
+ text = 'text [[:en:link]]\n=== title1 ===\ncontent1'
+ self.assertEqual(text, self.cct.removeEmptySections(text))
+ self.assertEqual(
+ 't [[en:link]]\n=== 1 ===\nc',
+ self.cct.removeEmptySections('t [[en:link]]\n=== 1 ===\nc'))
+ # Treat sections that only contain language links as empty sections.
+ self.assertEqual(
+ 't\n[[en:link]]',
+ self.cct.removeEmptySections('t\n=== 1 ===\n[[en:link]]'))
+
def test_remove_empty_sections_with_heading_comments(self):
"""Test removeEmptySections with comments in the section headings."""
self.assertEqual(

To view, visit change 454853. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I22b4ad92c7058ca65c6e3e6a680586dc1c2490a0
Gerrit-Change-Number: 454853
Gerrit-PatchSet: 9
Gerrit-Owner: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444@gmail.com>
Gerrit-Reviewer: jenkins-bot (75)