jenkins-bot merged this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

[bugfix] Remove multiple empty sections at once

+ Add textlib method to parse sections

Bug: T196324
Change-Id: Ic011f8be6ee64572e5b88a551ce18405d36c214d
---
M pywikibot/cosmetic_changes.py
M pywikibot/textlib.py
M tests/cosmetic_changes_tests.py
3 files changed, 184 insertions(+), 42 deletions(-)

diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index a01d7f2..c33cdd1 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -647,11 +647,13 @@
 
     def removeEmptySections(self, text):
         """Cleanup empty sections."""
-        exceptions = ['comment', 'pre', 'source', 'nowiki', 'code',
-                      'startspace']
-
-        skippings = ['comment']
+        # comments, categories, and interwikis
+        skippings = ['comment', 'category', 'interwiki']
         skip_regexes = _get_regexes(skippings, self.site)
+        # we want only interwikis, not interlanguage links
+        skip_regexes[1] = re.compile(
+            skip_regexes[1].pattern.replace(':?', ''))
+        # site defined templates
         skip_templates = {
             'cs': ('Pahýl[ _]část',),  # stub section
         }
@@ -659,25 +661,33 @@
             for template in skip_templates[self.site.code]:
                 skip_regexes.append(
                     re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
+        # empty lists
+        skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))
 
+        # get stripped sections
         stripped_text = text
         for reg in skip_regexes:
             stripped_text = reg.sub(r'', stripped_text)
+        strip_sections = textlib.extract_sections(
+            stripped_text, self.site)[1]
 
-        stripped_pattern = re.compile(
-            r'\n((=+) *[^\n=]+? *\2) *\n\s*(?=(\2 *[^\n=]+? *\2))')
-        pos = 0
-        while True:
-            match = stripped_pattern.search(stripped_text[pos:])
-            if not match:
-                break
-            pattern = re.compile(r'\n{}.+?(?={})'.format(
-                match.group(1), match.group(3)), re.DOTALL)
-            text = textlib.replaceExcept(text, pattern, r'\n',
-                                         exceptions=exceptions)
-            pos = match.end()
+        # get proper sections
+        header, sections, footer = textlib.extract_sections(text, self.site)
 
-        return text
+        # iterate stripped sections and create a new page body
+        new_body = []
+        for i, strip_section in enumerate(strip_sections):
+            current_heading = sections[i][0]
+            try:
+                next_heading = sections[i + 1][0]
+            except IndexError:
+                next_heading = ''
+            current_dep = (len(current_heading)
+                           - len(current_heading.lstrip('=')))
+            next_dep = len(next_heading) - len(next_heading.lstrip('='))
+            if strip_section[1].strip() or current_dep < next_dep:
+                new_body = new_body + list(sections[i])
+        return header + ''.join(new_body) + footer
 
     def removeUselessSpaces(self, text):
         """Cleanup multiple or trailing spaces."""
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index e295783..e0468ae 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -37,6 +37,7 @@
     unicode = str
 else:
     from HTMLParser import HTMLParser
+    from itertools import izip as zip
 
 try:
     import mwparserfromhell
@@ -820,6 +821,99 @@
     return text
 
 
+# -------------------------------
+# Functions dealing with sections
+# -------------------------------
+def extract_sections(text, site=None):
+    """
+    Return section headings and contents found in text.
+
+    @return: The returned tuple contains the text parsed into three
+        parts: The first part is a string containing the header part
+        above the first heading. The last part is also a string: the
+        footer, which starts at the first category or interwiki link
+        found in the last section. The middle part is a list of tuples,
+        each with a section heading and its content. Example article::
+
+            '''A''' is a thing.
+
+            == History of A ==
+            Some history...
+
+            == Usage of A ==
+            Some usage...
+
+            [[Category:Things starting with A]]
+
+        ...is parsed into the following tuple::
+
+            (header, body, footer)
+            header = "'''A''' is a thing."
+            body = [('== History of A ==', 'Some history...'),
+                    ('== Usage of A ==', 'Some usage...')]
+            footer = '[[Category:Things starting with A]]'
+
+    @rtype: tuple of (str, list of tuples, str)
+    """
+    headings = []
+    contents = []
+    body = []
+
+    # Find valid headings
+    heading_regex = _get_regexes(['header'], site)[0]
+    pos = 0
+    while True:
+        match = heading_regex.search(text[pos:])
+        if not match:
+            break
+        start = pos + match.start()
+        end = pos + match.end()
+        if not (isDisabled(text, start)
+                or isDisabled(text, end)):
+            headings += [(match.group(), start, end)]
+        pos = end
+
+    if headings:
+        # Assign them their contents
+        for i, current in enumerate(headings):
+            try:
+                following = headings[i + 1]
+            except IndexError:
+                following = None
+            if following:
+                contents.append(text[current[2]:following[1]])
+            else:
+                contents.append(text[current[2]:])
+        body = [(heading[0], section)
+                for heading, section in zip(headings, contents)]
+
+    # Find header and footer contents
+    header = text[:headings[0][1]] if headings else text
+
+    last_section = body[-1][1] if body else header
+    skippings = ['category', 'interwiki']
+    footer_regexes = _get_regexes(skippings, site)
+    # we want only interwikis, not interlanguage links
+    footer_regexes[1] = re.compile(
+        footer_regexes[1].pattern.replace(':?', ''))
+    # find where to cut
+    positions = []
+    for reg in footer_regexes:
+        match = reg.search(last_section)
+        if match:
+            positions.append(match.start())
+    pos = min(pos for pos in positions) if positions else len(last_section)
+
+    # Strip footer from last section content
+    last_section, footer = last_section[:pos], last_section[pos:]
+    if body:
+        body[-1] = (body[-1][0], last_section)
+    else:
+        header = last_section
+
+    return header, body, footer
+
+
 # -----------------------------------------------
 # Functions dealing with interwiki language links
 # -----------------------------------------------
diff --git a/tests/cosmetic_changes_tests.py b/tests/cosmetic_changes_tests.py
index 7d8a6a2..b1c6361 100644
--- a/tests/cosmetic_changes_tests.py
+++ b/tests/cosmetic_changes_tests.py
@@ -65,31 +65,6 @@
             '<code>&#32;</code>',
             self.cct.resolveHtmlEntities('<code>&#32;</code>'))
 
-    def test_removeEmptySections(self):
-        """Test removeEmptySections method."""
-        # same level
-        self.assertEqual(
-            '\n==Bar==',
-            self.cct.removeEmptySections('\n== Foo ==\n\n==Bar=='))
-        # different level
-        self.assertEqual(
-            '\n===Foo===\n\n==Bar==',
-            self.cct.removeEmptySections('\n===Foo===\n\n==Bar=='))
-        self.assertEqual(
-            '\n==Foo==\n\n===Bar===',
-            self.cct.removeEmptySections('\n==Foo==\n\n===Bar==='))
-        # comment inside
-        self.assertEqual(
-            '\n==Bar==',
-            self.cct.removeEmptySections('\n==Foo==\n<!-- Baz -->\n==Bar=='))
-        # comments and content between
-        testcase = '\n== Foo ==\n<!-- Baz -->\nBaz\n<!-- Foo -->\n== Bar =='
-        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
-        # inside comment
-        self.assertEqual(
-            '<!--\n==Foo==\n\n==Bar==\n-->',
-            self.cct.removeEmptySections('<!--\n==Foo==\n\n==Bar==\n-->'))
-
     def test_removeUselessSpaces(self):
         """Test removeUselessSpaces method."""
         self.assertEqual('Foo bar',
@@ -294,6 +269,69 @@
 
     """Test cosmetic_changes requiring a live wiki."""
 
+    def test_removeEmptySections(self):
+        """Test removeEmptySections method."""
+        content = '\nSome content'
+        # same level
+        self.assertEqual(
+            '\n==Bar==' + content,
+            self.cct.removeEmptySections('\n== Foo ==\n\n==Bar==' + content))
+        # different level
+        self.assertEqual(
+            '\n==Bar==' + content,
+            self.cct.removeEmptySections('\n===Foo===\n\n==Bar==' + content))
+        testcase = '\n==Foo==\n\n===Bar===' + content
+        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+        # multiple empty sections
+        self.assertEqual(
+            '\n==Baz==' + content,
+            self.cct.removeEmptySections('\n==Foo==\n==Bar==\n==Baz=='
+                                         + content))
+        # comment inside
+        self.assertEqual(
+            '\n==Bar==' + content,
+            self.cct.removeEmptySections('\n==Foo==\n<!-- Baz -->\n==Bar=='
+                                         + content))
+        # comments and content between
+        testcase = ('\n== Foo ==\n<!-- Baz -->\nBaz\n<!-- Foo -->\n== Bar =='
+                    + content)
+        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+        # inside comment
+        testcase = '<!--\n==Foo==\n\n==Bar==\n-->' + content
+        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+        testcase = '\n==Foo==\n<!--\n==Bar==\n-->' + content
+        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+        testcase = '<!--\n==Foo==\n-->\n==Bar==' + content
+        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+        # empty list item
+        self.assertEqual(
+            '\n==Baz==' + content,
+            self.cct.removeEmptySections('\n==Foo==\n*\n==Bar==\n#\n==Baz=='
+                                         + content))
+        self.assertEqual(
+            '\n==Baz==' + content,
+            self.cct.removeEmptySections('\n==Foo==\n* <!--item-->\n==Baz=='
+                                         + content))
+        testcase = '\n==Foo==\n* item\n==Bar==' + content
+        self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+        # empty first section
+        self.assertEqual(
+            '==Bar==' + content,
+            self.cct.removeEmptySections('==Foo==\n==Bar==' + content))
+        # empty last section
+        self.assertEqual(
+            '\n[[Category:Baz]]',
+            self.cct.removeEmptySections('\n==Bar==\n[[Category:Baz]]'))
+        # complicated
+        self.assertEqual(
+            '\n[[Category:Baz]]',
+            self.cct.removeEmptySections('\n==Bar==\n* <!--item-->'
+                                         '\n[[Category:Baz]]'))
+        self.assertEqual(
+            '\n[[cs:Foo]]\n[[Category:Baz]]',
+            self.cct.removeEmptySections('\n==Bar==\n[[cs:Foo]]'
+                                         '\n[[Category:Baz]]'))
+
     def test_translateAndCapitalizeNamespaces(self):
         """Test translateAndCapitalizeNamespaces method."""
         self.assertEqual(

To view, visit change 437227. To unsubscribe, or for help writing mail filters, visit settings.