jenkins-bot merged this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

[bugfix] Remove multiple empty sections at once

+ Add textlib method to parse sections

Bug: T196324
Change-Id: Ic011f8be6ee64572e5b88a551ce18405d36c214d
---
M pywikibot/cosmetic_changes.py
M pywikibot/textlib.py
M tests/cosmetic_changes_tests.py
3 files changed, 184 insertions(+), 42 deletions(-)

diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index a01d7f2..c33cdd1 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -647,11 +647,13 @@

def removeEmptySections(self, text):
"""Cleanup empty sections."""
- exceptions = ['comment', 'pre', 'source', 'nowiki', 'code',
- 'startspace']
-
- skippings = ['comment']
+ # comments, categories, and interwikis
+ skippings = ['comment', 'category', 'interwiki']
skip_regexes = _get_regexes(skippings, self.site)
+ # we want only interwikis, not interlanguage links
+ skip_regexes[1] = re.compile(
+ skip_regexes[1].pattern.replace(':?', ''))
+ # site defined templates
skip_templates = {
'cs': ('Pahýl[ _]část',), # stub section
}
@@ -659,25 +661,33 @@
for template in skip_templates[self.site.code]:
skip_regexes.append(
re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
+ # empty lists
+ skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

+ # get stripped sections
stripped_text = text
for reg in skip_regexes:
stripped_text = reg.sub(r'', stripped_text)
+ strip_sections = textlib.extract_sections(
+ stripped_text, self.site)[1]

- stripped_pattern = re.compile(
- r'\n((=+) *[^\n=]+? *\2) *\n\s*(?=(\2 *[^\n=]+? *\2))')
- pos = 0
- while True:
- match = stripped_pattern.search(stripped_text[pos:])
- if not match:
- break
- pattern = re.compile(r'\n{}.+?(?={})'.format(
- match.group(1), match.group(3)), re.DOTALL)
- text = textlib.replaceExcept(text, pattern, r'\n',
- exceptions=exceptions)
- pos = match.end()
+ # get proper sections
+ header, sections, footer = textlib.extract_sections(text, self.site)

- return text
+ # iterate stripped sections and create a new page body
+ new_body = []
+ for i, strip_section in enumerate(strip_sections):
+ current_heading = sections[i][0]
+ try:
+ next_heading = sections[i + 1][0]
+ except IndexError:
+ next_heading = ''
+ current_dep = (len(current_heading)
+ - len(current_heading.lstrip('=')))
+ next_dep = len(next_heading) - len(next_heading.lstrip('='))
+ if strip_section[1].strip() or current_dep < next_dep:
+ new_body = new_body + list(sections[i])
+ return header + ''.join(new_body) + footer

def removeUselessSpaces(self, text):
"""Cleanup multiple or trailing spaces."""
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index e295783..e0468ae 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -37,6 +37,7 @@
unicode = str
else:
from HTMLParser import HTMLParser
+ from itertools import izip as zip

try:
import mwparserfromhell
@@ -820,6 +821,99 @@
return text


+# -------------------------------
+# Functions dealing with sections
+# -------------------------------
+def extract_sections(text, site=None):
+ """
+ Return section headings and contents found in text.
+
+ @return: The returned tuple contains the text parsed into three
+ parts: The first part is a string containing header part above
+ the first heading. The last part is also a string containing
+ footer part after the last section. The middle part is a list
+ of tuples, each tuple containing a string with section heading
+ and a string with section content. Example article::
+
+ '''A''' is a thing.
+
+ == History of A ==
+ Some history...
+
+ == Usage of A ==
+ Some usage...
+
+ [[Category:Things starting with A]]
+
+ ...is parsed into the following tuple::
+
+ (header, body, footer)
+ header = "'''A''' is a thing."
+ body = [('== History of A ==', 'Some history...'),
+ ('== Usage of A ==', 'Some usage...')]
+ footer = '[[Category:Things starting with A]]'
+
+ @rtype: tuple of (str, list of tuples, str)
+ """
+ headings = []
+ contents = []
+ body = []
+
+ # Find valid headings
+ heading_regex = _get_regexes(['header'], site)[0]
+ pos = 0
+ while True:
+ match = heading_regex.search(text[pos:])
+ if not match:
+ break
+ start = pos + match.start()
+ end = pos + match.end()
+ if not (isDisabled(text, start)
+ or isDisabled(text, end)):
+ headings += [(match.group(), start, end)]
+ pos = end
+
+ if headings:
+ # Assign them their contents
+ for i, current in enumerate(headings):
+ try:
+ following = headings[i + 1]
+ except IndexError:
+ following = None
+ if following:
+ contents.append(text[current[2]:following[1]])
+ else:
+ contents.append(text[current[2]:])
+ body = [(heading[0], section)
+ for heading, section in zip(headings, contents)]
+
+ # Find header and footer contents
+ header = text[:headings[0][1]] if headings else text
+
+ last_section = body[-1][1] if body else header
+ skippings = ['category', 'interwiki']
+ footer_regexes = _get_regexes(skippings, site)
+ # we want only interwikis, not interlanguage links
+ footer_regexes[1] = re.compile(
+ footer_regexes[1].pattern.replace(':?', ''))
+ # find where to cut
+ positions = []
+ for reg in footer_regexes:
+ match = reg.search(last_section)
+ if match:
+ positions.append(match.start())
+ pos = min(pos for pos in positions) if positions else len(last_section)
+
+ # Strip footer from last section content
+ last_section, footer = last_section[:pos], last_section[pos:]
+ if body:
+ body[-1] = (body[-1][0], last_section)
+ else:
+ header = last_section
+
+ return header, body, footer
+
+
# -----------------------------------------------
# Functions dealing with interwiki language links
# -----------------------------------------------
diff --git a/tests/cosmetic_changes_tests.py b/tests/cosmetic_changes_tests.py
index 7d8a6a2..b1c6361 100644
--- a/tests/cosmetic_changes_tests.py
+++ b/tests/cosmetic_changes_tests.py
@@ -65,31 +65,6 @@
'<code>&#32;</code>',
self.cct.resolveHtmlEntities('<code>&#32;</code>'))

- def test_removeEmptySections(self):
- """Test removeEmptySections method."""
- # same level
- self.assertEqual(
- '\n==Bar==',
- self.cct.removeEmptySections('\n== Foo ==\n\n==Bar=='))
- # different level
- self.assertEqual(
- '\n===Foo===\n\n==Bar==',
- self.cct.removeEmptySections('\n===Foo===\n\n==Bar=='))
- self.assertEqual(
- '\n==Foo==\n\n===Bar===',
- self.cct.removeEmptySections('\n==Foo==\n\n===Bar==='))
- # comment inside
- self.assertEqual(
- '\n==Bar==',
- self.cct.removeEmptySections('\n==Foo==\n<!-- Baz -->\n==Bar=='))
- # comments and content between
- testcase = '\n== Foo ==\n<!-- Baz -->\nBaz\n<!-- Foo -->\n== Bar =='
- self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
- # inside comment
- self.assertEqual(
- '<!--\n==Foo==\n\n==Bar==\n-->',
- self.cct.removeEmptySections('<!--\n==Foo==\n\n==Bar==\n-->'))
-
def test_removeUselessSpaces(self):
"""Test removeUselessSpaces method."""
self.assertEqual('Foo bar',
@@ -294,6 +269,69 @@

"""Test cosmetic_changes requiring a live wiki."""

+ def test_removeEmptySections(self):
+ """Test removeEmptySections method."""
+ content = '\nSome content'
+ # same level
+ self.assertEqual(
+ '\n==Bar==' + content,
+ self.cct.removeEmptySections('\n== Foo ==\n\n==Bar==' + content))
+ # different level
+ self.assertEqual(
+ '\n==Bar==' + content,
+ self.cct.removeEmptySections('\n===Foo===\n\n==Bar==' + content))
+ testcase = '\n==Foo==\n\n===Bar===' + content
+ self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+ # multiple empty sections
+ self.assertEqual(
+ '\n==Baz==' + content,
+ self.cct.removeEmptySections('\n==Foo==\n==Bar==\n==Baz=='
+ + content))
+ # comment inside
+ self.assertEqual(
+ '\n==Bar==' + content,
+ self.cct.removeEmptySections('\n==Foo==\n<!-- Baz -->\n==Bar=='
+ + content))
+ # comments and content between
+ testcase = ('\n== Foo ==\n<!-- Baz -->\nBaz\n<!-- Foo -->\n== Bar =='
+ + content)
+ self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+ # inside comment
+ testcase = '<!--\n==Foo==\n\n==Bar==\n-->' + content
+ self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+ testcase = '\n==Foo==\n<!--\n==Bar==\n-->' + content
+ self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+ testcase = '<!--\n==Foo==\n-->\n==Bar==' + content
+ self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+ # empty list item
+ self.assertEqual(
+ '\n==Baz==' + content,
+ self.cct.removeEmptySections('\n==Foo==\n*\n==Bar==\n#\n==Baz=='
+ + content))
+ self.assertEqual(
+ '\n==Baz==' + content,
+ self.cct.removeEmptySections('\n==Foo==\n* <!--item-->\n==Baz=='
+ + content))
+ testcase = '\n==Foo==\n* item\n==Bar==' + content
+ self.assertEqual(testcase, self.cct.removeEmptySections(testcase))
+ # empty first section
+ self.assertEqual(
+ '==Bar==' + content,
+ self.cct.removeEmptySections('==Foo==\n==Bar==' + content))
+ # empty last section
+ self.assertEqual(
+ '\n[[Category:Baz]]',
+ self.cct.removeEmptySections('\n==Bar==\n[[Category:Baz]]'))
+ # complicated
+ self.assertEqual(
+ '\n[[Category:Baz]]',
+ self.cct.removeEmptySections('\n==Bar==\n* <!--item-->'
+ '\n[[Category:Baz]]'))
+ self.assertEqual(
+ '\n[[cs:Foo]]\n[[Category:Baz]]',
+ self.cct.removeEmptySections('\n==Bar==\n[[cs:Foo]]'
+ '\n[[Category:Baz]]'))
+
def test_translateAndCapitalizeNamespaces(self):
"""Test translateAndCapitalizeNamespaces method."""
self.assertEqual(

To view, visit change 437227 in Gerrit. To unsubscribe, or for help writing mail filters, visit your Gerrit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ic011f8be6ee64572e5b88a551ce18405d36c214d
Gerrit-Change-Number: 437227
Gerrit-PatchSet: 11
Gerrit-Owner: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki@tools.wmflabs.org>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Rachmat.Wahidi <rachmatwahidi.site@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999@gmail.com>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444@gmail.com>
Gerrit-Reviewer: jenkins-bot