jenkins-bot submitted this change.

View Change


Approvals: Meno25: Looks good to me, but someone else must approve; Xqt: Looks good to me, approved; jenkins-bot: Verified
[FIX] Make header extraction more robust

Bug: T341787
Change-Id: I7799317194cd76b25ae56d2ccc4f06434b1b4987
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index a1fee36..39f8d50 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -270,7 +270,7 @@
# section headers
'header': re.compile(
r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*'
- r'=(?:[^\n]|<!--[\s\S]*?-->)+='
+ r'(=(?:[^\n]|<!--[\s\S]*?-->)+=)'
r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'),
# external links
'hyperlink': compileLinkR(),
@@ -933,7 +933,7 @@
# -------------------------------

#: Head pattern
-HEAD_PATTERN = re.compile('{0}[^=]+{0}'.format('(={1,6})'))
+HEAD_PATTERN = re.compile(r'(={1,6}).+\1', re.DOTALL)
TITLE_PATTERN = re.compile("'{3}([^']+)'{3}")

_Heading = namedtuple('_Heading', ('text', 'start', 'end'))
@@ -957,7 +957,7 @@
.. versionadded:: 8.2
"""
m = HEAD_PATTERN.match(self.title)
- return min(map(len, m.groups()))
+ return len(m[1])

@property
def heading(self) -> str:
@@ -998,9 +998,9 @@
headings = []
heading_regex = get_regexes('header')[0]
for match in heading_regex.finditer(text):
- start, end = match.span()
+ start, end = match.span(1)
if not isDisabled(text, start) and not isDisabled(text, end):
- headings.append(_Heading(match.group(), start, end))
+ headings.append(_Heading(match[1], start, end))
return headings


diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 6eeff21..b0a642b 100755
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1626,6 +1626,24 @@
[('====title====', '\n'), ('==title 2==', '\ncontent')],
)

+ def test_with_comments(self):
+ """Test section headers surrounded by comments."""
+ text = ('text\n\n'
+ '<!--\n multiline comment\n-->== title ==\n'
+ 'content\n\n'
+ '<!-- comment --> == not title ==\n'
+ 'foo\n\n'
+ '== title 2 == <!-- trailing comment -->\n'
+ 'content 2')
+ result = extract_sections(text, self.site)
+ self._extract_sections_tests(
+ result,
+ 'text\n\n<!--\n multiline comment\n-->',
+ [('== title ==',
+ '\ncontent\n\n<!-- comment --> == not title ==\nfoo\n\n'),
+ ('== title 2 ==', ' <!-- trailing comment -->\ncontent 2')]
+ )
+
def test_long_comment(self):
r"""Test for text having a long expanse of white space.

@@ -1640,8 +1658,21 @@
result = extract_sections(text, self.site)
self._extract_sections_tests(result, text, [], '')

+ def test_empty_header(self):
+ """Test empty section headers."""
+ text = ('text\n\n'
+ '== ==\n'
+ '=====\n'
+ '=== ===\n')
+ result = extract_sections(text, self.site)
+ self._extract_sections_tests(
+ result,
+ 'text\n\n',
+ [('== ==', '\n'), ('=====', '\n'), ('=== ===', '\n')]
+ )
+
def test_unbalanced_headers(self):
- """Test unbalances section headers."""
+ """Test unbalanced section headers."""
text = ('text\n\n'
'====title===\n'
'==title 2===\n'

To view, visit change 938357. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I7799317194cd76b25ae56d2ccc4f06434b1b4987
Gerrit-Change-Number: 938357
Gerrit-PatchSet: 6
Gerrit-Owner: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Meno25 <meno25mail@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged