jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/938357 )
Change subject: [FIX] Make header extraction more robust
......................................................................
[FIX] Make header extraction more robust
Bug: T341787
Change-Id: I7799317194cd76b25ae56d2ccc4f06434b1b4987
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 47 insertions(+), 6 deletions(-)
Approvals:
  Meno25: Looks good to me, but someone else must approve
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index a1fee36..39f8d50 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -270,7 +270,7 @@ # section headers 'header': re.compile( r'(?:(?<=\n)|\A)(?:<!--[\s\S]*?-->)*' - r'=(?:[^\n]|<!--[\s\S]*?-->)+=' + r'(=(?:[^\n]|<!--[\s\S]*?-->)+=)' r' *(?:<!--[\s\S]*?--> *)*(?=\n|\Z)'), # external links 'hyperlink': compileLinkR(), @@ -933,7 +933,7 @@ # -------------------------------
#: Head pattern -HEAD_PATTERN = re.compile('{0}[^=]+{0}'.format('(={1,6})')) +HEAD_PATTERN = re.compile(r'(={1,6}).+\1', re.DOTALL) TITLE_PATTERN = re.compile("'{3}([^']+)'{3}")
_Heading = namedtuple('_Heading', ('text', 'start', 'end')) @@ -957,7 +957,7 @@ .. versionadded:: 8.2 """ m = HEAD_PATTERN.match(self.title) - return min(map(len, m.groups())) + return len(m[1])
@property def heading(self) -> str: @@ -998,9 +998,9 @@ headings = [] heading_regex = get_regexes('header')[0] for match in heading_regex.finditer(text): - start, end = match.span() + start, end = match.span(1) if not isDisabled(text, start) and not isDisabled(text, end): - headings.append(_Heading(match.group(), start, end)) + headings.append(_Heading(match[1], start, end)) return headings
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index 6eeff21..b0a642b 100755 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -1626,6 +1626,24 @@ [('====title====', '\n'), ('==title 2==', '\ncontent')], )
+ def test_with_comments(self): + """Test section headers surrounded by comments.""" + text = ('text\n\n' + '<!--\n multiline comment\n-->== title ==\n' + 'content\n\n' + '<!-- comment --> == not title ==\n' + 'foo\n\n' + '== title 2 == <!-- trailing comment -->\n' + 'content 2') + result = extract_sections(text, self.site) + self._extract_sections_tests( + result, + 'text\n\n<!--\n multiline comment\n-->', + [('== title ==', + '\ncontent\n\n<!-- comment --> == not title ==\nfoo\n\n'), + ('== title 2 ==', ' <!-- trailing comment -->\ncontent 2')] + ) + def test_long_comment(self): r"""Test for text having a long expanse of white space.
@@ -1640,8 +1658,21 @@ result = extract_sections(text, self.site) self._extract_sections_tests(result, text, [], '')
+ def test_empty_header(self): + """Test empty section headers.""" + text = ('text\n\n' + '== ==\n' + '=====\n' + '=== ===\n') + result = extract_sections(text, self.site) + self._extract_sections_tests( + result, + 'text\n\n', + [('== ==', '\n'), ('=====', '\n'), ('=== ===', '\n')] + ) + def test_unbalanced_headers(self): - """Test unbalances section headers.""" + """Test unbalanced section headers.""" text = ('text\n\n' '====title===\n' '==title 2===\n'
pywikibot-commits@lists.wikimedia.org