jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/930899 )
Change subject: [IMPR] improvement for textlib.extract_sections() function ......................................................................
[IMPR] improvement for textlib.extract_sections() function
- make _Content and _Section public classes - the Content namedtuple which is the result of textlib.extract_sections() function has an additional property 'title' which is the first main title found in header (anything enclosed within triple quotes) - the Section namedtuple of the sections list comes with a level property which indicates the level of the section header (usually 2-6) - the Section namedtuple has also a heading property which gives the stripped section title. - simplify _extract_sections helper function - simplify cosmetic_changes module and archivebot script - update tests
Bug: T338748 Change-Id: I0cd3ac16fc7f3dce1b87b9b67f02766caec99aa6 --- M pywikibot/textlib.py M pywikibot/cosmetic_changes.py M tests/textlib_tests.py M scripts/archivebot.py 4 files changed, 199 insertions(+), 62 deletions(-)
Approvals: Matěj Suchánek: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py index bf0063c..ffd43a5 100644 --- a/pywikibot/cosmetic_changes.py +++ b/pywikibot/cosmetic_changes.py @@ -699,7 +699,7 @@ for reg in skip_regexes: stripped_text = reg.sub(r'', stripped_text) strip_sections = textlib.extract_sections( - stripped_text, self.site)[1] + stripped_text, self.site).sections
# get proper sections header, sections, footer = textlib.extract_sections(text, self.site) @@ -707,15 +707,13 @@ # iterate stripped sections and create a new page body new_body = [] for i, strip_section in enumerate(strip_sections): - current_heading = sections[i][0] + current_dep = sections[i].level try: - next_heading = sections[i + 1][0] + next_dep = sections[i + 1].level except IndexError: - next_heading = '' - current_dep = (len(current_heading) - - len(current_heading.lstrip('='))) - next_dep = len(next_heading) - len(next_heading.lstrip('=')) - if strip_section[1].strip() or current_dep < next_dep: + next_dep = 0 + + if strip_section.content.strip() or current_dep < next_dep: new_body.extend(sections[i]) return header + ''.join(new_body) + footer
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index a81abb1..9078b3c 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -933,12 +933,69 @@ # ------------------------------- # Functions dealing with sections # ------------------------------- + +#: Head pattern +HEAD_PATTERN = re.compile('{0}[^=]+{0}'.format('(={1,6})')) +TITLE_PATTERN = re.compile("'{3}([^']+)'{3}") + _Heading = namedtuple('_Heading', ('text', 'start', 'end')) -_Section = namedtuple('_Section', ('title', 'content')) -_Content = namedtuple('_Content', ('header', 'sections', 'footer'))
-def _extract_headings(text: str) -> list: +class Section(NamedTuple): + + """A namedtuple as part of :class:`Content` describing a page section. + + .. versionchanged:: 8.2 + ``_Section`` becomes a public class. + """ + + title: str #: section title including equal signs + content: str #: section content + + @property + def level(self) -> int: + """Return the section level. + + .. versionadded:: 8.2 + """ + m = HEAD_PATTERN.match(self.title) + return min(map(len, m.groups())) + + @property + def heading(self) -> str: + """Return the section title without equal signs. + + .. versionadded:: 8.2 + """ + level = self.level + return self.title[level:-level].strip() + + +class Content(NamedTuple): + + """A namedtuple as result of :func:`extract_sections` holding page content. + + .. versionchanged:: 8.2 + ``_Content`` becomes a public class. + """ + + header: str #: the page header + sections: List[Section] #: the page sections + footer: str #: the page footer + + @property + def title(self) -> str: + """Return the first main title found on the page. + + The first main title is anything enclosed within triple quotes. + + .. versionadded:: 8.2 + """ + m = TITLE_PATTERN.search(self.header) + return m[1].strip() if m else '' + + +def _extract_headings(text: str) -> List[_Heading]: """Return _Heading objects.""" headings = [] heading_regex = get_regexes('header')[0] @@ -949,59 +1006,90 @@ return headings
-def _extract_sections(text: str, headings) -> list: - """Return _Section objects.""" +def _extract_sections(text: str, headings) -> List[Section]: + """Return a list of :class:`Section` objects.""" + sections = [] if headings: # Assign them their contents - contents = [] for i, heading in enumerate(headings): try: next_heading = headings[i + 1] except IndexError: - contents.append(text[heading.end:]) + content = text[heading.end:] else: - contents.append(text[heading.end:next_heading.start]) - return [_Section(heading.text, content) - for heading, content in zip(headings, contents)] - return [] + content = text[heading.end:next_heading.start] + sections.append(Section(heading.text, content)) + + return sections
def extract_sections( - text: str, site=None -) -> NamedTuple('_Content', [('header', str), # noqa: F821 - ('sections', List[Tuple[str, str]]), # noqa: F821 - ('footer', str)]): # noqa: F821 - """ - Return section headings and contents found in text. + text: str, + site: Optional['pywikibot.site.BaseSite'] = None +) -> Content: + """Return section headings and contents found in text.
- :return: The returned namedtuple contains the text parsed into - header, contents and footer parts: The header part is a string - containing text part above the first heading. The footer part - is also a string containing text part after the last section. - The section part is a list of tuples, each tuple containing a - string with section heading and a string with section content. - Example article:: + The returned namedtuple :class:`Content` contains the text parsed + into *header*, *sections* and *footer* parts. The main title found + in the header which is the first text enclosed with ''' like + '''page title''' can be given by the *title* property.
- '''A''' is a thing. + The header part is a string containing text part above the first + heading.
- == History of A == - Some history... + The sections part is a list of :class:`Section` namedtuples, each + tuple containing a string with section title (including equal signs), + and a string with the section content. In addition the section + heading (the title without equal signs) can be given by the *heading* + property. Also the section level can be found by the *level* + property which is the number of the equal signs around the section + heading.
- == Usage of A == - Some usage... + The footer part is also a string containing text part after the last + section.
- [[Category:Things starting with A]] + **Examples:**
- ...is parsed into the following namedtuple:: + >>> text = """ + ... '''this''' is a Python module. + ... + ... == History of this == + ... This set of principles was posted in 1999... + ... + ... == Usage of this == + ... Enter "import this" for usage... + ... + ... === Details === + ... The Zen of Python... + ... + ... [[Category:Programming principles]] + ... """ + >>> site = pywikibot.Site('wikipedia:en') + >>> result = extract_sections(text, site) + >>> result.header.strip() + "'''this''' is a Python module." + >>> result.sections[0].title + '== History of this ==' + >>> result.sections[1].content.strip() + 'Enter "import this" for usage...' + >>> result.sections[2].heading + 'Details' + >>> result.sections[2].level + 3 + >>> result.footer.strip() + '[[Category:Programming principles]]' + >>> result.title + 'this'
- result = extract_sections(text, site) - result.header = "'''A''' is a thing." - result.sections = [('== History of A ==', 'Some history...'), - ('== Usage of A ==', 'Some usage...')] - result.footer = '[[Category:Things starting with A]]' - + .. note:: sections and text from templates are not extracted but + embedded as plain text. .. versionadded:: 3.0 - """ + .. versionchanged:: 8.2 + The :class:`Content` and :class:`Section` class have additional + properties. + + :return: The parsed namedtuple. + """ # noqa: D300, D301 headings = _extract_headings(text) sections = _extract_sections(text, headings) # Find header and footer contents @@ -1013,13 +1101,15 @@ r'({})*\Z'.format(r'|'.join((langlink_pattern, cat_regex.pattern, r'\s'))), last_section_content).group().lstrip() + if footer: if sections: - sections[-1] = _Section( + sections[-1] = Section( sections[-1].title, last_section_content[:-len(footer)]) else: header = header[:-len(footer)] - return _Content(header, sections, footer) + + return Content(header, sections, footer)
# ----------------------------------------------- diff --git a/scripts/archivebot.py b/scripts/archivebot.py index ca1aac9..76acb3a 100755 --- a/scripts/archivebot.py +++ b/scripts/archivebot.py @@ -395,11 +395,10 @@ else: self.header = header + footer
- for thread_heading, thread_content in threads: - cur_thread = DiscussionThread(thread_heading.strip('= '), - self.timestripper) + for thread in threads: + cur_thread = DiscussionThread(thread.heading, self.timestripper) # remove heading line - _, *lines = thread_content.replace(marker, '').splitlines() + _, *lines = thread.content.replace(marker, '').splitlines() for line in lines: cur_thread.feed_line(line) self.threads.append(cur_thread) diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index a003295..6eeff21 100755 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -1551,18 +1551,21 @@
"""Test the extract_sections function."""
- def _extract_sections_tests(self, result, header, sections, footer): + def _extract_sections_tests(self, result, header, sections, footer='', + title=''): """Test extract_sections function.""" self.assertIsInstance(result, tuple) self.assertIsInstance(result.sections, list) - self.assertEqual(result, (header, sections, footer)) self.assertEqual(result.header, header) self.assertEqual(result.sections, sections) self.assertEqual(result.footer, footer) - if result.sections: - for section in sections: - self.assertIsInstance(section, tuple) - self.assertLength(section, 2) + self.assertEqual(result.title, title) + self.assertEqual(result, (header, sections, footer)) + for section in result.sections: + self.assertIsInstance(section, tuple) + self.assertLength(section, 2) + self.assertIsInstance(section.level, int) + self.assertEqual(section.title.count('=') // 2, section.level)
def test_no_sections_no_footer(self): """Test for text having no sections or footer.""" @@ -1583,7 +1586,7 @@ 'content') result = extract_sections(text, self.site) self._extract_sections_tests( - result, 'text\n\n', [('==title==', '\ncontent')], '') + result, 'text\n\n', [('==title==', '\ncontent')])
def test_with_section_with_footer(self): """Test for text having sections and footer.""" @@ -1607,8 +1610,8 @@ self._extract_sections_tests( result, 'text\n\n', - [('=first level=', '\nfoo\n'), ('==title==', '\nbar')], - '') + [('=first level=', '\nfoo\n'), ('==title==', '\nbar')] + )
def test_with_h4_and_h2_sections(self): """Test for text having h4 and h2 sections.""" @@ -1621,7 +1624,7 @@ result, 'text\n\n', [('====title====', '\n'), ('==title 2==', '\ncontent')], - '') + )
def test_long_comment(self): r"""Test for text having a long expanse of white space. @@ -1637,6 +1640,30 @@ result = extract_sections(text, self.site) self._extract_sections_tests(result, text, [], '')
+ def test_unbalanced_headers(self): + """Test unbalances section headers.""" + text = ('text\n\n' + '====title===\n' + '==title 2===\n' + 'content') + result = extract_sections(text, self.site) + self._extract_sections_tests( + result, + 'text\n\n', + [('====title===', '\n'), ('==title 2===', '\ncontent')], + ) + + def test_title(self): + """Test title.""" + text = "Test ''' Pywikibot ''' title." + result = extract_sections(text, self.site) + self._extract_sections_tests( + result, + "Test ''' Pywikibot ''' title.", + [], + title='Pywikibot' + ) +
if __name__ == '__main__': # pragma: no cover with suppress(SystemExit):