jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/930899 )
Change subject: [IMPR] improvement for textlib.extract_sections() function
......................................................................
[IMPR] improvement for textlib.extract_sections() function
- make _Content and _Section public classes
- the Content namedtuple, which is the result of the textlib.extract_sections()
  function, has an additional 'title' property holding the first main title
  found in the header (anything enclosed within triple quotes)
- the Section namedtuple of the sections list comes with a 'level' property
  which indicates the level of the section header (usually 2-6)
- the Section namedtuple also has a 'heading' property which gives the
  stripped section title
- simplify the _extract_sections helper function
- simplify the cosmetic_changes module and the archivebot script
- update tests
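A minimal usage sketch of the properties listed above; the wikitext, the
site and the expected values are made-up illustrations, not part of the
change itself:

    >>> import pywikibot
    >>> from pywikibot import textlib
    >>> site = pywikibot.Site('wikipedia:en')
    >>> text = "'''Lorem''' ipsum.\n\n== History ==\nSome text.\n"
    >>> result = textlib.extract_sections(text, site)
    >>> result.title                # first text enclosed in triple quotes
    'Lorem'
    >>> result.sections[0].level    # number of equal signs around the heading
    2
    >>> result.sections[0].heading  # section title without the equal signs
    'History'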
Bug: T338748
Change-Id: I0cd3ac16fc7f3dce1b87b9b67f02766caec99aa6
---
M pywikibot/textlib.py
M pywikibot/cosmetic_changes.py
M tests/textlib_tests.py
M scripts/archivebot.py
4 files changed, 199 insertions(+), 62 deletions(-)

Approvals:
  Matěj Suchánek: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index bf0063c..ffd43a5 100644
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -699,7 +699,7 @@
         for reg in skip_regexes:
             stripped_text = reg.sub(r'', stripped_text)
         strip_sections = textlib.extract_sections(
-            stripped_text, self.site)[1]
+            stripped_text, self.site).sections

         # get proper sections
         header, sections, footer = textlib.extract_sections(text, self.site)
@@ -707,15 +707,13 @@
         # iterate stripped sections and create a new page body
         new_body = []
         for i, strip_section in enumerate(strip_sections):
-            current_heading = sections[i][0]
+            current_dep = sections[i].level
             try:
-                next_heading = sections[i + 1][0]
+                next_dep = sections[i + 1].level
             except IndexError:
-                next_heading = ''
-            current_dep = (len(current_heading)
-                           - len(current_heading.lstrip('=')))
-            next_dep = len(next_heading) - len(next_heading.lstrip('='))
-            if strip_section[1].strip() or current_dep < next_dep:
+                next_dep = 0
+
+            if strip_section.content.strip() or current_dep < next_dep:
                 new_body.extend(sections[i])
         return header + ''.join(new_body) + footer
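The new Section.level property replaces the manual counting of leading equal
signs, and the next_dep = 0 fallback matches the old behaviour where an empty
next_heading also produced a depth of 0. A rough sanity check of the
equivalence for a balanced heading, using plain re with the same pattern as
the HEAD_PATTERN added in textlib.py below:

    >>> title = '== Usage =='
    >>> len(title) - len(title.lstrip('='))           # old depth computation
    2
    >>> import re
    >>> m = re.match('(={1,6})[^=]+(={1,6})', title)  # what Section.level uses
    >>> min(map(len, m.groups()))
    2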
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index a81abb1..9078b3c 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -933,12 +933,69 @@
 # -------------------------------
 # Functions dealing with sections
 # -------------------------------
+
+#: Head pattern
+HEAD_PATTERN = re.compile('{0}[^=]+{0}'.format('(={1,6})'))
+TITLE_PATTERN = re.compile("'{3}([^']+)'{3}")
+
 _Heading = namedtuple('_Heading', ('text', 'start', 'end'))
-_Section = namedtuple('_Section', ('title', 'content'))
-_Content = namedtuple('_Content', ('header', 'sections', 'footer'))


-def _extract_headings(text: str) -> list:
+class Section(NamedTuple):
+
+    """A namedtuple as part of :class:`Content` describing a page section.
+
+    .. versionchanged:: 8.2
+       ``_Section`` becomes a public class.
+    """
+
+    title: str  #: section title including equal signs
+    content: str  #: section content
+
+    @property
+    def level(self) -> int:
+        """Return the section level.
+
+        .. versionadded:: 8.2
+        """
+        m = HEAD_PATTERN.match(self.title)
+        return min(map(len, m.groups()))
+
+    @property
+    def heading(self) -> str:
+        """Return the section title without equal signs.
+
+        .. versionadded:: 8.2
+        """
+        level = self.level
+        return self.title[level:-level].strip()
+
+
+class Content(NamedTuple):
+
+    """A namedtuple as result of :func:`extract_sections` holding page content.
+
+    .. versionchanged:: 8.2
+       ``_Content`` becomes a public class.
+    """
+
+    header: str  #: the page header
+    sections: List[Section]  #: the page sections
+    footer: str  #: the page footer
+
+    @property
+    def title(self) -> str:
+        """Return the first main title found on the page.
+
+        The first main title is anything enclosed within triple quotes.
+
+        .. versionadded:: 8.2
+        """
+        m = TITLE_PATTERN.search(self.header)
+        return m[1].strip() if m else ''
+
+
+def _extract_headings(text: str) -> List[_Heading]:
     """Return _Heading objects."""
     headings = []
     heading_regex = get_regexes('header')[0]
@@ -949,59 +1006,90 @@
     return headings


-def _extract_sections(text: str, headings) -> list:
-    """Return _Section objects."""
+def _extract_sections(text: str, headings) -> List[Section]:
+    """Return a list of :class:`Section` objects."""
+    sections = []
     if headings:
         # Assign them their contents
-        contents = []
         for i, heading in enumerate(headings):
             try:
                 next_heading = headings[i + 1]
             except IndexError:
-                contents.append(text[heading.end:])
+                content = text[heading.end:]
             else:
-                contents.append(text[heading.end:next_heading.start])
+                content = text[heading.end:next_heading.start]
-        return [_Section(heading.text, content)
-                for heading, content in zip(headings, contents)]
-    return []
+            sections.append(Section(heading.text, content))
+
+    return sections


 def extract_sections(
-    text: str, site=None
-) -> NamedTuple('_Content', [('header', str),  # noqa: F821
-                             ('sections', List[Tuple[str, str]]),  # noqa: F821
-                             ('footer', str)]):  # noqa: F821
-    """
-    Return section headings and contents found in text.
+    text: str,
+    site: Optional['pywikibot.site.BaseSite'] = None
+) -> Content:
+    """Return section headings and contents found in text.

-    :return: The returned namedtuple contains the text parsed into
-        header, contents and footer parts: The header part is a string
-        containing text part above the first heading. The footer part
-        is also a string containing text part after the last section.
-        The section part is a list of tuples, each tuple containing a
-        string with section heading and a string with section content.
-        Example article::
+    The returned namedtuple :class:`Content` contains the text parsed
+    into *header*, *sections* and *footer* parts. The main title found
+    in the header which is the first text enclosed with ''' like
+    '''page title''' can be given by the *title* property.

-        '''A''' is a thing.
+    The header part is a string containing text part above the first
+    heading.

-        == History of A ==
-        Some history...
+    The sections part is a list of :class:`Section` namedtuples, each
+    tuple containing a string with section title (including equal signs),
+    and a string with the section content. In addition the section
+    heading (the title without equal signs) can be given by the *heading*
+    property. Also the section level can be found by the *level*
+    property which is the number of the equal signs around the section
+    heading.

-        == Usage of A ==
-        Some usage...
+    The footer part is also a string containing text part after the last
+    section.

-        [[Category:Things starting with A]]
+    **Examples:**

-    ...is parsed into the following namedtuple::
+    >>> text = \"\"\"
+    ... '''this''' is a Python module.
+    ...
+    ... == History of this ==
+    ... This set of principles was posted in 1999...
+    ...
+    ... == Usage of this ==
+    ... Enter "import this" for usage...
+    ...
+    ... === Details ===
+    ... The Zen of Python...
+    ...
+    ... [[Category:Programming principles]]
+    ... \"\"\"
+    >>> site = pywikibot.Site('wikipedia:en')
+    >>> result = extract_sections(text, site)
+    >>> result.header.strip()
+    "'''this''' is a Python module."
+    >>> result.sections[0].title
+    '== History of this =='
+    >>> result.sections[1].content.strip()
+    'Enter "import this" for usage...'
+    >>> result.sections[2].heading
+    'Details'
+    >>> result.sections[2].level
+    3
+    >>> result.footer.strip()
+    '[[Category:Programming principles]]'
+    >>> result.title
+    'this'

-        result = extract_sections(text, site)
-        result.header = "'''A''' is a thing."
-        result.sections = [('== History of A ==', 'Some history...'),
-                           ('== Usage of A ==', 'Some usage...')]
-        result.footer = '[[Category:Things starting with A]]'
-
+    .. note:: sections and text from templates are not extracted but
+       embedded as plain text.

     .. versionadded:: 3.0
-    """
+    .. versionchanged:: 8.2
+       The :class:`Content` and :class:`Section` class have additional
+       properties.
+
+    :return: The parsed namedtuple.
+    """  # noqa: D300, D301
     headings = _extract_headings(text)
     sections = _extract_sections(text, headings)
     # Find header and footer contents
@@ -1013,13 +1101,15 @@
         r'({})*\Z'.format(r'|'.join((langlink_pattern,
                                      cat_regex.pattern, r'\s'))),
         last_section_content).group().lstrip()
+
     if footer:
         if sections:
-            sections[-1] = _Section(
+            sections[-1] = Section(
                 sections[-1].title,
                 last_section_content[:-len(footer)])
         else:
             header = header[:-len(footer)]
-    return _Content(header, sections, footer)
+
+    return Content(header, sections, footer)


 # -----------------------------------------------
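Worth noting about the level computation above: because Section.level takes
the minimum length of the two captured equal-sign runs, an unbalanced heading
(as exercised by test_unbalanced_headers in the test changes below) resolves
to the shorter side. A quick check with plain re, using the same pattern as
HEAD_PATTERN in the hunk above:

    >>> import re
    >>> head_pattern = re.compile('{0}[^=]+{0}'.format('(={1,6})'))
    >>> m = head_pattern.match('====title===')
    >>> m.groups()
    ('====', '===')
    >>> min(map(len, m.groups()))
    3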
diff --git a/scripts/archivebot.py b/scripts/archivebot.py
index ca1aac9..76acb3a 100755
--- a/scripts/archivebot.py
+++ b/scripts/archivebot.py
@@ -395,11 +395,10 @@
         else:
             self.header = header + footer

-        for thread_heading, thread_content in threads:
-            cur_thread = DiscussionThread(thread_heading.strip('= '),
-                                          self.timestripper)
+        for thread in threads:
+            cur_thread = DiscussionThread(thread.heading, self.timestripper)
             # remove heading line
-            _, *lines = thread_content.replace(marker, '').splitlines()
+            _, *lines = thread.content.replace(marker, '').splitlines()
             for line in lines:
                 cur_thread.feed_line(line)
             self.threads.append(cur_thread)
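The archivebot simplification relies on Section.heading already returning the
stripped title, so the old strip('= ') call is no longer needed. A minimal
sketch with a made-up heading:

    >>> from pywikibot.textlib import Section
    >>> thread = Section('== Weekly report ==', '\nsome discussion ...\n')
    >>> thread.heading
    'Weekly report'
    >>> '== Weekly report =='.strip('= ')   # what the old code did
    'Weekly report'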
"""Test the extract_sections function."""
- def _extract_sections_tests(self, result, header, sections, footer): + def _extract_sections_tests(self, result, header, sections, footer='', + title=''): """Test extract_sections function.""" self.assertIsInstance(result, tuple) self.assertIsInstance(result.sections, list) - self.assertEqual(result, (header, sections, footer)) self.assertEqual(result.header, header) self.assertEqual(result.sections, sections) self.assertEqual(result.footer, footer) - if result.sections: - for section in sections: - self.assertIsInstance(section, tuple) - self.assertLength(section, 2) + self.assertEqual(result.title, title) + self.assertEqual(result, (header, sections, footer)) + for section in result.sections: + self.assertIsInstance(section, tuple) + self.assertLength(section, 2) + self.assertIsInstance(section.level, int) + self.assertEqual(section.title.count('=') // 2, section.level)
def test_no_sections_no_footer(self): """Test for text having no sections or footer.""" @@ -1583,7 +1586,7 @@ 'content') result = extract_sections(text, self.site) self._extract_sections_tests( - result, 'text\n\n', [('==title==', '\ncontent')], '') + result, 'text\n\n', [('==title==', '\ncontent')])
def test_with_section_with_footer(self): """Test for text having sections and footer.""" @@ -1607,8 +1610,8 @@ self._extract_sections_tests( result, 'text\n\n', - [('=first level=', '\nfoo\n'), ('==title==', '\nbar')], - '') + [('=first level=', '\nfoo\n'), ('==title==', '\nbar')] + )
def test_with_h4_and_h2_sections(self): """Test for text having h4 and h2 sections.""" @@ -1621,7 +1624,7 @@ result, 'text\n\n', [('====title====', '\n'), ('==title 2==', '\ncontent')], - '') + )
def test_long_comment(self): r"""Test for text having a long expanse of white space. @@ -1637,6 +1640,30 @@ result = extract_sections(text, self.site) self._extract_sections_tests(result, text, [], '')
+ def test_unbalanced_headers(self): + """Test unbalances section headers.""" + text = ('text\n\n' + '====title===\n' + '==title 2===\n' + 'content') + result = extract_sections(text, self.site) + self._extract_sections_tests( + result, + 'text\n\n', + [('====title===', '\n'), ('==title 2===', '\ncontent')], + ) + + def test_title(self): + """Test title.""" + text = "Test ''' Pywikibot ''' title." + result = extract_sections(text, self.site) + self._extract_sections_tests( + result, + "Test ''' Pywikibot ''' title.", + [], + title='Pywikibot' + ) +
if __name__ == '__main__': # pragma: no cover with suppress(SystemExit):