jenkins-bot submitted this change.

View Change


Approvals: Matěj Suchánek: Looks good to me, approved jenkins-bot: Verified
[IMPR] improvement for textlib.extract_sections() function

- make _Content and _Section public classes
- the Content namedtuple, which is the result of the
textlib.extract_sections() function, has an additional property 'title'
which is the first main title found in the header (anything enclosed
within triple quotes)
- the Section namedtuple of the sections list comes with a level property
which indicates the level of the section header (usually 2-6)
- the Section namedtuple also has a heading property which gives the
stripped section title.
- simplify _extract_sections helper function
- simplify cosmetic_changes module and archivebot script
- update tests

Bug: T338748
Change-Id: I0cd3ac16fc7f3dce1b87b9b67f02766caec99aa6
---
M pywikibot/textlib.py
M pywikibot/cosmetic_changes.py
M tests/textlib_tests.py
M scripts/archivebot.py
4 files changed, 199 insertions(+), 62 deletions(-)

diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index bf0063c..ffd43a5 100644
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -699,7 +699,7 @@
for reg in skip_regexes:
stripped_text = reg.sub(r'', stripped_text)
strip_sections = textlib.extract_sections(
- stripped_text, self.site)[1]
+ stripped_text, self.site).sections

# get proper sections
header, sections, footer = textlib.extract_sections(text, self.site)
@@ -707,15 +707,13 @@
# iterate stripped sections and create a new page body
new_body = []
for i, strip_section in enumerate(strip_sections):
- current_heading = sections[i][0]
+ current_dep = sections[i].level
try:
- next_heading = sections[i + 1][0]
+ next_dep = sections[i + 1].level
except IndexError:
- next_heading = ''
- current_dep = (len(current_heading)
- - len(current_heading.lstrip('=')))
- next_dep = len(next_heading) - len(next_heading.lstrip('='))
- if strip_section[1].strip() or current_dep < next_dep:
+ next_dep = 0
+
+ if strip_section.content.strip() or current_dep < next_dep:
new_body.extend(sections[i])
return header + ''.join(new_body) + footer

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index a81abb1..9078b3c 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -933,12 +933,69 @@
# -------------------------------
# Functions dealing with sections
# -------------------------------
+
+#: Head pattern
+HEAD_PATTERN = re.compile('{0}[^=]+{0}'.format('(={1,6})'))
+TITLE_PATTERN = re.compile("'{3}([^']+)'{3}")
+
_Heading = namedtuple('_Heading', ('text', 'start', 'end'))
-_Section = namedtuple('_Section', ('title', 'content'))
-_Content = namedtuple('_Content', ('header', 'sections', 'footer'))


-def _extract_headings(text: str) -> list:
+class Section(NamedTuple):
+
+ """A namedtuple as part of :class:`Content` describing a page section.
+
+ .. versionchanged:: 8.2
+ ``_Section`` becomes a public class.
+ """
+
+ title: str #: section title including equal signs
+ content: str #: section content
+
+ @property
+ def level(self) -> int:
+ """Return the section level.
+
+ .. versionadded:: 8.2
+ """
+ m = HEAD_PATTERN.match(self.title)
+ return min(map(len, m.groups()))
+
+ @property
+ def heading(self) -> str:
+ """Return the section title without equal signs.
+
+ .. versionadded:: 8.2
+ """
+ level = self.level
+ return self.title[level:-level].strip()
+
+
+class Content(NamedTuple):
+
+ """A namedtuple as result of :func:`extract_sections` holding page content.
+
+ .. versionchanged:: 8.2
+ ``_Content`` becomes a public class.
+ """
+
+ header: str #: the page header
+ sections: List[Section] #: the page sections
+ footer: str #: the page footer
+
+ @property
+ def title(self) -> str:
+ """Return the first main title found on the page.
+
+ The first main title is anything enclosed within triple quotes.
+
+ .. versionadded:: 8.2
+ """
+ m = TITLE_PATTERN.search(self.header)
+ return m[1].strip() if m else ''
+
+
+def _extract_headings(text: str) -> List[_Heading]:
"""Return _Heading objects."""
headings = []
heading_regex = get_regexes('header')[0]
@@ -949,59 +1006,90 @@
return headings


-def _extract_sections(text: str, headings) -> list:
- """Return _Section objects."""
+def _extract_sections(text: str, headings) -> List[Section]:
+ """Return a list of :class:`Section` objects."""
+ sections = []
if headings:
# Assign them their contents
- contents = []
for i, heading in enumerate(headings):
try:
next_heading = headings[i + 1]
except IndexError:
- contents.append(text[heading.end:])
+ content = text[heading.end:]
else:
- contents.append(text[heading.end:next_heading.start])
- return [_Section(heading.text, content)
- for heading, content in zip(headings, contents)]
- return []
+ content = text[heading.end:next_heading.start]
+ sections.append(Section(heading.text, content))
+
+ return sections


def extract_sections(
- text: str, site=None
-) -> NamedTuple('_Content', [('header', str), # noqa: F821
- ('sections', List[Tuple[str, str]]), # noqa: F821
- ('footer', str)]): # noqa: F821
- """
- Return section headings and contents found in text.
+ text: str,
+ site: Optional['pywikibot.site.BaseSite'] = None
+) -> Content:
+ """Return section headings and contents found in text.

- :return: The returned namedtuple contains the text parsed into
- header, contents and footer parts: The header part is a string
- containing text part above the first heading. The footer part
- is also a string containing text part after the last section.
- The section part is a list of tuples, each tuple containing a
- string with section heading and a string with section content.
- Example article::
+ The returned namedtuple :class:`Content` contains the text parsed
+ into *header*, *sections* and *footer* parts. The main title found
+ in the header which is the first text enclosed with ''' like
+ '''page title''' can be given by the *title* property.

- '''A''' is a thing.
+ The header part is a string containing text part above the first
+ heading.

- == History of A ==
- Some history...
+ The sections part is a list of :class:`Section` namedtuples, each
+ tuple containing a string with section title (including equal signs),
+ and a string with the section content. In addition the section
+ heading (the title without equal signs) can be given by the *heading*
+ property. Also the section level can be found by the *level*
+ property which is the number of the equal signs around the section
+ heading.

- == Usage of A ==
- Some usage...
+ The footer part is also a string containing text part after the last
+ section.

- [[Category:Things starting with A]]
+ **Examples:**

- ...is parsed into the following namedtuple::
+ >>> text = \"\"\"
+ ... '''this''' is a Python module.
+ ...
+ ... == History of this ==
+ ... This set of principles was posted in 1999...
+ ...
+ ... == Usage of this ==
+ ... Enter "import this" for usage...
+ ...
+ ... === Details ===
+ ... The Zen of Python...
+ ...
+ ... [[Category:Programming principles]]
+ ... \"\"\"
+ >>> site = pywikibot.Site('wikipedia:en')
+ >>> result = extract_sections(text, site)
+ >>> result.header.strip()
+ "'''this''' is a Python module."
+ >>> result.sections[0].title
+ '== History of this =='
+ >>> result.sections[1].content.strip()
+ 'Enter "import this" for usage...'
+ >>> result.sections[2].heading
+ 'Details'
+ >>> result.sections[2].level
+ 3
+ >>> result.footer.strip()
+ '[[Category:Programming principles]]'
+ >>> result.title
+ 'this'

- result = extract_sections(text, site)
- result.header = "'''A''' is a thing."
- result.sections = [('== History of A ==', 'Some history...'),
- ('== Usage of A ==', 'Some usage...')]
- result.footer = '[[Category:Things starting with A]]'
-
+ .. note:: sections and text from templates are not extracted but
+ embedded as plain text.
.. versionadded:: 3.0
- """
+ .. versionchanged:: 8.2
+ The :class:`Content` and :class:`Section` class have additional
+ properties.
+
+ :return: The parsed namedtuple.
+ """ # noqa: D300, D301
headings = _extract_headings(text)
sections = _extract_sections(text, headings)
# Find header and footer contents
@@ -1013,13 +1101,15 @@
r'({})*\Z'.format(r'|'.join((langlink_pattern,
cat_regex.pattern, r'\s'))),
last_section_content).group().lstrip()
+
if footer:
if sections:
- sections[-1] = _Section(
+ sections[-1] = Section(
sections[-1].title, last_section_content[:-len(footer)])
else:
header = header[:-len(footer)]
- return _Content(header, sections, footer)
+
+ return Content(header, sections, footer)


# -----------------------------------------------
diff --git a/scripts/archivebot.py b/scripts/archivebot.py
index ca1aac9..76acb3a 100755
--- a/scripts/archivebot.py
+++ b/scripts/archivebot.py
@@ -395,11 +395,10 @@
else:
self.header = header + footer

- for thread_heading, thread_content in threads:
- cur_thread = DiscussionThread(thread_heading.strip('= '),
- self.timestripper)
+ for thread in threads:
+ cur_thread = DiscussionThread(thread.heading, self.timestripper)
# remove heading line
- _, *lines = thread_content.replace(marker, '').splitlines()
+ _, *lines = thread.content.replace(marker, '').splitlines()
for line in lines:
cur_thread.feed_line(line)
self.threads.append(cur_thread)
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index a003295..6eeff21 100755
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -1551,18 +1551,21 @@

"""Test the extract_sections function."""

- def _extract_sections_tests(self, result, header, sections, footer):
+ def _extract_sections_tests(self, result, header, sections, footer='',
+ title=''):
"""Test extract_sections function."""
self.assertIsInstance(result, tuple)
self.assertIsInstance(result.sections, list)
- self.assertEqual(result, (header, sections, footer))
self.assertEqual(result.header, header)
self.assertEqual(result.sections, sections)
self.assertEqual(result.footer, footer)
- if result.sections:
- for section in sections:
- self.assertIsInstance(section, tuple)
- self.assertLength(section, 2)
+ self.assertEqual(result.title, title)
+ self.assertEqual(result, (header, sections, footer))
+ for section in result.sections:
+ self.assertIsInstance(section, tuple)
+ self.assertLength(section, 2)
+ self.assertIsInstance(section.level, int)
+ self.assertEqual(section.title.count('=') // 2, section.level)

def test_no_sections_no_footer(self):
"""Test for text having no sections or footer."""
@@ -1583,7 +1586,7 @@
'content')
result = extract_sections(text, self.site)
self._extract_sections_tests(
- result, 'text\n\n', [('==title==', '\ncontent')], '')
+ result, 'text\n\n', [('==title==', '\ncontent')])

def test_with_section_with_footer(self):
"""Test for text having sections and footer."""
@@ -1607,8 +1610,8 @@
self._extract_sections_tests(
result,
'text\n\n',
- [('=first level=', '\nfoo\n'), ('==title==', '\nbar')],
- '')
+ [('=first level=', '\nfoo\n'), ('==title==', '\nbar')]
+ )

def test_with_h4_and_h2_sections(self):
"""Test for text having h4 and h2 sections."""
@@ -1621,7 +1624,7 @@
result,
'text\n\n',
[('====title====', '\n'), ('==title 2==', '\ncontent')],
- '')
+ )

def test_long_comment(self):
r"""Test for text having a long expanse of white space.
@@ -1637,6 +1640,30 @@
result = extract_sections(text, self.site)
self._extract_sections_tests(result, text, [], '')

+ def test_unbalanced_headers(self):
+ """Test unbalances section headers."""
+ text = ('text\n\n'
+ '====title===\n'
+ '==title 2===\n'
+ 'content')
+ result = extract_sections(text, self.site)
+ self._extract_sections_tests(
+ result,
+ 'text\n\n',
+ [('====title===', '\n'), ('==title 2===', '\ncontent')],
+ )
+
+ def test_title(self):
+ """Test title."""
+ text = "Test ''' Pywikibot ''' title."
+ result = extract_sections(text, self.site)
+ self._extract_sections_tests(
+ result,
+ "Test ''' Pywikibot ''' title.",
+ [],
+ title='Pywikibot'
+ )
+

if __name__ == '__main__': # pragma: no cover
with suppress(SystemExit):

To view, visit change 930899. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I0cd3ac16fc7f3dce1b87b9b67f02766caec99aa6
Gerrit-Change-Number: 930899
Gerrit-PatchSet: 21
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged