Xqt has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/770098 )
Change subject: [IMPR] Add TextExtracts support ......................................................................
[IMPR] Add TextExtracts support
- add extract() to APISite which uses MediaWiki TextExtracts extension - add extract() to Page which either calls the site method or provides a 'wiki' variant which gives an abstract of the wikitext - support 'lines' parameter to only return the given number of lines of text wrapped to 79 chars length - use extract('wiki') in speedy_delete.py
Bug: T72682 Change-Id: I3cfaafad050dec62e135cd36b14a929b14029294 --- M pywikibot/page/_basepage.py M pywikibot/site/_apisite.py M pywikibot/site/_extensions.py M scripts/speedy_delete.py 4 files changed, 124 insertions(+), 13 deletions(-)
Approvals: Xqt: Verified; Looks good to me, approved
diff --git a/pywikibot/page/_basepage.py b/pywikibot/page/_basepage.py index 05bc27b..7521f71 100644 --- a/pywikibot/page/_basepage.py +++ b/pywikibot/page/_basepage.py @@ -25,6 +25,7 @@ from html.entities import name2codepoint from http import HTTPStatus from itertools import chain +from textwrap import shorten, wrap from typing import Any, Optional, Union from urllib.parse import quote_from_bytes from warnings import warn @@ -625,6 +626,71 @@ self._parsed_text = self.site.get_parsed_page(self) return self._parsed_text
def extract(self, variant: str = 'plain', *,
            lines: Optional[int] = None,
            chars: Optional[int] = None,
            sentences: Optional[int] = None,
            intro: bool = True) -> str:
    """Retrieve an extract of this page.

    .. versionadded:: 7.1

    :param variant: The variant of extract, either 'plain' for plain
        text, 'html' for limited HTML (both exclude templates and
        any text formatting) or 'wiki' for bare wikitext which also
        includes any templates for example.
    :param lines: if not None, wrap the extract into lines with
        width of 79 chars and return a string with that given number
        of lines.
    :param chars: How many characters to return. Actual text
        returned might be slightly longer.
    :param sentences: How many sentences to return
    :param intro: Return only content before the first section
    :raises NoPageError: given page does not exist
    :raises NotImplementedError: "wiki" variant does not support
        `sentences` parameter.
    :raises ValueError: `variant` parameter must be "plain", "html" or
        "wiki"

    .. seealso:: :meth:`APISite.extract()
       <pywikibot.site._extensions.TextExtractsMixin.extract>`.
    """
    if variant in ('plain', 'html'):
        # server-side extract via the TextExtracts extension
        extract = self.site.extract(self, chars=chars,
                                    sentences=sentences,
                                    intro=intro,
                                    plaintext=variant == 'plain')
    elif variant == 'wiki':
        if not self.exists():
            raise NoPageError(self)
        if sentences:
            raise NotImplementedError(
                "'wiki' variant of extract method does not support "
                "'sentences' parameter")

        extract = self.text[:]
        if intro:
            # Position of the first section heading; str.find returns
            # -1 when there is no heading, in which case the whole
            # text is the intro and must be kept unchanged.
            pos = extract.find('\n=')
            if pos != -1:
                extract = extract[:pos]
        if chars:
            extract = shorten(extract, chars, break_long_words=False,
                              placeholder='…')
    else:
        raise ValueError(
            'variant parameter must be "plain", "html" or "wiki", not "{}"'
            .format(variant))

    if not lines:
        return extract

    # Wrap each original line to 79 chars (keeping empty lines) and
    # stop as soon as enough source lines have been processed.
    text_lines = []
    for i, text in enumerate(extract.splitlines(), start=1):
        text_lines += wrap(text, width=79) or ['']
        if i >= lines:
            break

    # slicing clamps automatically; no min() needed
    return '\n'.join(text_lines[:lines])
class TextExtractsMixin:

    """APISite mixin for TextExtracts extension.

    .. versionadded:: 7.1
    """

    @need_extension('TextExtracts')
    def extract(self, page: 'pywikibot.Page', *,
                chars: Optional[int] = None,
                sentences: Optional[int] = None,
                intro: bool = True,
                plaintext: bool = True) -> str:
        """Retrieve an extract of a page.

        :param page: The Page object for which the extract is read
        :param chars: How many characters to return. Actual text
            returned might be slightly longer.
        :param sentences: How many sentences to return
        :param intro: Return only content before the first section
        :param plaintext: if True, return extracts as plain text instead
            of limited HTML
        :raises NoPageError: given page does not exist
        :raises Error: the extracts query answered with an invalid page

        .. seealso::

           - https://www.mediawiki.org/wiki/Extension:TextExtracts

           - :meth:`pywikibot.page.BasePage.extract`.
        """
        if not page.exists():
            raise NoPageError(page)
        # Use the public request factory for consistency with the other
        # mixins in this module (cf. UrlShortenerMixin above).
        req = self.simple_request(action='query',
                                  prop='extracts',
                                  titles=page.title(with_section=False),
                                  exchars=chars,
                                  exsentences=sentences,
                                  exintro=intro,
                                  explaintext=plaintext)
        data = req.submit()['query']['pages']
        # The API flags a missing/invalid title with the pseudo id '-1'.
        if '-1' in data:
            msg = data['-1'].get('invalidreason',
                                 'Unknown exception:\n{}'.format(data['-1']))
            raise Error(msg)

        return data[str(page.pageid)]['extract']
# import time -from textwrap import fill, wrap +from textwrap import fill
import pywikibot from pywikibot import i18n, pagegenerators @@ -430,18 +430,10 @@ """Process one page.""" page = self.current_page
- page_text = [] - for text in page.text.split('\n'): - page_text += wrap(text, width=79) or [''] - - pywikibot.output(color_format('{blue}{}{default}', '_' * 80)) - if len(page_text) > self.LINES: - pywikibot.output(color_format( - '{blue}The page detail is too many lines, ' - 'only output first {} lines:{default}', self.LINES)) - pywikibot.output( - '\n'.join(page_text[:min(self.LINES, len(page_text))])) - pywikibot.output(color_format('{blue}{}{default}', '_' * 80)) + color_line = color_format('{blue}{}{default}', '_' * 80) + pywikibot.output(color_line) + pywikibot.output(page.extract('wiki', lines=self.LINES)) + pywikibot.output(color_line)
choice = pywikibot.input_choice( 'Input action?',