Xqt submitted this change.

View Change

Approvals: Xqt: Verified; Looks good to me, approved
[IMPR] Add TextExtracts support

- add extract() to APISite which uses MediaWiki TextExtracts extension
- add extract() to Page which either calls the site method or provides
a 'wiki' variant which gives an abstract of the wikitext
- support a 'lines' parameter to return only the given number of lines of
text, wrapped to 79 chars width
- use extract('wiki') in speedy_delete.py

Bug: T72682
Change-Id: I3cfaafad050dec62e135cd36b14a929b14029294
---
M pywikibot/page/_basepage.py
M pywikibot/site/_apisite.py
M pywikibot/site/_extensions.py
M scripts/speedy_delete.py
4 files changed, 124 insertions(+), 13 deletions(-)

diff --git a/pywikibot/page/_basepage.py b/pywikibot/page/_basepage.py
index 05bc27b..7521f71 100644
--- a/pywikibot/page/_basepage.py
+++ b/pywikibot/page/_basepage.py
@@ -25,6 +25,7 @@
from html.entities import name2codepoint
from http import HTTPStatus
from itertools import chain
+from textwrap import shorten, wrap
from typing import Any, Optional, Union
from urllib.parse import quote_from_bytes
from warnings import warn
@@ -625,6 +626,71 @@
self._parsed_text = self.site.get_parsed_page(self)
return self._parsed_text

def extract(self, variant: str = 'plain', *,
            lines: Optional[int] = None,
            chars: Optional[int] = None,
            sentences: Optional[int] = None,
            intro: bool = True) -> str:
    """Retrieve an extract of this page.

    .. versionadded:: 7.1

    :param variant: The variant of extract, either 'plain' for plain
        text, 'html' for limited HTML (both exclude templates and
        any text formatting) or 'wiki' for bare wikitext which also
        includes any templates for example.
    :param lines: if not None, wrap the extract into lines with
        width of 79 chars and return a string with that given number
        of lines.
    :param chars: How many characters to return. Actual text
        returned might be slightly longer.
    :param sentences: How many sentences to return
    :param intro: Return only content before the first section
    :raises NoPageError: given page does not exist
    :raises NotImplementedError: 'wiki' variant does not support
        `sentences` parameter.
    :raises ValueError: `variant` parameter must be "plain", "html" or
        "wiki"

    .. seealso:: :meth:`APISite.extract()
       <pywikibot.site._extensions.TextExtractsMixin.extract>`.
    """
    if variant in ('plain', 'html'):
        # 'plain' and 'html' are served by the TextExtracts extension
        # on the site; it already handles chars/sentences/intro.
        extract = self.site.extract(self, chars=chars, sentences=sentences,
                                    intro=intro,
                                    plaintext=variant == 'plain')
    elif variant == 'wiki':
        if not self.exists():
            raise NoPageError(self)
        if sentences:
            raise NotImplementedError(
                "'wiki' variant of extract method does not support "
                "'sentences' parameter")

        extract = self.text
        if intro:
            # str.find returns -1 when no section heading exists; the
            # previous `if pos:` test stripped the last character in
            # that case and skipped truncation for a heading at pos 0.
            pos = extract.find('\n=')
            if pos != -1:
                extract = extract[:pos]
        if chars:
            extract = shorten(extract, chars, break_long_words=False,
                              placeholder='…')
    else:
        raise ValueError(
            'variant parameter must be "plain", "html" or "wiki", not "{}"'
            .format(variant))

    if not lines:
        return extract

    # Wrap each source line to 79 chars, keeping blank lines, and stop
    # once the requested number of source lines has been consumed.
    text_lines = []
    for i, text in enumerate(extract.splitlines(), start=1):
        text_lines += wrap(text, width=79) or ['']
        if i >= lines:
            break

    # slicing clamps automatically; no need for min(lines, len(...))
    return '\n'.join(text_lines[:lines])
+
def properties(self, force: bool = False) -> dict:
"""
Return the properties of the page.
diff --git a/pywikibot/site/_apisite.py b/pywikibot/site/_apisite.py
index 118fed7..40e5db8 100644
--- a/pywikibot/site/_apisite.py
+++ b/pywikibot/site/_apisite.py
@@ -56,6 +56,7 @@
ProofreadPageMixin,
ThanksFlowMixin,
ThanksMixin,
+ TextExtractsMixin,
UrlShortenerMixin,
WikibaseClientMixin,
)
@@ -88,6 +89,7 @@
LinterMixin,
PageImagesMixin,
ProofreadPageMixin,
+ TextExtractsMixin,
ThanksFlowMixin,
ThanksMixin,
UrlShortenerMixin,
diff --git a/pywikibot/site/_extensions.py b/pywikibot/site/_extensions.py
index 4aca5d5..d513d3a 100644
--- a/pywikibot/site/_extensions.py
+++ b/pywikibot/site/_extensions.py
@@ -4,12 +4,16 @@
#
# Distributed under the terms of the MIT license.
#
+from typing import Optional
+
import pywikibot
from pywikibot.data import api
from pywikibot.echo import Notification
from pywikibot.exceptions import (
APIError,
+ Error,
InconsistentTitleError,
+ NoPageError,
SiteDefinitionError,
)
from pywikibot.site._decorators import need_extension, need_right
@@ -713,3 +717,50 @@
req = self.simple_request(action='shortenurl', url=url)
data = req.submit()
return data['shortenurl']['shorturl']
+
+
class TextExtractsMixin:

    """APISite mixin for TextExtracts extension.

    .. versionadded:: 7.1
    """

    @need_extension('TextExtracts')
    def extract(self, page: 'pywikibot.Page', *,
                chars: Optional[int] = None,
                sentences: Optional[int] = None,
                intro: bool = True,
                plaintext: bool = True) -> str:
        """Retrieve an extract of a page.

        :param page: The Page object for which the extract is read
        :param chars: How many characters to return. Actual text
            returned might be slightly longer.
        :param sentences: How many sentences to return
        :param intro: Return only content before the first section
        :param plaintext: if True, return extracts as plain text instead
            of limited HTML
        :raises NoPageError: given page does not exist
        :raises Error: the API reported an invalid title or an unknown
            failure for the requested page

        .. seealso::

           - https://www.mediawiki.org/wiki/Extension:TextExtracts

           - :meth:`pywikibot.page.BasePage.extract`.
        """
        if not page.exists():
            raise NoPageError(page)
        # use the public simple_request() for consistency with the
        # other mixins in this module (e.g. UrlShortenerMixin)
        req = self.simple_request(action='query',
                                  prop='extracts',
                                  titles=page.title(with_section=False),
                                  exchars=chars,
                                  exsentences=sentences,
                                  exintro=intro,
                                  explaintext=plaintext)
        data = req.submit()['query']['pages']
        # the API keys missing/invalid pages as '-1'
        if '-1' in data:
            msg = data['-1'].get('invalidreason',
                                 'Unknown exception:\n{}'.format(data['-1']))
            raise Error(msg)

        return data[str(page.pageid)]['extract']
diff --git a/scripts/speedy_delete.py b/scripts/speedy_delete.py
index d512230..ed8f386 100755
--- a/scripts/speedy_delete.py
+++ b/scripts/speedy_delete.py
@@ -26,7 +26,7 @@
# Distributed under the terms of the MIT license.
#
import time
-from textwrap import fill, wrap
+from textwrap import fill

import pywikibot
from pywikibot import i18n, pagegenerators
@@ -430,18 +430,10 @@
"""Process one page."""
page = self.current_page

- page_text = []
- for text in page.text.split('\n'):
- page_text += wrap(text, width=79) or ['']
-
- pywikibot.output(color_format('{blue}{}{default}', '_' * 80))
- if len(page_text) > self.LINES:
- pywikibot.output(color_format(
- '{blue}The page detail is too many lines, '
- 'only output first {} lines:{default}', self.LINES))
- pywikibot.output(
- '\n'.join(page_text[:min(self.LINES, len(page_text))]))
- pywikibot.output(color_format('{blue}{}{default}', '_' * 80))
+ color_line = color_format('{blue}{}{default}', '_' * 80)
+ pywikibot.output(color_line)
+ pywikibot.output(page.extract('wiki', lines=self.LINES))
+ pywikibot.output(color_line)

choice = pywikibot.input_choice(
'Input action?',

To view, visit change 770098. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I3cfaafad050dec62e135cd36b14a929b14029294
Gerrit-Change-Number: 770098
Gerrit-PatchSet: 8
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Multichill <maarten@mdammers.nl>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Ricordisamoa <ricordisamoa@disroot.org>
Gerrit-MessageType: merged