Xqt has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/770098 )
Change subject: [IMPR] Add TextExtracts support ......................................................................
[IMPR] Add TextExtracts support
- add extract() to APISite which uses MediaWiki TextExtracts extension - add extract() to Page which either calls the site method or provides a 'wiki' variant which gives an abstract of the wikitext - support 'lines' parameter to only return the given number of lines of text wrapped to 79 chars length - use extract('wiki') in speedy_delete.py
Bug: T72682 Change-Id: I3cfaafad050dec62e135cd36b14a929b14029294 --- M pywikibot/page/_basepage.py M pywikibot/site/_apisite.py M pywikibot/site/_extensions.py M scripts/speedy_delete.py 4 files changed, 124 insertions(+), 13 deletions(-)
Approvals: Xqt: Verified; Looks good to me, approved
diff --git a/pywikibot/page/_basepage.py b/pywikibot/page/_basepage.py index 05bc27b..7521f71 100644 --- a/pywikibot/page/_basepage.py +++ b/pywikibot/page/_basepage.py @@ -25,6 +25,7 @@ from html.entities import name2codepoint from http import HTTPStatus from itertools import chain +from textwrap import shorten, wrap from typing import Any, Optional, Union from urllib.parse import quote_from_bytes from warnings import warn @@ -625,6 +626,71 @@ self._parsed_text = self.site.get_parsed_page(self) return self._parsed_text
def extract(self, variant: str = 'plain', *,
            lines: Optional[int] = None,
            chars: Optional[int] = None,
            sentences: Optional[int] = None,
            intro: bool = True) -> str:
    """Retrieve an extract of this page.

    .. versionadded:: 7.1

    :param variant: The variant of extract, either 'plain' for plain
        text, 'html' for limited HTML (both exclude templates and
        any text formatting) or 'wiki' for bare wikitext which also
        includes any templates for example.
    :param lines: if not None, wrap the extract into lines with
        width of 79 chars and return a string with that given number
        of lines.
    :param chars: How many characters to return. Actual text
        returned might be slightly longer.
    :param sentences: How many sentences to return
    :param intro: Return only content before the first section
    :raises NoPageError: given page does not exist
    :raises NotImplementedError: "wiki" variant does not support
        `sentences` parameter.
    :raises ValueError: `variant` parameter must be "plain", "html" or
        "wiki"

    .. seealso:: :meth:`APISite.extract()
       <pywikibot.site._extensions.TextExtractsMixin.extract>`.
    """
    if variant in ('plain', 'html'):
        # server-side extract via the TextExtracts extension
        extract = self.site.extract(self, chars=chars,
                                    sentences=sentences,
                                    intro=intro,
                                    plaintext=variant == 'plain')
    elif variant == 'wiki':
        if not self.exists():
            raise NoPageError(self)
        if sentences:
            raise NotImplementedError(
                "'wiki' variant of extract method does not support "
                "'sentences' parameter")

        extract = self.text[:]
        if intro:
            # Position of the first section heading; str.find returns
            # -1 when there is no heading, in which case the whole
            # text is the intro and must be kept unchanged.
            pos = extract.find('\n=')
            if pos != -1:
                extract = extract[:pos]
        if chars:
            extract = shorten(extract, chars, break_long_words=False,
                              placeholder='…')
    else:
        raise ValueError(
            'variant parameter must be "plain", "html" or "wiki", not "{}"'
            .format(variant))

    if not lines:
        return extract

    # Wrap each original line to 79 chars (keeping empty lines) and
    # stop as soon as enough source lines have been processed.
    text_lines = []
    for i, text in enumerate(extract.splitlines(), start=1):
        text_lines += wrap(text, width=79) or ['']
        if i >= lines:
            break

    # slicing clamps automatically; no min() needed
    return '\n'.join(text_lines[:lines])
class TextExtractsMixin:

    """APISite mixin for TextExtracts extension.

    .. versionadded:: 7.1
    """

    @need_extension('TextExtracts')
    def extract(self, page: 'pywikibot.Page', *,
                chars: Optional[int] = None,
                sentences: Optional[int] = None,
                intro: bool = True,
                plaintext: bool = True) -> str:
        """Retrieve an extract of a page.

        :param page: The Page object for which the extract is read
        :param chars: How many characters to return. Actual text
            returned might be slightly longer.
        :param sentences: How many sentences to return
        :param intro: Return only content before the first section
        :param plaintext: if True, return extracts as plain text instead
            of limited HTML
        :raises NoPageError: given page does not exist
        :raises Error: the extracts query answered with an invalid page

        .. seealso::

           - https://www.mediawiki.org/wiki/Extension:TextExtracts

           - :meth:`pywikibot.page.BasePage.extract`.
        """
        if not page.exists():
            raise NoPageError(page)
        # Use the public request factory for consistency with the other
        # mixins in this module (cf. UrlShortenerMixin above).
        req = self.simple_request(action='query',
                                  prop='extracts',
                                  titles=page.title(with_section=False),
                                  exchars=chars,
                                  exsentences=sentences,
                                  exintro=intro,
                                  explaintext=plaintext)
        data = req.submit()['query']['pages']
        # The API flags a missing/invalid title with the pseudo id '-1'.
        if '-1' in data:
            msg = data['-1'].get('invalidreason',
                                 'Unknown exception:\n{}'.format(data['-1']))
            raise Error(msg)

        return data[str(page.pageid)]['extract']
# import time -from textwrap import fill, wrap +from textwrap import fill
import pywikibot from pywikibot import i18n, pagegenerators @@ -430,18 +430,10 @@ """Process one page.""" page = self.current_page
- page_text = [] - for text in page.text.split('\n'): - page_text += wrap(text, width=79) or [''] - - pywikibot.output(color_format('{blue}{}{default}', '_' * 80)) - if len(page_text) > self.LINES: - pywikibot.output(color_format( - '{blue}The page detail is too many lines, ' - 'only output first {} lines:{default}', self.LINES)) - pywikibot.output( - '\n'.join(page_text[:min(self.LINES, len(page_text))])) - pywikibot.output(color_format('{blue}{}{default}', '_' * 80)) + color_line = color_format('{blue}{}{default}', '_' * 80) + pywikibot.output(color_line) + pywikibot.output(page.extract('wiki', lines=self.LINES)) + pywikibot.output(color_line)
choice = pywikibot.input_choice( 'Input action?',