jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1037928?usp=email )
Change subject: [fix] re-implement Page.main_authors() using xtools
......................................................................
[fix] re-implement Page.main_authors() using xtools
Additional parameters are used to filter the result.
Bug: T366100
Change-Id: I9bed91ae63de6d2583b9e3ccb0970900e167b340
---
M pywikibot/page/_toolforge.py
M requirements.txt
M tests/wikiblame_tests.py
M tox.ini
4 files changed, 193 insertions(+), 55 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
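A minimal usage sketch of the new filtering parameters follows; the site, page title and threshold values are illustrative only (modelled on the docstrings in this patch), results depend on the live wiki, and the wikitextparser package must be installed:

    import pywikibot

    site = pywikibot.Site('wikipedia:en')  # any code listed in WIKIBLAME_CODES
    page = pywikibot.Page(site, 'Pywikibot')

    # Top 3 authors owning at least 50 characters and at least 5 percent
    # of the attributed text.
    auth = page.authorship(3, min_chars=50, min_pct=5.0)
    for user, (chars, pct) in auth.items():
        print(f'{user}: {chars} chars ({pct}%)')

    # The deprecated main_authors() now delegates to authorship(5) and
    # returns a Counter of integer percentages.
    print(page.main_authors().most_common(1))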
diff --git a/pywikibot/page/_toolforge.py b/pywikibot/page/_toolforge.py
index ebdf5b2..b701c1f 100644
--- a/pywikibot/page/_toolforge.py
+++ b/pywikibot/page/_toolforge.py
@@ -11,9 +11,21 @@

 import collections
 import re
+from typing import TYPE_CHECKING

 import pywikibot
-from pywikibot import config
+from pywikibot import textlib
+from pywikibot.tools import deprecated, deprecated_args
+
+try:
+    import wikitextparser
+except ImportError as e:
+    wikitextparser = e
+
+if TYPE_CHECKING:
+    import datetime
+    from pywikibot import Timestamp
+    DATETYPE = str | Timestamp | datetime.datetime | datetime.date | None


 class WikiBlameMixin:
@@ -24,7 +36,10 @@
     """

     #: Supported wikipedia site codes
-    WIKIBLAME_CODES = 'als', 'bar', 'de', 'en', 'it', 'nds', 'sco'
+    WIKIBLAME_CODES = (
+        'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
+        'pt', 'tr',
+    )

     def _check_wh_supported(self):
         """Check if WikiHistory is supported."""
@@ -44,68 +59,161 @@
         if not self.exists():
             raise pywikibot.exceptions.NoPageError(self)

-    def main_authors(self, *,
-                     onlynew: bool | None = None) -> collections.Counter:
-        """Retrieve the 5 topmost main authors of an article.
+        if isinstance(wikitextparser, ImportError):
+            raise wikitextparser

-        This method uses WikiHistory to retrieve the text based main
-        authorship.
+    @deprecated('authorship', since='9.3.0')
+    @deprecated_args(onlynew=None)  # since 9.2.0
+    def main_authors(self) -> collections.Counter[str, int]:
+        """Retrieve the 5 topmost main authors of an article.

         Sample:

         >>> import pywikibot
-        >>> site = pywikibot.Site('wikipedia:nds')
-        >>> page = pywikibot.Page(site, 'Python (Programmeerspraak)')
-        >>> auth = page.main_authors(onlynew=False)
-        >>> auth
-        Counter({'RebeccaBreu': 99, 'Slomox': 1})
+        >>> site = pywikibot.Site('wikipedia:eu')
+        >>> page = pywikibot.Page(site, 'Python (informatika)')
+        >>> auth = page.main_authors()
+        >>> auth.most_common(1)
+        [('Ksarasola', 80)]

-        .. note:: Only implemented for main namespace pages.
-        .. note:: Only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
-        .. attention:: This method does not return new results due to
-           :phab:`366100`.
+        .. important:: Only implemented for main namespace pages and
+           only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
         .. seealso::
            - https://wikihistory.toolforge.org
            - https://de.wikipedia.org/wiki/Wikipedia:Technik/Cloud/wikihistory
+           - https://xtools.wmcloud.org/authorship/

         .. versionchanged:: 9.2
            do not use any wait cycles due to :phab:`366100`.
+        .. versionchanged:: 9.3
+           https://xtools.wmcloud.org/authorship/ is used to retrieve
+           authors
+        .. deprecated:: 9.3
+           use :meth:`authorship` instead.

-        :param onlynew: Currently meaningless
-        :return: Number of edits for each username
-        :raise NotImplementedError: unsupported site or unsupported namespace
-        :raise pywikibot.exceptions.NoPageError: The page does not exist
-        :raise pywikibot.exceptions.TimeoutError: No cached results found
+        :return: Percentage of edits for each username
+
+        :raise ImportError: missing ``wikitextparser`` module.
+        :raise NotImplementedError: unsupported site or unsupported
+            namespace.
+        :raise Error: Error response from xtools.
+        :raise NoPageError: The page does not exist.
+        :raise requests.exceptions.HTTPError: 429 Client Error: Too Many
+            Requests for url; login to meta family first.
         """
-        baseurl = 'https://wikihistory.toolforge.org'
-        pattern = (r'><bdi>(?P<author>.+?)</bdi></a>\s'
-                   r'\((?P<percent>\d{1,3})&')
+        return collections.Counter(
+            {user: int(cnt) for user, (_, cnt) in self.authorship(5).items()})
+
+    def authorship(
+        self,
+        n: int | None = None,
+        *,
+        min_chars: int = 0,
+        min_pct: float = 0.0,
+        max_pct_sum: float | None = None,
+        revid: int | None = None,
+        date: DATETYPE = None,
+    ) -> dict[str, tuple[int, float]]:
+        """Retrieve authorship attribution of an article.
+
+        This method uses XTools/Authorship to retrieve the authors
+        measured by character count.
+
+        Sample:
+
+        >>> import pywikibot
+        >>> site = pywikibot.Site('wikipedia:en')
+        >>> page = pywikibot.Page(site, 'Pywikibot')
+        >>> auth = page.authorship()
+        >>> auth
+        {'1234qwer1234qwer4': (68, 100.0)}
+
+        .. important:: Only implemented for main namespace pages and
+           only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
+        .. seealso::
+           - https://xtools.wmcloud.org/authorship/
+           - https://www.mediawiki.org/wiki/XTools/Authorship
+           - https://www.mediawiki.org/wiki/WikiWho
+
+        .. versionadded:: 9.3
+           this method replaces :meth:`main_authors`.
+
+        :param n: Only return the first *n* or fewer authors.
+        :param min_chars: Only return authors with more than *min_chars*
+            characters changed.
+        :param min_pct: Only return authors with more than *min_pct*
+            percent of edits.
+        :param max_pct_sum: Only return authors until the percentage sum
+            reaches *max_pct_sum*.
+        :param revid: The revision id for which the authors should be
+            found. If ``None`` or ``0``, the latest revision is used.
+            Cannot be used together with *date*.
+        :param date: The revision date for which the authors should be
+            found. If ``None``, it will be ignored. Cannot be used
+            together with *revid*. If the parameter is a string it must
+            be given in the form ``YYYY-MM-DD``.
+        :return: Character count and percentage of edits for each
+            username.
+
+        :raise ImportError: missing ``wikitextparser`` module.
+        :raise NotImplementedError: unsupported site or unsupported
+            namespace.
+        :raise Error: Error response from xtools.
+        :raise NoPageError: The page does not exist.
+        :raise requests.exceptions.HTTPError: 429 Client Error: Too Many
+            Requests for url; login to meta family first.
+        """
+        baseurl = 'https://xtools.wmcloud.org/authorship/{url}&format=wikitext'
+        pattern = r'\[\[.+[|/](?P<user>.+)\]\]'

         self._check_wh_supported()

-        url = baseurl + '/wiki/getauthors.php?wiki={}wiki&page_id={}'.format(
-            self.site.code, self.pageid)
-        if onlynew:
-            url += '&onlynew=1'
+        if revid and date:
+            raise ValueError(
+                'You cannot specify revid together with date argument')

-        for current_retries in range(config.max_retries):
-            r = pywikibot.comms.http.fetch(url)
-            if r.status_code != 200:
-                r.raise_for_status()
+        if date is None:
+            show = revid or 0
+        else:
+            show = str(date)[:10]

-            if 'Timeout' not in r.text:  # window.setTimeout in result
-                return collections.Counter(
-                    {user: int(cnt)
-                     for user, cnt in re.findall(pattern, r.text)})
+        url = '{}.wikipedia.org/{}/{}?uselang={}'.format(
+            self.site.code,
+            self.title(as_url=True, with_ns=False, with_section=False),
+            show,
+            'en',
+        )
+        url = baseurl.format(url=url)

-            break  # T366100
+        r = pywikibot.comms.http.fetch(url)
+        if r.status_code != 200:
+            r.raise_for_status()

-            delay = pywikibot.config.retry_wait * 2 ** current_retries
-            pywikibot.warning('WikiHistory timeout.\n'
-                              f'Waiting {delay:.1f} seconds before retrying.')
-            pywikibot.sleep(delay)
-            if onlynew is None and current_retries >= config.max_retries - 2:
-                url += '&onlynew=1'
+        result: list[tuple[str, int, float]] = []
+        try:
+            table = wikitextparser.parse(r.text).tables[0]
+        except IndexError:
+            pattern = textlib.get_regexes('code')[0]
+            msg = pattern.search(r.text)[0]
+            raise pywikibot.exceptions.Error(textlib.removeHTMLParts(msg))

-        raise pywikibot.exceptions.TimeoutError(
-            'Maximum retries attempted without success.')
+        pct_sum = 0.0
+        for row in table.data():
+            if row[0] == 'Rank':
+                continue  # skip headline
+
+            rank = int(row[0])
+            user = re.match(pattern, row[1])['user']
+            chars = int(row[3].replace(',', '_'))
+            percent = float(row[4].rstrip('%'))
+
+            # take into account that data() is ordered
+            if n and rank > n or chars < min_chars or percent < min_pct:
+                break
+
+            result.append((user, chars, percent))
+            pct_sum += percent
+            if max_pct_sum and pct_sum >= max_pct_sum:
+                break
+
+        return {user: (chars, percent) for user, chars, percent in result}
diff --git a/requirements.txt b/requirements.txt
index 5da319c..9120e09 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,8 @@
 # MediaWiki markup parser
 # mwparserfromhell is mandatory but wikitextparser can be used instead
 # mwparserfromhell is still required for commons_information.py and patrol.py
-# wikitextparser>=0.47.5
+# wikitextparser is required for Page.authorship()
+wikitextparser>=0.47.5

 # OAuth support
 # mwoauth 0.2.4 is needed because it supports getting identity information
diff --git a/tests/wikiblame_tests.py b/tests/wikiblame_tests.py
index 3789b6a..0d5057c 100644
--- a/tests/wikiblame_tests.py
+++ b/tests/wikiblame_tests.py
@@ -1,16 +1,17 @@
 """Tests for the WikiHistoryMixin."""
 #
-# (C) Pywikibot team, 2022-2023
+# (C) Pywikibot team, 2022-2024
 #
 # Distributed under the terms of the MIT license.
 #
 from __future__ import annotations

+import re
 import unittest
 from contextlib import suppress

 import pywikibot
-from tests.aspects import TestCase
+from tests.aspects import TestCase, require_modules


 class TestWikiBlameMixin(TestCase):
@@ -18,17 +19,44 @@
     """Test WikiBlameMixin using nds wiki."""

     family = 'wikipedia'
-    code = 'nds'
+    code = 'nl'

+    def test_exceptions(self):
+        """Test that main_authors fails if page does not exist."""
+        page = pywikibot.Page(self.site, 'Pywikibot')
+        title = re.escape(page.title(as_link=True))
+        with self.assertRaisesRegex(pywikibot.exceptions.NoPageError,
+                                    f"Page {title} doesn't exist"):
+            page.authorship()
+
+        page = pywikibot.Page(self.site, 'Project:Pywikibot')
+        with self.assertRaisesRegex(
+                NotImplementedError,
+                'main_authors method is implemented for main namespace only'):
+            page.authorship()
+
+    @require_modules('wikitextparser')
     def test_main_authors(self):
         """Test main_authors() method."""
-        page = pywikibot.Page(self.site, 'Python (Programmeerspraak)')
-        auth = page.main_authors(onlynew=False)
+        page = pywikibot.Page(self.site, 'Python (programmeertaal)')
+        auth = page.authorship(5)
         self.assertLessEqual(len(auth), 5)
-        self.assertLessEqual(sum(auth.values()), 100)
-        user, value = auth.most_common(1)[0]
-        self.assertEqual(user, 'RebeccaBreu')
-        self.assertGreater(value, 0)
+        self.assertLessEqual(sum(pct for _, pct in auth.values()), 100)
+        user, values = next(iter(auth.items()))
+        self.assertEqual(user, 'Emperor045')
+        self.assertIsInstance(values[0], int)
+        self.assertIsInstance(values[1], float)
+
+    @require_modules('wikitextparser')
+    def test_restrictions(self):
+        """Test main_authors() method with restrictions."""
+        page = pywikibot.Page(pywikibot.Site('wikipedia:en'), 'Python')
+        auth = page.authorship(min_chars=100, min_pct=5.0)
+        self.assertLessEqual(len(auth), 4)
+        for k, (chars, pct) in auth.items():
+            with self.subTest(user=k):
+                self.assertGreaterEqual(chars, 100)
+                self.assertGreaterEqual(pct, 5.0)


 if __name__ == '__main__':
diff --git a/tox.ini b/tox.ini
index c254410..f11f8dd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -82,6 +82,7 @@
 deps =
     pytest >= 7.0.1
+    wikitextparser
    .[eventstreams]
    .[mysql]
pywikibot-commits@lists.wikimedia.org