jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
[fix] re-implement Page.main_authors() using xtools

Additional parameters are used to filter the result.

Bug: T366100
Change-Id: I9bed91ae63de6d2583b9e3ccb0970900e167b340
---
M pywikibot/page/_toolforge.py
M requirements.txt
M tests/wikiblame_tests.py
M tox.ini
4 files changed, 193 insertions(+), 55 deletions(-)

diff --git a/pywikibot/page/_toolforge.py b/pywikibot/page/_toolforge.py
index ebdf5b2..b701c1f 100644
--- a/pywikibot/page/_toolforge.py
+++ b/pywikibot/page/_toolforge.py
@@ -11,9 +11,21 @@

import collections
import re
+from typing import TYPE_CHECKING

import pywikibot
-from pywikibot import config
+from pywikibot import textlib
+from pywikibot.tools import deprecated, deprecated_args
+
+try:
+ import wikitextparser
+except ImportError as e:
+ wikitextparser = e
+
+if TYPE_CHECKING:
+ import datetime
+ from pywikibot import Timestamp
+ DATETYPE = str | Timestamp | datetime.datetime | datetime.date | None


class WikiBlameMixin:
@@ -24,7 +36,10 @@
"""

#: Supported wikipedia site codes
- WIKIBLAME_CODES = 'als', 'bar', 'de', 'en', 'it', 'nds', 'sco'
+ WIKIBLAME_CODES = (
+ 'ar', 'de', 'en', 'es', 'eu', 'fr', 'hu', 'id', 'it', 'ja', 'nl', 'pl',
+ 'pt', 'tr',
+ )

def _check_wh_supported(self):
"""Check if WikiHistory is supported."""
@@ -44,68 +59,161 @@
if not self.exists():
raise pywikibot.exceptions.NoPageError(self)

- def main_authors(self, *,
- onlynew: bool | None = None) -> collections.Counter:
- """Retrieve the 5 topmost main authors of an article.
+ if isinstance(wikitextparser, ImportError):
+ raise wikitextparser

- This method uses WikiHistory to retrieve the text based main
- authorship.
+ @deprecated('authorship', since='9.3.0')
+ @deprecated_args(onlynew=None) # since 9.2.0
+ def main_authors(self) -> collections.Counter[str]:
+ """Retrieve the 5 topmost main authors of an article.

Sample:

>>> import pywikibot
- >>> site = pywikibot.Site('wikipedia:nds')
- >>> page = pywikibot.Page(site, 'Python (Programmeerspraak)')
- >>> auth = page.main_authors(onlynew=False)
- >>> auth
- Counter({'RebeccaBreu': 99, 'Slomox': 1})
+ >>> site = pywikibot.Site('wikipedia:eu')
+ >>> page = pywikibot.Page(site, 'Python (informatika)')
+ >>> auth = page.main_authors()
+ >>> auth.most_common(1)
+ [('Ksarasola', 80)]

- .. note:: Only implemented for main namespace pages.
- .. note:: Only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
- .. attention:: This method does not return new results due to
- :phab:`366100`.
+ .. important:: Only implemented for main namespace pages and
+ only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
.. seealso::
- https://wikihistory.toolforge.org
- https://de.wikipedia.org/wiki/Wikipedia:Technik/Cloud/wikihistory
+ - https://xtools.wmcloud.org/authorship/

.. versionchanged:: 9.2
do not use any wait cycles due to :phab:`366100`.
+ .. versionchanged:: 9.3
+ https://xtools.wmcloud.org/authorship/ is used to retrieve
+ authors
+ .. deprecated:: 9.3
+ use :meth:`authorship` instead.

- :param onlynew: Currently meaningless
- :return: Number of edits for each username
- :raise NotImplementedError: unsupported site or unsupported namespace
- :raise pywikibot.exceptions.NoPageError: The page does not exist
- :raise pywikibot.exceptions.TimeoutError: No cached results found
+ :return: Percentage of edits for each username
+
+ :raise ImportError: missing ``wikitextparser`` module.
+ :raise NotImplementedError: unsupported site or unsupported
+ namespace.
+ :raise Error: Error response from xtools.
+ :raise NoPageError: The page does not exist.
+ :raise requests.exceptions.HTTPError: 429 Client Error: Too Many
+ Requests for url; login to meta family first.
"""
- baseurl = 'https://wikihistory.toolforge.org'
- pattern = (r'><bdi>(?P<author>.+?)</bdi></a>\s'
- r'\((?P<percent>\d{1,3})&')
+ return collections.Counter(
+ {user: int(cnt) for user, (_, cnt) in self.authorship(5).items()})
+
+ def authorship(
+ self,
+ n: int | None = None,
+ *,
+ min_chars: int = 0,
+ min_pct: float = 0.0,
+ max_pct_sum: float | None = None,
+ revid: int | None = None,
+ date: DATETYPE = None,
+ ) -> dict[str, tuple[int, float]]:
+ """Retrieve authorship attribution of an article.
+
+ This method uses XTools/Authorship to retrieve the authors
+ measured by character count.
+
+ Sample:
+
+ >>> import pywikibot
+ >>> site = pywikibot.Site('wikipedia:en')
+ >>> page = pywikibot.Page(site, 'Pywikibot')
+ >>> auth = page.authorship()
+ >>> auth
+ {'1234qwer1234qwer4': (68, 100.0)}
+
+ .. important:: Only implemented for main namespace pages and
+ only wikipedias of :attr:`WIKIBLAME_CODES` are supported.
+ .. seealso::
+ - https://xtools.wmcloud.org/authorship/
+ - https://www.mediawiki.org/wiki/XTools/Authorship
+ - https://www.mediawiki.org/wiki/WikiWho
+
+ .. versionadded:: 9.3
+ this method replaces :meth:`main_authors`.
+
+ :param n: Only return the first *n* or fewer authors.
+ :param min_chars: Only return authors with more than *min_chars*
+ chars changes.
+ :param min_pct: Only return authors with more than *min_pct*
+ percentage edits.
+ :param max_pct_sum: Only return authors until the percentage sum
+ reaches *max_pct_sum*.
+ :param revid: The revision id for which the authors should be
+ found. If ``None`` or ``0``, the latest revision is used.
+ Cannot be used together with *date*.
+ :param date: The revision date for which the authors should be found.
+ If ``None``, it will be ignored. Cannot be used together
+ with *revid*. If the parameter is a string it must be given
+ in the form ``YYYY-MM-DD``
+ :return: Character count and percentage of edits for each
+ username.
+
+ :raise ImportError: missing ``wikitextparser`` module
+ :raise NotImplementedError: unsupported site or unsupported
+ namespace.
+ :raise Error: Error response from xtools.
+ :raise NoPageError: The page does not exist.
+ :raise requests.exceptions.HTTPError: 429 Client Error: Too Many
+ Requests for url; login to meta family first.
+ """
+ baseurl = 'https://xtools.wmcloud.org/authorship/{url}&format=wikitext'
+ pattern = r'\[\[.+[|/](?P<user>.+)\]\]'

self._check_wh_supported()

- url = baseurl + '/wiki/getauthors.php?wiki={}wiki&page_id={}'.format(
- self.site.code, self.pageid)
- if onlynew:
- url += '&onlynew=1'
+ if revid and date:
+ raise ValueError(
+ 'You cannot specify revid together with date argument')

- for current_retries in range(config.max_retries):
- r = pywikibot.comms.http.fetch(url)
- if r.status_code != 200:
- r.raise_for_status()
+ if date is None:
+ show = revid or 0
+ else:
+ show = str(date)[:10]

- if 'Timeout' not in r.text: # window.setTimeout in result
- return collections.Counter(
- {user: int(cnt)
- for user, cnt in re.findall(pattern, r.text)})
+ url = '{}.wikipedia.org/{}/{}?uselang={}'.format(
+ self.site.code,
+ self.title(as_url=True, with_ns=False, with_section=False),
+ show,
+ 'en',
+ )
+ url = baseurl.format(url=url)

- break # T366100
+ r = pywikibot.comms.http.fetch(url)
+ if r.status_code != 200:
+ r.raise_for_status()

- delay = pywikibot.config.retry_wait * 2 ** current_retries
- pywikibot.warning('WikiHistory timeout.\n'
- f'Waiting {delay:.1f} seconds before retrying.')
- pywikibot.sleep(delay)
- if onlynew is None and current_retries >= config.max_retries - 2:
- url += '&onlynew=1'
+ result: list[list[str]] = []
+ try:
+ table = wikitextparser.parse(r.text).tables[0]
+ except IndexError:
+ pattern = textlib.get_regexes('code')[0]
+ msg = pattern.search(r.text)[0]
+ raise pywikibot.exceptions.Error(textlib.removeHTMLParts(msg))

- raise pywikibot.exceptions.TimeoutError(
- 'Maximum retries attempted without success.')
+ pct_sum = 0.0
+ for row in table.data():
+ if row[0] == 'Rank':
+ continue # skip headline
+
+ rank = int(row[0])
+ user = re.match(pattern, row[1])['user']
+ chars = int(row[3].replace(',', '_'))
+ percent = float(row[4].rstrip('%'))
+
+ # take into account that data() is ordered
+ if n and rank > n or chars < min_chars or percent < min_pct:
+ break
+
+ result.append((user, chars, percent))
+ pct_sum += percent
+ if max_pct_sum and pct_sum >= max_pct_sum:
+ break
+
+ return {user: (chars, percent) for user, chars, percent in result}
diff --git a/requirements.txt b/requirements.txt
index 5da319c..9120e09 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,7 +26,8 @@
# MediaWiki markup parser
# mwparserfromhell is mandatory but wikitextparser can be used instead
# mwparserfromhell is still required for commons_information.py and patrol.py
-# wikitextparser>=0.47.5
+# wikitextparser is required for Page.authorship()
+wikitextparser>=0.47.5

# OAuth support
# mwoauth 0.2.4 is needed because it supports getting identity information
diff --git a/tests/wikiblame_tests.py b/tests/wikiblame_tests.py
index 3789b6a..0d5057c 100644
--- a/tests/wikiblame_tests.py
+++ b/tests/wikiblame_tests.py
@@ -1,16 +1,17 @@
"""Tests for the WikiHistoryMixin."""
#
-# (C) Pywikibot team, 2022-2023
+# (C) Pywikibot team, 2022-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations

+import re
import unittest
from contextlib import suppress

import pywikibot
-from tests.aspects import TestCase
+from tests.aspects import TestCase, require_modules


class TestWikiBlameMixin(TestCase):
@@ -18,17 +19,44 @@
"""Test WikiBlameMixin using nds wiki."""

family = 'wikipedia'
- code = 'nds'
+ code = 'nl'

+ def test_exceptions(self):
+ """Test that main_authors fails if page does not exist."""
+ page = pywikibot.Page(self.site, 'Pywikibot')
+ title = re.escape(page.title(as_link=True))
+ with self.assertRaisesRegex(pywikibot.exceptions.NoPageError,
+ f"Page {title} doesn't exist"):
+ page.authorship()
+
+ page = pywikibot.Page(self.site, 'Project:Pywikibot')
+ with self.assertRaisesRegex(
+ NotImplementedError,
+ 'main_authors method is implemented for main namespace only'):
+ page.authorship()
+
+ @require_modules('wikitextparser')
def test_main_authors(self):
"""Test main_authors() method."""
- page = pywikibot.Page(self.site, 'Python (Programmeerspraak)')
- auth = page.main_authors(onlynew=False)
+ page = pywikibot.Page(self.site, 'Python (programmeertaal)')
+ auth = page.authorship(5)
self.assertLessEqual(len(auth), 5)
- self.assertLessEqual(sum(auth.values()), 100)
- user, value = auth.most_common(1)[0]
- self.assertEqual(user, 'RebeccaBreu')
- self.assertGreater(value, 0)
+ self.assertLessEqual(sum(pct for _, pct in auth.values()), 100)
+ user, values = next(iter(auth.items()))
+ self.assertEqual(user, 'Emperor045')
+ self.assertIsInstance(values[0], int)
+ self.assertIsInstance(values[1], float)
+
+ @require_modules('wikitextparser')
+ def test_restrictions(self):
+ """Test main_authors() method with restrictions."""
+ page = pywikibot.Page(pywikibot.Site('wikipedia:en'), 'Python')
+ auth = page.authorship(min_chars=100, min_pct=5.0)
+ self.assertLessEqual(len(auth), 4)
+ for k, (chars, pct) in auth.items():
+ with self.subTest(user=k):
+ self.assertGreaterEqual(chars, 100)
+ self.assertGreaterEqual(pct, 5.0)


if __name__ == '__main__':
diff --git a/tox.ini b/tox.ini
index c254410..f11f8dd 100644
--- a/tox.ini
+++ b/tox.ini
@@ -82,6 +82,7 @@

deps =
pytest >= 7.0.1
+ wikitextparser
.[eventstreams]
.[mysql]


To view, visit change 1037928. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-MessageType: merged
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I9bed91ae63de6d2583b9e3ccb0970900e167b340
Gerrit-Change-Number: 1037928
Gerrit-PatchSet: 14
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot