jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/1022005?usp=email )
Change subject: [fix] use filter_unique() in Site.alllinks() for MW >= 1.43 ......................................................................
[fix] use filter_unique() in Site.alllinks() for MW >= 1.43
The unique parameter is currently not supported with MW 1.43 and it might be dropped in miser mode. Therefore use filter_unique() to ensure that unique pages are returned.
Bug: T359427 Change-Id: I16b7bd439dccfcc67b814e913955dea02a2700b4 --- M pywikibot/site/_generators.py M tests/site_generators_tests.py 2 files changed, 26 insertions(+), 14 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/site/_generators.py b/pywikibot/site/_generators.py index 457ae3c..a1be9e6 100644 --- a/pywikibot/site/_generators.py +++ b/pywikibot/site/_generators.py @@ -984,33 +984,47 @@ ) -> Generator[pywikibot.Page, None, None]: """Iterate all links to pages (which need not exist) in one namespace.
- Note that, in practice, links that were found on pages that have - been deleted may not have been removed from the links table, so this - method can return false positives. + .. note:: In practice, links that were found on pages that have + been deleted may not have been removed from the links table, + so this method can return false positives. + + .. caution:: *unique* parameter is no longer supported by + MediaWiki 1.43 or higher. Pywikibot uses + :func:`tools.itertools.filter_unique` in that case which + might be memory intensive. Use it with care.
.. seealso:: :api:`Alllinks`
:param start: Start at this title (page need not exist). :param prefix: Only yield pages starting with this string. :param namespace: Iterate pages from this (single) namespace - :param unique: If True, only iterate each link title once (default: - iterate once for each linking page) - :param fromids: if True, include the pageid of the page containing - each link (default: False) as the '_fromid' attribute of the Page; - cannot be combined with unique - :raises KeyError: the namespace identifier was not resolved - :raises TypeError: the namespace identifier has an inappropriate - type such as bool, or an iterable with more than one namespace + :param unique: If True, only iterate each link title once + (default: False) + :param fromids: if True, include the pageid of the page + containing each link (default: False) as the '_fromid' + attribute of the Page; cannot be combined with *unique* + :raises KeyError: the *namespace* identifier was not resolved + :raises TypeError: the *namespace* identifier has an + inappropriate type such as bool, or an iterable with more + than one namespace """ if unique and fromids: raise Error('alllinks: unique and fromids cannot both be True.') algen = self._generator(api.ListGenerator, type_arg='alllinks', namespaces=namespace, alfrom=start, - total=total, alunique=unique) + total=total) if prefix: algen.request['alprefix'] = prefix if fromids: algen.request['alprop'] = 'title|ids' + if not unique: + pass + elif self.mw_version < '1.43': + algen.request['alunique'] = True + else: + # unique filter for mw >= 1.43, use (title, ns) as key + # See: T359425, T359427 + algen = filter_unique(algen, key=lambda x: (x['title'], x['ns'])) for link in algen: p = pywikibot.Page(self, link['title'], link['ns']) if fromids: diff --git a/tests/site_generators_tests.py b/tests/site_generators_tests.py index 87f97ba..bc56f4a 100755 --- a/tests/site_generators_tests.py +++ b/tests/site_generators_tests.py @@ -338,8 +338,6 @@ def test_all_links(self): 
"""Test the site.alllinks() method.""" mysite = self.get_site() - if mysite.sitename in ('wikipedia:de', 'wikipedia:en'): - self.skipTest(f'skipping test on {mysite} due to T359427') fwd = list(mysite.alllinks(total=10)) uniq = list(mysite.alllinks(total=10, unique=True))
pywikibot-commits@lists.wikimedia.org