jenkins-bot merged this change.
Use a key for filter_unique where appropriate
filter_unique can consume a lot of memory because, when no `key` is
provided, it stores every returned object in a set. (T199615)
This patch adds an appropriate `key` to filter_unique calls wherever
the aforementioned issue might be a problem.
filter_unique has several usages:
- tools_tests.py:
We don't have any memory issues here.
- site.py:
No change is needed. The only usage of filter_unique is to filter
unique pageids, which do not consume much memory.
- pagegenerators.py, scripts/template.py, scripts/nowcommons.py:
filter_unique is applied to page generators. Use
'{}:{}:{}'.format(*page._cmpkey()) as the key.
This patch set is proposed as an alternative to [1]; it avoids
making `hash` the default key of `filter_unique`.
[1]: https://gerrit.wikimedia.org/r/c/pywikibot/core/+/445854/
Bug: T199615
Change-Id: Ia701e9f8bece713174ea4bccd104c0ae25ff5c1d
---
M pywikibot/pagegenerators.py
M scripts/nowcommons.py
M scripts/template.py
3 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 898237c..29e058a 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -29,6 +29,7 @@
import sys
from datetime import timedelta
+from functools import partial
from warnings import warn
import pywikibot
@@ -403,13 +404,12 @@
"""
# This is the function that will be used to de-duplicate iterators.
- # See the documentation in L{pywikibot.tools.filter_unique} for reasons
- # why this should be changed to improve space and time of execution.
- _filter_unique = staticmethod(filter_unique)
+ _filter_unique = staticmethod(partial(
+ filter_unique, key=lambda p: '{}:{}:{}'.format(*p._cmpkey())))
# The seen list can not yet be shared at present, due to `intersect` mode
# not being known until after all generators have been created.
# When not in intersect mode, _filter_unique could be:
- # functools.partial(filter_unique, container=global_seen_list)
+ # functools.partial(filter_unique, container=global_seen_list, key=...)
def __init__(self, site=None, positional_arg_name=None):
"""
@@ -1619,7 +1619,6 @@
@type namespaces: list of int
@param site: Site for generator results.
@type site: L{pywikibot.site.BaseSite}
-
"""
if site is None:
site = pywikibot.Site()
@@ -1630,9 +1629,9 @@
.format(user.username, site))
return _filter_unique(
- contrib[0]
- for contrib in user.contributions(namespaces=namespaces, total=total)
- )
+ (contrib[0] for contrib in user.contributions(
+ namespaces=namespaces, total=total)),
+ key=lambda p: '{}:{}:{}'.format(*p._cmpkey()))
def NamespaceFilterPageGenerator(generator, namespaces, site=None):
diff --git a/scripts/nowcommons.py b/scripts/nowcommons.py
index a82f52a..b29d7df 100755
--- a/scripts/nowcommons.py
+++ b/scripts/nowcommons.py
@@ -222,7 +222,7 @@
only_template_inclusion=True)
for t in self.nc_templates)
gen = chain(*gens)
- gen = filter_unique(gen)
+ gen = filter_unique(gen, key=lambda p: '{}:{}:{}'.format(*p._cmpkey()))
gen = pg.PreloadingGenerator(gen)
return gen
diff --git a/scripts/template.py b/scripts/template.py
index 570e23c..0a729b9 100755
--- a/scripts/template.py
+++ b/scripts/template.py
@@ -359,7 +359,7 @@
for t in old_templates
)
gen = chain(*gens)
- gen = filter_unique(gen)
+ gen = filter_unique(gen, key=lambda p: '{}:{}:{}'.format(*p._cmpkey()))
if user:
gen = pagegenerators.UserEditFilterGenerator(gen, user, timestamp,
skip,
To view, visit change 451824. To unsubscribe, or for help writing mail filters, visit settings.