jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/631282 )
Change subject: [bugfix] Avoid duplicates in intersect_generators()
......................................................................
[bugfix] Avoid duplicates in intersect_generators()
Avoid duplicates in intersect_generators().
Add also the possibility to have duplicates if all generators contain
the same page several times.
The solution uses a cache, so for infinite streams there will be memory
issues.
Bug: T263947
Change-Id: I86adcacded455ecd87a35b140dd371cc6caca5a4
---
M pywikibot/tools/__init__.py
M tests/thread_tests.py
2 files changed, 47 insertions(+), 15 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 10bf6bc..b44f246 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -692,7 +692,7 @@
.format(thd, thd.queue.qsize()), self._logger)
-def intersect_generators(genlist):
+def intersect_generators(genlist, allow_duplicates=False):
"""
Intersect generators listed in genlist.
@@ -707,6 +707,8 @@
@param genlist: list of page generators
@type genlist: list
+ @param allow_duplicates: allow duplicates if present in all generators
+ @type allow_duplicates: bool
"""
# If any generator is empty, no pages are going to be returned
for source in genlist:
@@ -717,7 +719,8 @@
# Item is cached to check that it is found n_gen
# times before being yielded.
- cache = collections.defaultdict(set)
+ from collections import Counter
+ cache = collections.defaultdict(Counter)
n_gen = len(genlist)
# Class to keep track of alive threads.
@@ -729,6 +732,9 @@
threaded_gen.daemon = True
thrlist.append(threaded_gen)
+ ones = Counter(thrlist)
+ seen = {}
+
while True:
# Get items from queues in a round-robin way.
for t in thrlist:
@@ -736,14 +742,23 @@
# TODO: evaluate if True and timeout is necessary.
item = t.queue.get(True, 0.1)
- # Cache entry is a set of thread.
- # Duplicates from same thread are not counted twice.
- cache[item].add(t)
+ if not allow_duplicates and hash(item) in seen:
+ continue
+
+ # Cache entry is a Counter of ThreadedGenerator objects.
+ cache[item].update([t])
if len(cache[item]) == n_gen:
- yield item
- # Remove item from cache.
- # No chance of seeing it again (see later: early stop).
- cache.pop(item)
+ if allow_duplicates:
+ yield item
+ # Remove item from cache if possible.
+ if all(el == 1 for el in cache[item].values()):
+ cache.pop(item)
+ else:
+ cache[item] -= ones
+ else:
+ yield item
+ cache.pop(item)
+ seen[hash(item)] = True
active = thrlist.active_count()
max_cache = n_gen
diff --git a/tests/thread_tests.py b/tests/thread_tests.py
index 615395b..1516b0e 100644
--- a/tests/thread_tests.py
+++ b/tests/thread_tests.py
@@ -5,6 +5,7 @@
#
# Distributed under the terms of the MIT license.
#
+from collections import Counter
from contextlib import suppress
from tests.aspects import unittest, TestCase
@@ -53,6 +54,18 @@
self.assertCountEqual(set(result), result)
self.assertCountEqual(result, set_result)
+ def assertEqualItertoolsWithDuplicates(self, gens):
+ """Assert intersect_generators result equals Counter
intersection."""
+ # If they are a generator, we need to convert to a list
+ # first otherwise the generator is empty the second time.
+ datasets = [list(gen) for gen in gens]
+ counter_result = Counter(datasets[0])
+ for dataset in datasets[1:]:
+ counter_result = counter_result & Counter(dataset)
+ counter_result = list(counter_result.elements())
+ result = list(intersect_generators(datasets, allow_duplicates=True))
+ self.assertCountEqual(counter_result, result)
+
class BasicGeneratorIntersectTestCase(GeneratorIntersectTestCase):
@@ -61,17 +74,21 @@
net = False
def test_intersect_basic(self):
- """Test basic interset without duplicates."""
+ """Test basic intersect without duplicates."""
self.assertEqualItertools(['abc', 'db', 'ba'])
def test_intersect_with_dups(self):
- """Test basic interset with duplicates."""
+ """Test basic intersect with duplicates."""
self.assertEqualItertools(['aabc', 'dddb', 'baa'])
- @unittest.expectedFailure
- def test_intersect_with_dups_failing(self):
- """Failing basic intersect test with
duplicates."""
- self.assertEqualItertools(['abb', 'bb'])
+ def test_intersect_with_accepted_dups(self):
+ """Test intersect with duplicates accepted."""
+ self.assertEqualItertoolsWithDuplicates(['abc', 'db',
'ba'])
+ self.assertEqualItertoolsWithDuplicates(['aabc', 'dddb',
'baa'])
+ self.assertEqualItertoolsWithDuplicates(['abb', 'bb'])
+ self.assertEqualItertoolsWithDuplicates(['bb', 'abb'])
+ self.assertEqualItertoolsWithDuplicates(['abbcd', 'abcba'])
+ self.assertEqualItertoolsWithDuplicates(['abcba', 'abbcd'])
if __name__ == '__main__': # pragma: no cover
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/631282
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I86adcacded455ecd87a35b140dd371cc6caca5a4
Gerrit-Change-Number: 631282
Gerrit-PatchSet: 2
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged