jenkins-bot submitted this change.

View Change

Approvals: Matěj Suchánek: Looks good to me, but someone else must approve Xqt: Looks good to me, approved jenkins-bot: Verified
[IMPR] use *iterables instead of genlist in intersect_generators

- as is common in itertools, enable multiple arguments for *iterables
instead of a single genlist list
- deprecate the old behaviour
- force allow_duplicates to be a keyword argument and deprecate
the unwanted usage as a positional argument
- early return if there are fewer than 2 iterables
- don't import Counter again
- update documentation and add a sample
- update tests accordingly

Change-Id: Iec0fc7f9c220883b8f3b5e76786539b9320bb3f2
---
M pywikibot/pagegenerators.py
M pywikibot/tools/__init__.py
M tests/thread_tests.py
3 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 5be66b9..1545ca2 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -593,7 +593,7 @@
'"-intersect" ignored as only one generator is specified.')
elif self.intersect:
# By definition no duplicates are possible.
- dupfiltergen = intersect_generators(self.gens)
+ dupfiltergen = intersect_generators(*self.gens)
else:
dupfiltergen = _filter_unique_pages(itertools.chain(*self.gens))

diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 52b40ff..996efd8 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -926,11 +926,10 @@
.format(thd, thd.queue.qsize()), self._logger)


-def intersect_generators(genlist, allow_duplicates=False):
- """
- Intersect generators listed in genlist.
+def intersect_generators(*iterables, allow_duplicates: bool = False):
+ """Intersect generators listed in iterables.

- Yield items only if they are yielded by all generators in genlist.
+ Yield items only if they are yielded by all generators of iterables.
Threads (via ThreadedGenerator) are used in order to run generators
in parallel, so that items can be yielded before generators are
exhausted.
@@ -939,13 +938,42 @@
Quitting before all generators are finished is attempted if
there is no more chance of finding an item in all queues.

- :param genlist: list of page generators
- :type genlist: list
- :param allow_duplicates: allow duplicates if present in all generators
- :type allow_duplicates: bool
+ Sample:
+ >>> iterables = 'mississippi', 'missouri'
+ >>> list(intersect_generators(*iterables))
+ ['m', 'i', 's']
+ >>> list(intersect_generators(*iterables, allow_duplicates=True))
+ ['m', 'i', 's', 's', 'i']
+
+ :param iterables: page generators
+ :param allow_duplicates: optional keyword argument to allow duplicates
+ if present in all generators
"""
+ # 'allow_duplicates' must be given as keyword argument
+ if iterables and iterables[-1] in (True, False):
+ allow_duplicates = iterables[-1]
+ iterables = iterables[:-1]
+ issue_deprecation_warning("'allow_duplicates' as positional argument",
+ 'keyword argument "allow_duplicates={}"'
+ .format(allow_duplicates),
+ since='6.4.0')
+
+ # iterables must not be given as tuple or list
+ if len(iterables) == 1 and isinstance(iterables[0], (list, tuple)):
+ iterables = iterables[0]
+ issue_deprecation_warning("'iterables' as list type",
+ "consecutive iterables or use '*' to unpack",
+ since='6.4.0')
+
+ if not iterables:
+ return
+
+ if len(iterables) == 1:
+ yield from iterables[0]
+ return
+
# If any generator is empty, no pages are going to be returned
- for source in genlist:
+ for source in iterables:
if not source:
debug('At least one generator ({!r}) is empty and execution was '
'skipped immediately.'.format(source), 'intersect')
@@ -953,20 +981,19 @@

# Item is cached to check that it is found n_gen
# times before being yielded.
- from collections import Counter
- cache = collections.defaultdict(Counter)
- n_gen = len(genlist)
+ cache = collections.defaultdict(collections.Counter)
+ n_gen = len(iterables)

# Class to keep track of alive threads.
# Start new threads and remove completed threads.
thrlist = ThreadList()

- for source in genlist:
+ for source in iterables:
threaded_gen = ThreadedGenerator(name=repr(source), target=source)
threaded_gen.daemon = True
thrlist.append(threaded_gen)

- ones = Counter(thrlist)
+ ones = collections.Counter(thrlist)
seen = {}

while True:
diff --git a/tests/thread_tests.py b/tests/thread_tests.py
index dfc7c07..44ec655 100644
--- a/tests/thread_tests.py
+++ b/tests/thread_tests.py
@@ -49,7 +49,7 @@
# first otherwise the generator is empty the second time.
datasets = [list(gen) for gen in gens]
set_result = set(datasets[0]).intersection(*datasets[1:])
- result = list(intersect_generators(datasets))
+ result = list(intersect_generators(*datasets))

self.assertCountEqual(set(result), result)
self.assertCountEqual(result, set_result)
@@ -63,7 +63,7 @@
for dataset in datasets[1:]:
counter_result = counter_result & Counter(dataset)
counter_result = list(counter_result.elements())
- result = list(intersect_generators(datasets, allow_duplicates=True))
+ result = list(intersect_generators(*datasets, allow_duplicates=True))
self.assertCountEqual(counter_result, result)



To view, visit change 700445. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Iec0fc7f9c220883b8f3b5e76786539b9320bb3f2
Gerrit-Change-Number: 700445
Gerrit-PatchSet: 7
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged