jenkins-bot has submitted this change and it was merged.
Change subject: site.py: yield preloaded pages in the same order as requested ......................................................................
site.py: yield preloaded pages in the same order as requested
Change-Id: I3d1400b27fd61fe14b13dbe6138e310cdbe3048c --- M pywikibot/site.py M tests/pagegenerators_tests.py M tests/site_tests.py 3 files changed, 89 insertions(+), 26 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/site.py b/pywikibot/site.py index 2800514..a43aa0f 100644 --- a/pywikibot/site.py +++ b/pywikibot/site.py @@ -19,6 +19,7 @@ import datetime import functools import hashlib +import heapq import itertools import json import mimetypes @@ -3091,38 +3092,55 @@ langlinks=False, pageprops=False): """Return a generator to a list of preloaded pages.
- Note that [at least in current implementation] pages may be iterated - in a different order than in the underlying pagelist. + Pages are iterated in the same order as in the underlying pagelist. + In case of duplicates in a groupsize batch, return the first entry.
@param pagelist: an iterable that returns Page objects @param groupsize: how many Pages to query at a time @type groupsize: int - @param templates: preload list of templates in the pages - @param langlinks: preload list of language links found in the pages + @param templates: preload pages (typically templates) transcluded in + the provided pages + @type templates: bool + @param langlinks: preload all language links from the provided pages + to other languages + @type langlinks: bool + @param pageprops: preload various properties defined in the page content + @type pageprops: bool
""" + props = 'revisions|info|categoryinfo' + if templates: + props += '|templates' + if langlinks: + props += '|langlinks' + if pageprops: + props += '|pageprops' + + rvprop = ['ids', 'flags', 'timestamp', 'user', 'comment', 'content'] + for sublist in itergroup(pagelist, groupsize): + # Do not use p.pageid property as it will force page loading. pageids = [str(p._pageid) for p in sublist if hasattr(p, "_pageid") and p._pageid > 0] - cache = dict((p.title(withSection=False), p) for p in sublist) + cache = {} + # In case of duplicates, return the first entry. + for priority, page in enumerate(sublist): + cache.setdefault(page.title(withSection=False), + (priority, page))
- props = "revisions|info|categoryinfo" - if templates: - props += '|templates' - if langlinks: - props += '|langlinks' - if pageprops: - props += '|pageprops' + prio_queue = [] + next_prio = 0 rvgen = api.PropertyGenerator(props, site=self) rvgen.set_maximum_items(-1) # suppress use of "rvlimit" parameter if len(pageids) == len(sublist): # only use pageids if all pages have them - rvgen.request["pageids"] = "|".join(pageids) + rvgen.request['pageids'] = set(pageids) else: - rvgen.request["titles"] = "|".join(list(cache.keys())) - rvgen.request[u"rvprop"] = u"ids|flags|timestamp|user|comment|content" + rvgen.request['titles'] = list(cache.keys()) + rvgen.request['rvprop'] = rvprop pywikibot.output(u"Retrieving %s pages from %s." % (len(cache), self)) + for pagedata in rvgen: pywikibot.debug(u"Preloading %s" % pagedata, _logger) try: @@ -3148,8 +3166,20 @@ pywikibot.debug(u"pageids=%s" % pageids, _logger) pywikibot.debug(u"titles=%s" % list(cache.keys()), _logger) continue - page = cache[pagedata['title']] + priority, page = cache[pagedata['title']] api.update_page(page, pagedata, rvgen.props) + priority, page = heapq.heappushpop(prio_queue, (priority, page)) + # Smallest priority matches expected one; yield. + if priority == next_prio: + yield page + next_prio += 1 + else: + # Push back onto the heap. + heapq.heappush(prio_queue, (priority, page)) + + # Empty the heap. + while prio_queue: + priority, page = heapq.heappop(prio_queue) yield page
def validate_tokens(self, types): diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index b5b8f57..5148716 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -481,30 +481,46 @@ def test_basic(self): """Test PreloadingGenerator with a list of pages.""" mainpage = self.get_mainpage() - links = list(self.site.pagelinks(mainpage, total=10)) + links = [page for page in self.site.pagelinks(mainpage, total=20) + if page.exists()] count = 0 for page in PreloadingGenerator(links, groupsize=20): self.assertIsInstance(page, pywikibot.Page) self.assertIsInstance(page.exists(), bool) - if page.exists(): - self.assertEqual(len(page._revisions), 1) - self.assertIsNotNone(page._revisions[page._revid].text) - self.assertFalse(hasattr(page, '_pageprops')) + self.assertEqual(len(page._revisions), 1) + self.assertIsNotNone(page._revisions[page._revid].text) + self.assertFalse(hasattr(page, '_pageprops')) count += 1 self.assertEqual(len(links), count)
def test_low_step(self): """Test PreloadingGenerator with a list of pages.""" mainpage = self.get_mainpage() - links = list(self.site.pagelinks(mainpage, total=20)) + links = [page for page in self.site.pagelinks(mainpage, total=20) + if page.exists()] count = 0 for page in PreloadingGenerator(links, groupsize=10): self.assertIsInstance(page, pywikibot.Page) self.assertIsInstance(page.exists(), bool) - if page.exists(): - self.assertEqual(len(page._revisions), 1) - self.assertIsNotNone(page._revisions[page._revid].text) - self.assertFalse(hasattr(page, '_pageprops')) + self.assertEqual(len(page._revisions), 1) + self.assertIsNotNone(page._revisions[page._revid].text) + self.assertFalse(hasattr(page, '_pageprops')) + count += 1 + self.assertEqual(len(links), count) + + def test_order(self): + """Test that outcome follows the same order as input.""" + mainpage = self.get_mainpage() + links = [page for page in self.site.pagelinks(mainpage, total=20) + if page.exists()] + count = 0 + for page in PreloadingGenerator(links, groupsize=10): + self.assertIsInstance(page, pywikibot.Page) + self.assertIsInstance(page.exists(), bool) + self.assertEqual(len(page._revisions), 1) + self.assertIsNotNone(page._revisions[page._revid].text) + self.assertFalse(hasattr(page, '_pageprops')) + self.assertEqual(page, links[count]) count += 1 self.assertEqual(len(links), count)
diff --git a/tests/site_tests.py b/tests/site_tests.py index 677e32d..22b59de 100644 --- a/tests/site_tests.py +++ b/tests/site_tests.py @@ -2463,6 +2463,23 @@
"""Test site.preloadpages()."""
+ def test_order(self): + """Test that outcome follows the same order as input.""" + mainpage = self.get_mainpage() + links = [page for page in self.site.pagelinks(mainpage, total=20) + if page.exists()] + pages = list(self.site.preloadpages(links, groupsize=5)) + self.assertEqual(pages, links) + + def test_duplicates(self): + """Test that duplicate pages are yielded only once, in input order.""" + mainpage = self.get_mainpage() + links = [page for page in self.site.pagelinks(mainpage, total=20) + if page.exists()] + dupl_links = links + links[::-1] + pages = list(self.site.preloadpages(dupl_links, groupsize=40)) + self.assertEqual(pages, links) + def test_pageids(self): """Test basic preloading with pageids.""" mysite = self.get_site()