jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators: introduce pageids generator
......................................................................
pagegenerators: introduce pageids generator
Retrieve pages based on pageids.
Requests are split into groups of pages; the number of pages per request is determined via paraminfo and the user's API limit permissions.
Pages are returned in the same order as requested.
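
A minimal usage sketch of the new generator (illustrative only; the pageids are taken from the docstring example in the diff below):

    import pywikibot
    from pywikibot import pagegenerators

    site = pywikibot.Site()
    gen = pagegenerators.PagesFromPageidGenerator('945097,1483753,956608',
                                                  site=site)
    for page in gen:
        print(page.title())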
Bug: T133209
Change-Id: Iab190339c8fa3ead62168a08c2c243a3bb1e54d5
---
M pywikibot/pagegenerators.py
M pywikibot/site.py
M tests/pagegenerators_tests.py
M tests/site_tests.py
4 files changed, 225 insertions(+), 0 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index e4bc8da..df77b2c 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -300,6 +300,10 @@
                     "-page:pagetitle", and supplied multiple times for
                     multiple pages.

+-pageid             Work on a single pageid. Argument can also be given as
+                    "-pageid:pageid1,pageid2,.." or "-pageid:'pageid1|pageid2|..'"
+                    and supplied multiple times for multiple pages.
+
 -grep               A regular expression that needs to match the article
                     otherwise the page won't be returned.
                     Multiple -grep:regexpr can be provided and the page will
@@ -758,6 +762,10 @@
             if not value:
                 value = pywikibot.input(u'What page do you want to use?')
             gen = [pywikibot.Page(pywikibot.Link(value, self.site))]
+        elif arg == '-pageid':
+            if not value:
+                value = pywikibot.input(u'What pageid do you want to use?')
+            gen = PagesFromPageidGenerator(value, site=self.site)
         elif arg == '-uncatfiles':
            gen = UnCategorizedImageGenerator(site=self.site)
         elif arg == '-uncatcat':
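For context, the same option can also be driven programmatically through GeneratorFactory, mirroring the test added further below (the pageids are illustrative):

    from pywikibot import pagegenerators

    gf = pagegenerators.GeneratorFactory()
    gf.handleArg('-pageid:945097|1483753')
    gen = gf.getCombinedGenerator()
    for page in gen:
        print(page.title())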
@@ -1336,6 +1344,25 @@
         yield pywikibot.Page(pywikibot.Link(title, site))


+def PagesFromPageidGenerator(pageids, site=None):
+    """
+    Return a page generator from pageids.
+
+    Pages are iterated in the same order as in the underlying pageids.
+    Pageids are filtered and only one page is returned in case of
+    duplicate pageids.
+
+    @param pageids: an iterable that returns pageids, or a comma-separated
+        string of pageids (e.g. '945097,1483753,956608')
+    @param site: Site for generator results.
+    @type site: L{pywikibot.site.BaseSite}
+    """
+    if site is None:
+        site = pywikibot.Site()
+
+    return site.load_pages_from_pageids(pageids)
+
+
 @deprecated_args(number='total', step=None)
 def UserContributionsGenerator(username, namespaces=None, site=None,
                                total=None, _filter_unique=filter_unique):
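The site.py change that follows batches unique pageids into groups sized from paraminfo limits before querying the API. A standalone sketch of that dedup-and-chunk pattern, stdlib only (the groupsize of 50 is illustrative; the real code reads the limit from paraminfo):

    from itertools import islice

    def unique_chunks(ids, groupsize=50):
        """Yield lists of up to groupsize ids, skipping duplicates."""
        seen = set()
        unique = (i for i in ids if not (i in seen or seen.add(i)))
        while True:
            chunk = list(islice(unique, groupsize))
            if not chunk:
                return
            yield chunk

For example, list(unique_chunks([1, 2, 2, 3], 2)) gives [[1, 2], [3]].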
diff --git a/pywikibot/site.py b/pywikibot/site.py
index 8789299..9250a5b 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -19,6 +19,7 @@
 import datetime
 import functools
 import hashlib
+import heapq
 import itertools
 import json
 import mimetypes
@@ -70,6 +71,7 @@
     manage_wrapping, MediaWikiVersion, first_upper, normalize_username,
     merge_unique_dicts, PY2,
+    filter_unique,
 )
 from pywikibot.tools.ip import is_IP
@@ -3093,6 +3095,63 @@
         return page._redirtarget

+    def load_pages_from_pageids(self, pageids):
+        """
+        Return a page generator from pageids.
+
+        Pages are iterated in the same order as in the underlying pageids.
+
+        Pageids are filtered and only one page is returned in case of
+        duplicate pageids.
+
+        @param pageids: an iterable that returns pageids (str or int),
+            or a comma- or pipe-separated string of pageids
+            (e.g. '945097,1483753,956608' or '945097|1483753|956608')
+        """
+        if isinstance(pageids, basestring):
+            pageids = pageids.replace('|', ',')
+            pageids = pageids.split(',')
+            pageids = [p.strip() for p in pageids]
+
+        # Validate pageids.
+        gen = (str(int(p)) for p in pageids if int(p) > 0)
+
+        # Find out how many pages can be specified at a time.
+        parameter = self._paraminfo.parameter('query+info', 'prop')
+        if self.logged_in() and self.has_right('apihighlimits'):
+            groupsize = int(parameter['highlimit'])
+        else:
+            groupsize = int(parameter['limit'])
+
+        for sublist in itergroup(filter_unique(gen), groupsize):
+            # Store the order of the input data.
+            priority_dict = dict(zip(sublist, range(len(sublist))))
+
+            prio_queue = []
+            next_prio = 0
+            params = {'pageids': sublist, }
+            rvgen = api.PropertyGenerator('info', site=self,
+                                          parameters=params)
+
+            for pagedata in rvgen:
+                title = pagedata['title']
+                pageid = str(pagedata['pageid'])
+                page = pywikibot.Page(pywikibot.Link(title, source=self))
+                api.update_page(page, pagedata)
+                priority, page = heapq.heappushpop(prio_queue,
+                                                   (priority_dict[pageid],
+                                                    page))
+                # Smallest priority matches the expected one; yield early.
+                if priority == next_prio:
+                    yield page
+                    next_prio += 1
+                else:
+                    # Not its turn yet; push it back onto the heap.
+                    heapq.heappush(prio_queue, (priority, page))
+
+            # Extract the remaining data in the same order as the input data.
+            while prio_queue:
+                priority, page = heapq.heappop(prio_queue)
+                yield page
+
     def preloadpages(self, pagelist, groupsize=50, templates=False,
                      langlinks=False, pageprops=False):
         """Return a generator to a list of preloaded pages.
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index ff3ef7e..dc92beb 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -191,6 +191,34 @@
         self.assertEqual(len(tuple(gen)), 9)
+class TestPagesFromPageidGenerator(TestCase):
+
+    """Test PagesFromPageidGenerator method."""
+
+    family = 'wikisource'
+    code = 'en'
+
+    base_title = 'Page:06-24-1920 -The Story of the Jones County Calf Case.pdf/%s'
+
+    def setUp(self):
+        """Set up tests."""
+        super(TestPagesFromPageidGenerator, self).setUp()
+        self.site = self.get_site()
+        self.titles = [self.base_title % i for i in range(1, 11)]
+
+    def test_PagesFromPageidGenerator(self):
+        """Test PagesFromPageidGenerator."""
+        gen_pages = pagegenerators.PagesFromTitlesGenerator(self.titles,
+                                                            self.site)
+        pageids = []
+        for page in gen_pages:
+            page.latest_revision_id  # Force page info loading.
+            pageids.append(page._pageid)
+
+        gen = pagegenerators.PagesFromPageidGenerator(pageids, self.site)
+        self.assertPagelistTitles(gen, self.titles)
+
+
 class TestCategoryFilterPageGenerator(TestCase):
"""Test CategoryFilterPageGenerator method.""" @@ -848,6 +876,30 @@ self.assertIsNotNone(gen) self.assertPagesInNamespaces(gen, set([1, 3]))
+    def test_pageid(self):
+        """Test pageid parameter."""
+        # Get reference pages and their pageids.
+        gf = pagegenerators.GeneratorFactory(site=self.get_site())
+        self.assertTrue(gf.handleArg('-prefixindex:a'))
+        gf.handleArg('-limit:10')
+        gen = gf.getCombinedGenerator()
+        pages = list(gen)
+        self.assertEqual(len(pages), 10)
+        # Pipe-separated pageids are used as the test reference.
+        pageids = '|'.join(str(page._pageid) for page in pages)
+
+        # Get by pageids.
+        gf = pagegenerators.GeneratorFactory(site=self.get_site())
+        gf.handleArg('-pageid:%s' % pageids)
+        gen = gf.getCombinedGenerator()
+        self.assertIsNotNone(gen)
+        pages_from_pageid = list(gen)
+        self.assertEqual(len(pages_from_pageid), 10)
+        for page_a, page_b in zip(pages, pages_from_pageid):
+            self.assertIsInstance(page_a, pywikibot.Page)
+            self.assertIsInstance(page_b, pywikibot.Page)
+            self.assertEqual(page_a, page_b)
+
     def test_pagegenerator(self):
         """Test page generator."""
         gf = pagegenerators.GeneratorFactory(site=self.site)
diff --git a/tests/site_tests.py b/tests/site_tests.py
index a3a2525..5c247dd 100644
--- a/tests/site_tests.py
+++ b/tests/site_tests.py
@@ -2459,6 +2459,93 @@
         self.assertTrue(site.is_uploaddisabled())
+class TestLoadPagesFromPageids(DefaultSiteTestCase):
+
+    """Test site.load_pages_from_pageids()."""
+
+    cached = True
+
+    def setUp(self):
+        """Set up tests."""
+        super(TestLoadPagesFromPageids, self).setUp()
+        self.site = self.get_site()
+        mainpage = pywikibot.Page(pywikibot.Link('Main Page', self.site))
+        self.links = list(self.site.pagelinks(mainpage, total=10))
+
+    def test_load_from_pageids_iterable_of_str(self):
+        """Test basic loading with pageids given as an iterable of str."""
+        pageids = [str(page._pageid) for page in self.links]
+        gen = self.site.load_pages_from_pageids(pageids)
+        for count, page in enumerate(gen, start=1):
+            self.assertIsInstance(page, pywikibot.Page)
+            self.assertIsInstance(page.exists(), bool)
+            if page.exists():
+                self.assertTrue(hasattr(page, '_pageid'))
+            self.assertIn(page, self.links)
+        self.assertEqual(count, len(self.links))
+
+    def test_load_from_pageids_iterable_of_int(self):
+        """Test basic loading with pageids given as an iterable of int."""
+        pageids = [page._pageid for page in self.links]
+        gen = self.site.load_pages_from_pageids(pageids)
+        for count, page in enumerate(gen, start=1):
+            self.assertIsInstance(page, pywikibot.Page)
+            self.assertIsInstance(page.exists(), bool)
+            if page.exists():
+                self.assertTrue(hasattr(page, '_pageid'))
+            self.assertIn(page, self.links)
+        self.assertEqual(count, len(self.links))
+
+    def test_load_from_pageids_iterable_in_order(self):
+        """Test that loading with pageids preserves the input order."""
+        pageids = [page._pageid for page in self.links]
+        gen = self.site.load_pages_from_pageids(pageids)
+        for page in gen:
+            link = self.links.pop(0)
+            self.assertIsInstance(page, pywikibot.Page)
+            self.assertIsInstance(page.exists(), bool)
+            if page.exists():
+                self.assertTrue(hasattr(page, '_pageid'))
+            self.assertEqual(page, link)
+
+    def test_load_from_pageids_iterable_with_duplicate(self):
+        """Test loading with duplicate pageids."""
+        pageids = [page._pageid for page in self.links]
+        pageids = pageids + pageids
+        gen = self.site.load_pages_from_pageids(pageids)
+        for count, page in enumerate(gen, start=1):
+            self.assertIsInstance(page, pywikibot.Page)
+            self.assertIsInstance(page.exists(), bool)
+            if page.exists():
+                self.assertTrue(hasattr(page, '_pageid'))
+            self.assertIn(page, self.links)
+        self.assertEqual(count, len(self.links))
+
+    def test_load_from_pageids_comma_separated(self):
+        """Test loading from a comma-separated string of pageids."""
+        pageids = ', '.join(str(page._pageid) for page in self.links)
+        gen = self.site.load_pages_from_pageids(pageids)
+        for count, page in enumerate(gen, start=1):
+            self.assertIsInstance(page, pywikibot.Page)
+            self.assertIsInstance(page.exists(), bool)
+            if page.exists():
+                self.assertTrue(hasattr(page, '_pageid'))
+            self.assertIn(page, self.links)
+        self.assertEqual(count, len(self.links))
+
+    def test_load_from_pageids_pipe_separated(self):
+        """Test loading from a pipe-separated string of pageids."""
+        pageids = '|'.join(str(page._pageid) for page in self.links)
+        gen = self.site.load_pages_from_pageids(pageids)
+        for count, page in enumerate(gen, start=1):
+            self.assertIsInstance(page, pywikibot.Page)
+            self.assertIsInstance(page.exists(), bool)
+            if page.exists():
+                self.assertTrue(hasattr(page, '_pageid'))
+            self.assertIn(page, self.links)
+        self.assertEqual(count, len(self.links))
+
+
 class TestPagePreloading(DefaultSiteTestCase):
"""Test site.preloadpages()."""
pywikibot-commits@lists.wikimedia.org