jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators: introduce pageids generator
......................................................................
pagegenerators: introduce pageids generator
Retrieve pages based on pageids.
Split requests into groups of pages; the number of pages per request
is determined from paraminfo information and the user's limit permissions.
Pages are returned in the same order as requested.
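
For illustration, a minimal usage sketch of the new generator (the pageids
are the placeholder values from the docstring example; any valid ids work):

    import pywikibot
    from pywikibot import pagegenerators

    site = pywikibot.Site('en', 'wikisource')
    # Pages come back in the same order as the requested pageids.
    for page in pagegenerators.PagesFromPageidGenerator(
            '945097,1483753,956608', site=site):
        print(page.title())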
Bug: T133209
Change-Id: Iab190339c8fa3ead62168a08c2c243a3bb1e54d5
---
M pywikibot/pagegenerators.py
M pywikibot/site.py
M tests/pagegenerators_tests.py
M tests/site_tests.py
4 files changed, 225 insertions(+), 0 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index e4bc8da..df77b2c 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -300,6 +300,10 @@
"-page:pagetitle", and supplied multiple times for
multiple pages.
+-pageid Work on a single pageid. Argument can also be given as
+ "-pageid:pageid1,pageid2,.." or
+ "-pageid:'pageid1|pageid2|..'"
+ and supplied multiple times for multiple pages.
+
-grep A regular expression that needs to match the article
otherwise the page won't be returned.
Multiple -grep:regexpr can be provided and the page will
@@ -758,6 +762,10 @@
if not value:
value = pywikibot.input(u'What page do you want to use?')
gen = [pywikibot.Page(pywikibot.Link(value, self.site))]
+ elif arg == '-pageid':
+ if not value:
+ value = pywikibot.input(u'What pageid do you want to use?')
+ gen = PagesFromPageidGenerator(value, site=self.site)
elif arg == '-uncatfiles':
gen = UnCategorizedImageGenerator(site=self.site)
elif arg == '-uncatcat':
@@ -1336,6 +1344,25 @@
yield pywikibot.Page(pywikibot.Link(title, site))
+def PagesFromPageidGenerator(pageids, site=None):
+ """
+ Return a page generator from pageids.
+
+ Pages are iterated in the same order as the underlying pageids.
+ Duplicate pageids are filtered, and only one page is returned for
+ each unique pageid.
+
+ @param pageids: an iterable that returns pageids, or a comma-separated
+ string of pageids (e.g. '945097,1483753,956608')
+ @param site: Site for generator results.
+ @type site: L{pywikibot.site.BaseSite}
+ """
+ if site is None:
+ site = pywikibot.Site()
+
+ return site.load_pages_from_pageids(pageids)
+
+
@deprecated_args(number='total', step=None)
def UserContributionsGenerator(username, namespaces=None, site=None,
total=None, _filter_unique=filter_unique):
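
As a usage note, the new -pageid option behaves like the other generator
options; a sketch, assuming the standard pwb.py runner and the listpages
script (the ids are placeholders):

    python pwb.py listpages -pageid:945097,1483753,956608
    python pwb.py listpages -pageid:'945097|1483753|956608'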
diff --git a/pywikibot/site.py b/pywikibot/site.py
index 8789299..9250a5b 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -19,6 +19,7 @@
import datetime
import functools
import hashlib
+import heapq
import itertools
import json
import mimetypes
@@ -70,6 +71,7 @@
manage_wrapping, MediaWikiVersion, first_upper, normalize_username,
merge_unique_dicts,
PY2,
+ filter_unique,
)
from pywikibot.tools.ip import is_IP
@@ -3093,6 +3095,63 @@
return page._redirtarget
+ def load_pages_from_pageids(self, pageids):
+ """
+ Return a page generator from pageids.
+
+ Pages are iterated in the same order as the underlying pageids.
+
+ Duplicate pageids are filtered, and only one page is returned for
+ each unique pageid.
+
+ @param pageids: an iterable that returns pageids (str or int),
+ or a comma- or pipe-separated string of pageids
+ (e.g. '945097,1483753, 956608' or '945097|1483753|956608')
+ """
+ if isinstance(pageids, basestring):
+ pageids = pageids.replace('|', ',')
+ pageids = pageids.split(',')
+ pageids = [p.strip() for p in pageids]
+
+ # Validate pageids.
+ gen = (str(int(p)) for p in pageids if int(p) > 0)
+
+ # Find out how many pages can be specified at a time.
+ parameter = self._paraminfo.parameter('query+info', 'prop')
+ if self.logged_in() and self.has_right('apihighlimits'):
+ groupsize = int(parameter['highlimit'])
+ else:
+ groupsize = int(parameter['limit'])
+
+ for sublist in itergroup(filter_unique(gen), groupsize):
+ # Store the order of the input data.
+ priority_dict = dict(zip(sublist, range(len(sublist))))
+
+ prio_queue = []
+ next_prio = 0
+ params = {'pageids': sublist, }
+ rvgen = api.PropertyGenerator('info', site=self, parameters=params)
+
+ for pagedata in rvgen:
+ title = pagedata['title']
+ pageid = str(pagedata['pageid'])
+ page = pywikibot.Page(pywikibot.Link(title, source=self))
+ api.update_page(page, pagedata)
+ priority, page = heapq.heappushpop(prio_queue,
+ (priority_dict[pageid], page))
+ # Smallest priority matches expected one; yield early.
+ if priority == next_prio:
+ yield page
+ next_prio += 1
+ else:
+ # Push onto the heap.
+ heapq.heappush(prio_queue, (priority, page))
+
+ # Extract data in the same order as the input data.
+ while prio_queue:
+ priority, page = heapq.heappop(prio_queue)
+ yield page
+
def preloadpages(self, pagelist, groupsize=50, templates=False,
langlinks=False, pageprops=False):
"""Return a generator to a list of preloaded pages.
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index ff3ef7e..dc92beb 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -191,6 +191,34 @@
self.assertEqual(len(tuple(gen)), 9)
+class TestPagesFromPageidGenerator(TestCase):
+
+ """Test PagesFromPageidGenerator method."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ base_title = 'Page:06-24-1920 -The Story of the Jones County Calf Case.pdf/%s'
+
+ def setUp(self):
+ """Setup tests."""
+ super(TestPagesFromPageidGenerator, self).setUp()
+ self.site = self.get_site()
+ self.titles = [self.base_title % i for i in range(1, 11)]
+
+ def test_PagesFromPageidGenerator(self):
+ """Test PagesFromPageidGenerator."""
+ gen_pages = pagegenerators.PagesFromTitlesGenerator(self.titles,
+ self.site)
+ pageids = []
+ for page in gen_pages:
+ page.latest_revision_id # Force page info loading.
+ pageids.append(page._pageid)
+
+ gen = pagegenerators.PagesFromPageidGenerator(pageids, self.site)
+ self.assertPagelistTitles(gen, self.titles)
+
+
class TestCategoryFilterPageGenerator(TestCase):
"""Test CategoryFilterPageGenerator method."""
@@ -848,6 +876,30 @@
self.assertIsNotNone(gen)
self.assertPagesInNamespaces(gen, set([1, 3]))
+ def test_pageid(self):
+ """Test pageid parameter."""
+ # Get reference pages and their pageids.
+ gf = pagegenerators.GeneratorFactory(site=self.get_site())
+ self.assertTrue(gf.handleArg('-prefixindex:a'))
+ gf.handleArg('-limit:10')
+ gen = gf.getCombinedGenerator()
+ pages = list(gen)
+ self.assertEqual(len(pages), 10)
+ # pipe-separated used as test reference.
+ pageids = '|'.join(str(page._pageid) for page in pages)
+
+ # Get by pageids.
+ gf = pagegenerators.GeneratorFactory(site=self.get_site())
+ gf.handleArg('-pageid:%s' % pageids)
+ gen = gf.getCombinedGenerator()
+ self.assertIsNotNone(gen)
+ pages_from_pageid = list(gen)
+ self.assertEqual(len(pages_from_pageid), 10)
+ for page_a, page_b in zip(pages, pages_from_pageid):
+ self.assertIsInstance(page_a, pywikibot.Page)
+ self.assertIsInstance(page_b, pywikibot.Page)
+ self.assertEqual(page_a, page_b)
+
def test_pagegenerator(self):
"""Test page generator."""
gf = pagegenerators.GeneratorFactory(site=self.site)
diff --git a/tests/site_tests.py b/tests/site_tests.py
index a3a2525..5c247dd 100644
--- a/tests/site_tests.py
+++ b/tests/site_tests.py
@@ -2459,6 +2459,93 @@
self.assertTrue(site.is_uploaddisabled())
+class TestLoadPagesFromPageids(DefaultSiteTestCase):
+
+ """Test site.load_pages_from_pageids()."""
+
+ cached = True
+
+ def setUp(self):
+ """Setup tests."""
+ super(TestLoadPagesFromPageids, self).setUp()
+ self.site = self.get_site()
+ mainpage = pywikibot.Page(pywikibot.Link('Main Page', self.site))
+ self.links = list(self.site.pagelinks(mainpage, total=10))
+
+ def test_load_from_pageids_iterable_of_str(self):
+ """Test basic loading with pageids."""
+ pageids = [str(page._pageid) for page in self.links]
+ gen = self.site.load_pages_from_pageids(pageids)
+ for count, page in enumerate(gen, start=1):
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ if page.exists():
+ self.assertTrue(hasattr(page, '_pageid'))
+ self.assertIn(page, self.links)
+ self.assertEqual(count, len(self.links))
+
+ def test_load_from_pageids_iterable_of_int(self):
+ """Test basic loading with pageids."""
+ pageids = [page._pageid for page in self.links]
+ gen = self.site.load_pages_from_pageids(pageids)
+ for count, page in enumerate(gen, start=1):
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ if page.exists():
+ self.assertTrue(hasattr(page, '_pageid'))
+ self.assertIn(page, self.links)
+ self.assertEqual(count, len(self.links))
+
+ def test_load_from_pageids_iterable_in_order(self):
+ """Test loading with pageids is ordered."""
+ pageids = [page._pageid for page in self.links]
+ gen = self.site.load_pages_from_pageids(pageids)
+ for page in gen:
+ link = self.links.pop(0)
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ if page.exists():
+ self.assertTrue(hasattr(page, '_pageid'))
+ self.assertEqual(page, link)
+
+ def test_load_from_pageids_iterable_with_duplicate(self):
+ """Test loading with duplicate pageids."""
+ pageids = [page._pageid for page in self.links]
+ pageids = pageids + pageids
+ gen = self.site.load_pages_from_pageids(pageids)
+ for count, page in enumerate(gen, start=1):
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ if page.exists():
+ self.assertTrue(hasattr(page, '_pageid'))
+ self.assertIn(page, self.links)
+ self.assertEqual(count, len(self.links))
+
+ def test_load_from_pageids_comma_separated(self):
+ """Test loading from comma-separated pageids."""
+ pageids = ', '.join(str(page._pageid) for page in self.links)
+ gen = self.site.load_pages_from_pageids(pageids)
+ for count, page in enumerate(gen, start=1):
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ if page.exists():
+ self.assertTrue(hasattr(page, '_pageid'))
+ self.assertIn(page, self.links)
+ self.assertEqual(count, len(self.links))
+
+ def test_load_from_pageids_pipe_separated(self):
+ """Test loading from comma-separated pageids."""
+ pageids = '|'.join(str(page._pageid) for page in self.links)
+ gen = self.site.load_pages_from_pageids(pageids)
+ for count, page in enumerate(gen, start=1):
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertIsInstance(page.exists(), bool)
+ if page.exists():
+ self.assertTrue(hasattr(page, '_pageid'))
+ self.assertIn(page, self.links)
+ self.assertEqual(count, len(self.links))
+
+
class TestPagePreloading(DefaultSiteTestCase):
"""Test site.preloadpages()."""
--
To view, visit https://gerrit.wikimedia.org/r/280080
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Iab190339c8fa3ead62168a08c2c243a3bb1e54d5
Gerrit-PatchSet: 17
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>