jenkins-bot has submitted this change and it was merged.
Change subject: [FEAT] Category: Get newest pages ......................................................................
[FEAT] Category: Get newest pages
This returns the pages which have been added to a category, ordered by creation date from newest to oldest. To do that, it needs to cache all pages and check on each iteration whether any previously cached page was created after the current page was added to the category. If that is the case, those cached pages are newer than the current page and newer than any other page that will be checked as the generator progresses: the members are iterated in order of their addition to the category (most recently added first), so every page still to come was added to the category, and therefore created, before the current page was added.
Change-Id: I9bb3f74bbe2e3319ed2dbcdefab414a3204c2c32 --- M pywikibot/page.py M tests/category_tests.py 2 files changed, 90 insertions(+), 0 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py index c63af01..61e1d1b 100644 --- a/pywikibot/page.py +++ b/pywikibot/page.py @@ -2439,6 +2439,74 @@ """ return self.site.categoryinfo(self)
+ def newest_pages(self, total=None): + """ + Return pages in a category ordered by the creation date. + + If two or more pages are created at the same time, the pages are + returned in the order they were added to the category. The most recently + added page is returned first. + + It only allows to return the pages ordered from newest to oldest, as it + is impossible to determine the oldest page in a category without + checking all pages. But it is possible to check the category in order + with the newly added first and it yields all pages which were created + after the currently checked page was added (and thus there is no page + created after any of the cached but added before the currently checked). + + @param total: The total number of pages queried. + @type total: int + @return: A page generator of all pages in a category ordered by the + creation date. From newest to oldest. Note: It currently only + returns Page instances and not a subclass of it if possible. This + might change so don't expect to only get Page instances. + @rtype: generator + """ + def check_cache(latest): + """Return the cached pages in order and not more than total.""" + cached = [] + for timestamp in sorted((ts for ts in cache if ts > latest), + reverse=True): + # The complete list can be removed, it'll either yield all of + # them, or only a portion but will skip the rest anyway + cached += cache.pop(timestamp)[:None if total is None else + total - len(cached)] + if total and len(cached) >= total: + break # already got enough + assert(total is None or len(cached) <= total) + return cached + + # all pages which have been checked but where created before the + # current page was added, at some point they will be created after + # the current page was added. It saves all pages via the creation + # timestamp. Be prepared for multiple pages. 
+ cache = defaultdict(list) + # TODO: Make site.categorymembers is usable as it returns pages + # There is no total defined, as it's not known how many pages need to be + # checked before the total amount of new pages was found. In worst case + # all pages of a category need to be checked. + for member in pywikibot.data.api.QueryGenerator( + site=self.site, list='categorymembers', cmsort='timestamp', + cmdir='older', cmprop='timestamp|title', + cmtitle=self.title()): + # TODO: Upcast to suitable class + page = pywikibot.Page(self.site, member['title']) + assert(page.namespace() == member['ns']) + cached = check_cache(pywikibot.Timestamp.fromISOformat( + member['timestamp'])) + for cached_page in cached: + yield cached_page + if total is not None: + total -= len(cached) + if total <= 0: + break + cache[page.oldest_revision.timestamp] += [page] + else: + # clear cache + assert(total is None or total > 0) + for cached_page in check_cache(pywikibot.Timestamp.min): + yield cached_page + # ### DEPRECATED METHODS #### @deprecated("list(Category.subcategories(...))") def subcategoriesList(self, recurse=False): diff --git a/tests/category_tests.py b/tests/category_tests.py index 8923884..d9d543c 100644 --- a/tests/category_tests.py +++ b/tests/category_tests.py @@ -209,6 +209,28 @@ self.assertEqual(cat.aslink(sortKey='Foo'), '[[Category:Wikipedia categories|Foo]]')
+class CategoryNewestPages(TestCase): + + """Test newest_pages feature on French Wikinews.""" + + family = 'wikinews' + code = 'fr' + + cached = True + + def test_newest_pages(self): + """Test that the pages are getting older.""" + cat = pywikibot.Category(self.get_site(), u'Catégorie:Yukon Quest 2015') + last = pywikibot.Timestamp.max + count = 0 + for page in cat.newest_pages(): + creation_stamp = page.oldest_revision.timestamp + self.assertLessEqual(creation_stamp, last) + last = creation_stamp + count += 1 + self.assertEqual(count, cat.categoryinfo['size']) + + if __name__ == '__main__': try: unittest.main()
pywikibot-commits@lists.wikimedia.org