[Pywikipedia-svn] SVN: [9086] trunk/pywikipedia/catlib.py

17 Mar 2011

http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9086
Revision: 9086
Author:   saper
Date:     2011-03-17 14:20:21 +0000 (Thu, 17 Mar 2011)
Log Message:
-----------
Introduce sorting of category members by timestamp as well as asc/desc direction
* Add sortby parameter values "timestamp" or "sortkey" with sortkey being the default
* Add sortdir parameter with values "asc" or "desc"
* Implement paging for timestamps across multiple API invocations
Modified Paths:
--------------
    trunk/pywikipedia/catlib.py
Modified: trunk/pywikipedia/catlib.py
===================================================================

--- trunk/pywikipedia/catlib.py	2011-03-16 14:23:31 UTC (rev 9085)
+++ trunk/pywikipedia/catlib.py	2011-03-17 14:20:21 UTC (rev 9086)
@@ -97,7 +97,8 @@
         else:
             return '[[%s]]' % titleWithSortKey
-    def _getAndCacheContents(self, recurse=False, purge=False, startFrom=None, cache=None):
+    def _getAndCacheContents(self, recurse=False, purge=False, startFrom=None, cache=None, 
+                                   sortby=None, sortdir=None):
         """
         Cache results of _parseCategory for a second call.
@@ -133,10 +134,11 @@
                         # contents of subcategory are cached by calling
                         # this method recursively; therefore, do not cache
                         # them again
-                        for item in subcat._getAndCacheContents(newrecurse, purge, cache=cache):
+                        for item in subcat._getAndCacheContents(newrecurse, purge, cache=cache,
+                                            sortby=sortby, sortdir=sortdir):
                             yield item
         else:
-            for tag, page in self._parseCategory(purge, startFrom):
+            for tag, page in self._parseCategory(purge, startFrom, sortby, sortdir):
                 if tag == ARTICLE:
                     self.articleCache.append(page)
                     if not page in cache:
@@ -151,23 +153,26 @@
                             # contents of subcategory are cached by calling
                             # this method recursively; therefore, do not cache
                             # them again
-                            for item in page._getAndCacheContents(newrecurse, purge, cache=cache):
+                            for item in page._getAndCacheContents(newrecurse, purge, cache=cache,
+                                             sortby=sortby, sortdir=sortdir):
                                 yield item
             if not startFrom:
                 self.completelyCached = True
-    def _getContentsNaive(self, recurse=False, startFrom=None):
+    def _getContentsNaive(self, recurse=False, startFrom=None, sortby=None, sortdir=None):
         """
         Simple category content yielder. Naive, do not attempts to
         cache anything
         """
-        for tag, page in self._parseCategory(startFrom=startFrom):
+        for tag, page in self._parseCategory(startFrom=startFrom, 
+                                             sortby=sortby, sortdir=sortdir):
             yield tag, page
             if tag == SUBCATEGORY and recurse:
-                for item in page._getContentsNaive(recurse=True):
+                for item in page._getContentsNaive(recurse=True, 
+                                                   sortby=sortby, sortdir=sortdir):
                     yield item
-    def _parseCategory(self, purge=False, startFrom=None):
+    def _parseCategory(self, purge=False, startFrom=None, sortby=None, sortdir=None):
         """
         Yields all articles and subcategories that are in this category by API.
@@ -194,6 +199,10 @@
             'cmprop': ['title', 'ids', 'sortkey', 'timestamp'],
             #'': '',
         }
+        if sortby:
+            params['cmsort'] = sortby
+        if sortdir:
+            params['cmdir'] = sortdir
         while True:
             if wikipedia.config.special_page_limit > 500:
                 params['cmlimit'] = 500
@@ -201,9 +210,9 @@
                 params['cmlimit'] = wikipedia.config.special_page_limit
             if currentPageOffset:
-                params['cmcontinue'] = currentPageOffset
+                params.update(currentPageOffset)
                 wikipedia.output('Getting [[%s]] list from %s...'
-                                 % (self.title(), currentPageOffset[:-1])) # cmcontinue last key is '|'
+                                 % (self.title(), "%s=%s" % currentPageOffset.popitem()))
             elif startFrom:
                 params['cmstartsortkey'] = startFrom
                 wikipedia.output('Getting [[%s]] list starting at %s...'
@@ -230,7 +239,7 @@
                     break
             # try to find a link to the next list page
             if 'query-continue' in data and count < params['cmlimit']:
-                currentPageOffset = data['query-continue']['categorymembers']['cmcontinue']
+                currentPageOffset = data['query-continue']['categorymembers']
             else:
                 break
@@ -339,7 +348,8 @@
             else:
                 break
-    def subcategories(self, recurse=False, startFrom=None, cacheResults=False):
+    def subcategories(self, recurse=False, startFrom=None, cacheResults=False,
+                            sortby=None, sortdir=None):
         """
         Yields all subcategories of the current category.
@@ -360,11 +370,12 @@
             gen = self._getAndCacheContents
         else:
             gen = self._getContentsNaive
-        for tag, subcat in gen(recurse=recurse, startFrom=startFrom):
+        for tag, subcat in gen(recurse=recurse, startFrom=startFrom, sortby=sortby,
+                               sortdir=sortdir):
             if tag == SUBCATEGORY:
                 yield subcat
-    def subcategoriesList(self, recurse=False):
+    def subcategoriesList(self, recurse=False, sortby=None, sortdir=None):
         """
         Creates a list of all subcategories of the current category.
@@ -374,11 +385,12 @@
         The elements of the returned list are sorted and unique.
         """
         subcats = []
-        for cat in self.subcategories(recurse):
+        for cat in self.subcategories(recurse, sortby=sortby, sortdir=sortdir):
             subcats.append(cat)
         return unique(subcats)
-    def articles(self, recurse=False, startFrom=None, cacheResults=False):
+    def articles(self, recurse=False, startFrom=None, cacheResults=False,
+                       sortby=None, sortdir=None):
         """
         Yields all articles of the current category.
@@ -398,11 +410,12 @@
             gen = self._getAndCacheContents
         else:
             gen = self._getContentsNaive
-        for tag, page in gen(recurse=recurse, startFrom=startFrom):
+        for tag, page in gen(recurse=recurse, startFrom=startFrom,
+                             sortby=sortby, sortdir=sortdir):
             if tag == ARTICLE:
                 yield page
-    def articlesList(self, recurse=False):
+    def articlesList(self, recurse=False, sortby=None, sortdir=None):
         """
         Creates a list of all articles of the current category.
@@ -413,7 +426,7 @@
         The elements of the returned list are sorted and unique.
         """
         articles = []
-        for article in self.articles(recurse):
+        for article in self.articles(recurse, sortby=sortby, sortdir=sortdir):
             articles.append(article)
         return unique(articles)

    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

[Pywikipedia-svn] SVN: [9086] trunk/pywikipedia/catlib.py