Revision: 7136
Author:   alexsh
Date:     2009-08-09 05:18:21 +0000 (Sun, 09 Aug 2009)
Log Message:
-----------
catlib.py
* Category._parseCategory(): switch to the API query (list=categorymembers); fall back to the old screen-scraping code (moved to _oldParseCategory) when the MediaWiki version does not support the API or NotImplementedError is raised (tested for 12 hours on [[ja:WP:BOTREQ]])
Modified Paths:
--------------
    trunk/pywikipedia/catlib.py
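[Editorial note, not part of the original commit mail: the new code path boils down to a paginated list=categorymembers request against api.php. As a rough sketch only, using a placeholder wiki URL and category name and plain urllib/json instead of pywikipedia's query.GetData wrapper, the request/continuation loop looks roughly like this in Python 2:]

    # Hedged illustration of the list=categorymembers request that the new
    # _parseCategory() issues; the URL and category below are placeholders,
    # and the commit itself goes through query.GetData() rather than urllib.
    import urllib
    import json  # the 2009-era code would have used simplejson on older Pythons

    API_URL = 'http://en.wikipedia.org/w/api.php'  # placeholder wiki

    def iter_category_members(category_title):
        """Yield (namespace, title) tuples, following cmcontinue pagination."""
        cmcontinue = None
        while True:
            params = {
                'action': 'query',
                'list': 'categorymembers',
                'cmtitle': category_title,
                'cmprop': 'title',
                'cmlimit': '500',
                'format': 'json',
            }
            if cmcontinue:
                params['cmcontinue'] = cmcontinue
            reply = json.load(urllib.urlopen(API_URL, urllib.urlencode(params)))
            for member in reply['query']['categorymembers']:
                yield member['ns'], member['title']
            # 2009-era MediaWiki returns the continuation token under
            # 'query-continue', which is exactly what the new catlib code checks.
            cont = reply.get('query-continue', {}).get('categorymembers', {})
            cmcontinue = cont.get('cmcontinue')
            if not cmcontinue:
                break

    # Example (placeholder category):
    #   for ns, title in iter_category_members('Category:Living people'):
    #       print title

[Carrying cmcontinue from one response into the next request is what lets the generator stream arbitrarily large categories in batches instead of scraping the rendered category pages.]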
Modified: trunk/pywikipedia/catlib.py
===================================================================
--- trunk/pywikipedia/catlib.py	2009-08-08 15:28:21 UTC (rev 7135)
+++ trunk/pywikipedia/catlib.py	2009-08-09 05:18:21 UTC (rev 7136)
@@ -166,6 +166,69 @@
     def _parseCategory(self, purge=False, startFrom=None):
         """
+        Yields all articles and subcategories that are in this category, via the API.
+
+        Set startFrom to a string which is the title of the page to start from.
+
+        Yielded results are tuples in the form (tag, page) where tag is one
+        of the constants ARTICLE and SUBCATEGORY, and page is the Page or
+        Category object.
+
+        Note that results of this method need not be unique.
+
+        This should not be used outside of this module.
+        """
+        try:
+            if self.site().versionnumber() >= 11:
+                api_url = self.site().api_address() # probe for api.php support
+                del api_url
+            else:
+                raise NotImplementedError # MediaWiki version does not support the API
+        except NotImplementedError:
+            # fall back to screen-scraping the category page
+            for tag, page in self._oldParseCategory(purge, startFrom):
+                yield tag, page
+            return
+
+        currentPageOffset = None
+        while True:
+            params = {
+                'action': 'query',
+                'list': 'categorymembers',
+                'cmtitle': self.title(),
+                'cmprop': 'title',#|ids|sortkey|timestamp',
+                #'cmlimit': config.special_page_limit,
+                #'': '',
+            }
+            if currentPageOffset:
+                params['cmcontinue'] = currentPageOffset
+                wikipedia.output('Getting [[%s]] list from %s by API...'
+                                 % (self.title(), currentPageOffset[:-1])) # the last character of cmcontinue is '|'
+            elif startFrom:
+                params['cmstart'] = startFrom
+                wikipedia.output('Getting [[%s]] list starting at %s by API...'
+                                 % (self.title(), startFrom))
+            else:
+                wikipedia.output('Getting [[%s]] by API...' % self.title())
+
+            wikipedia.get_throttle()
+            data = query.GetData(params, self.site())
+
+            for memb in data['query']['categorymembers']:
+                # For MediaWiki versions where subcats look like articles
+                if isCatTitle(memb['title'], self.site()):
+                    yield SUBCATEGORY, Category(self.site(), memb['title'])
+                elif memb['ns'] == 6 and self.site().image_namespace() in memb['title']:
+                    yield ARTICLE, wikipedia.ImagePage(self.site(), memb['title'])
+                else:
+                    yield ARTICLE, wikipedia.Page(self.site(), memb['title'])
+            # try to find a link to the next list page
+            if data.has_key('query-continue'):
+                currentPageOffset = data['query-continue']['categorymembers']['cmcontinue']
+            else:
+                break
+
+    def _oldParseCategory(self, purge=False, startFrom=None):
+        """
         Yields all articles and subcategories that are in this category.
 
         Set purge to True to instruct MediaWiki not to serve a cached version.
@@ -247,16 +310,14 @@
                         pass
                     # For MediaWiki versions where subcats look like articles
                     elif isCatTitle(title, self.site()):
-                        ncat = Category(self.site(), title)
-                        yield SUBCATEGORY, ncat
+                        yield SUBCATEGORY, Category(self.site(), title)
                     else:
                         yield ARTICLE, wikipedia.Page(self.site(), title)
             if Rsubcat:
                 # For MediaWiki versions where subcats look differently
                 for titleWithoutNamespace in Rsubcat.findall(txt):
                     title = 'Category:%s' % titleWithoutNamespace
-                    ncat = Category(self.site(), title)
-                    yield SUBCATEGORY, ncat
+                    yield SUBCATEGORY, Category(self.site(), title)
             if Rimage:
                 # For MediaWiki versions where images work through galleries
                 for title in Rimage.findall(txt):
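[Editorial note, not part of the original commit mail: for orientation only, a hedged sketch of how the tuples yielded by the new generator are consumed. The site and category are placeholders, and in practice callers go through Category's public wrappers rather than _parseCategory() directly, since the docstring marks it module-internal:]

    # Hedged usage sketch, not part of the commit.  The family/language code
    # and category name are placeholders; ARTICLE and SUBCATEGORY are the
    # module-level tag constants that _parseCategory() yields.
    import wikipedia
    import catlib

    site = wikipedia.getSite('en', 'wikipedia')       # placeholder site
    cat = catlib.Category(site, 'Category:Example')   # placeholder category

    subcats, pages = [], []
    for tag, page in cat._parseCategory():
        if tag == catlib.SUBCATEGORY:
            subcats.append(page)
        else:  # catlib.ARTICLE covers regular pages and ImagePage objects
            pages.append(page)

    wikipedia.output(u'%d subcategories, %d articles/images found'
                     % (len(subcats), len(pages)))

[On wikis whose MediaWiki version predates 1.11, or where api.php is unavailable, the same loop transparently runs through _oldParseCategory(), so callers see no difference apart from speed.]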