[Pywikipedia-l] SVN: [6038] trunk/pywikipedia/catlib.py

nicdumz at svn.wikimedia.org
Tue Oct 28 10:23:45 UTC 2008


Revision: 6038
Author:   nicdumz
Date:     2008-10-28 10:23:45 +0000 (Tue, 28 Oct 2008)

Log Message:
-----------
Fix for [2193942] reading category: memory leak and slowdown:

Let's not assume users want to crawl a category several times, and make the default behavior NON-CACHING. Why would a user iterate several times over a category's contents anyway?
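A minimal usage sketch of the new parameter, for illustration only (it assumes the usual compat-era wikipedia/catlib setup; 'Category:Example' is a placeholder name):

    import wikipedia, catlib

    site = wikipedia.getSite()
    cat = catlib.Category(site, 'Category:Example')

    # Default behavior after this commit: no caching. Each call
    # re-fetches the category members from the wiki.
    for article in cat.articles(recurse=True):
        wikipedia.output(article.title())

    # Opt in to the simple cache only when several passes are needed;
    # the second loop below is then served from the in-memory cache.
    for article in cat.articles(cacheResults=True):
        pass
    for article in cat.articles(cacheResults=True):
        pass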

Modified Paths:
--------------
    trunk/pywikipedia/catlib.py

Modified: trunk/pywikipedia/catlib.py
===================================================================
--- trunk/pywikipedia/catlib.py	2008-10-27 20:51:31 UTC (rev 6037)
+++ trunk/pywikipedia/catlib.py	2008-10-28 10:23:45 UTC (rev 6038)
@@ -93,7 +93,7 @@
         else:
             return '[[%s]]' % titleWithSortKey
 
-    def _getContents(self, recurse=False, purge=False, startFrom=None, cache=None):
+    def _getAndCacheContents(self, recurse=False, purge=False, startFrom=None, cache=None):
         """
         Cache results of _parseCategory for a second call.
 
@@ -129,7 +129,7 @@
                         # contents of subcategory are cached by calling
                         # this method recursively; therefore, do not cache
                         # them again
-                        for item in subcat._getContents(newrecurse, purge, cache=cache):
+                        for item in subcat._getAndCacheContents(newrecurse, purge, cache=cache):
                             yield item
         else:
             for tag, page in self._parseCategory(purge, startFrom):
@@ -147,11 +147,22 @@
                             # contents of subcategory are cached by calling
                             # this method recursively; therefore, do not cache
                             # them again
-                            for item in page._getContents(newrecurse, purge, cache=cache):
+                            for item in page._getAndCacheContents(newrecurse, purge, cache=cache):
                                 yield item
             if not startFrom:
                 self.completelyCached = True
 
+    def _getContentsNaive(self, recurse=False, startFrom=None):
+        """
+        Simple category content yielder. Naive: does not attempt
+        to cache anything.
+        """
+        for tag, page in self._parseCategory(startFrom=startFrom):
+            yield tag, page
+            if tag == SUBCATEGORY and recurse:
+                for item in page._getContentsNaive(recurse=True):
+                    yield item
+
     def _parseCategory(self, purge=False, startFrom=None):
         """
         Yields all articles and subcategories that are in this category.
@@ -259,7 +270,7 @@
             else:
                 break
 
-    def subcategories(self, recurse=False, startFrom=None):
+    def subcategories(self, recurse=False, startFrom=None, cacheResults=False):
         """
         Yields all subcategories of the current category.
 
@@ -269,9 +280,18 @@
         equivalent to recurse = False, recurse = 1 gives first-level
         subcategories of subcategories but no deeper, etcetera).
 
+        cacheResults - cache the category contents: useful if you need to
+        do several passes over the category members list. The simple cache
+        system is *not* meant to be memory- or CPU-efficient for large
+        categories.
+
         Results are sorted (as sorted by MediaWiki), but need not be unique.
         """
-        for tag, subcat in self._getContents(recurse, startFrom=startFrom):
+        if cacheResults:
+            gen = self._getAndCacheContents
+        else:
+            gen = self._getContentsNaive
+        for tag, subcat in gen(recurse=recurse, startFrom=startFrom):
             if tag == SUBCATEGORY:
                 yield subcat
 
@@ -289,7 +309,7 @@
             subcats.append(cat)
         return unique(subcats)
 
-    def articles(self, recurse=False, startFrom=None):
+    def articles(self, recurse=False, startFrom=None, cacheResults=False):
         """
         Yields all articles of the current category.
 
@@ -297,10 +317,19 @@
         Recurse can be a number to restrict the depth at which subcategories
         are included.
 
+        cacheResults - cache the category contents: useful if you need to
+        do several passes over the category members list. The simple cache
+        system is *not* meant to be memory- or CPU-efficient for large
+        categories.
+
         Results are unsorted (except as sorted by MediaWiki), and need not
         be unique.
         """
-        for tag, page in self._getContents(recurse, startFrom=startFrom):
+        if cacheResults:
+            gen = self._getAndCacheContents
+        else:
+            gen = self._getContentsNaive
+        for tag, page in gen(recurse=recurse, startFrom=startFrom):
             if tag == ARTICLE:
                 yield page
 
@@ -342,7 +371,7 @@
 
     def isEmpty(self):
         # TODO: rename; naming conflict with Page.isEmpty
-        for tag, title in self._getContents(purge = True):
+        for tag, title in self._parseCategory():
             return False
         return True
 
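The trade-off behind the two code paths, as a self-contained sketch (not taken from the patch itself):

    # Naive path: nothing is retained between passes. Memory use stays
    # flat, but every pass re-fetches from the wiki.
    def naive(fetch):
        for item in fetch():
            yield item

    # Caching path: the first pass stores every item on the object, so
    # later passes are cheap -- at the cost of holding the whole member
    # list in memory for the object's lifetime, which is wasted work and
    # wasted memory for callers that only ever iterate once.
    class Cached(object):
        def __init__(self, fetch):
            self.fetch = fetch
            self.cache = None

        def items(self):
            if self.cache is None:
                self.cache = list(self.fetch())
            for item in self.cache:
                yield item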
