Revision: 6038 Author: nicdumz Date: 2008-10-28 10:23:45 +0000 (Tue, 28 Oct 2008)
Log Message: ----------- Fix for [2193942] reading category: memory leak and slowdown:
Let's not assume users want to crawl a category several times; make the default behavior NON-CACHING. Why would a user iterate several times over a category's contents anyway?
Modified Paths: -------------- trunk/pywikipedia/catlib.py
Modified: trunk/pywikipedia/catlib.py =================================================================== --- trunk/pywikipedia/catlib.py 2008-10-27 20:51:31 UTC (rev 6037) +++ trunk/pywikipedia/catlib.py 2008-10-28 10:23:45 UTC (rev 6038) @@ -93,7 +93,7 @@ else: return '[[%s]]' % titleWithSortKey
- def _getContents(self, recurse=False, purge=False, startFrom=None, cache=None): + def _getAndCacheContents(self, recurse=False, purge=False, startFrom=None, cache=None): """ Cache results of _parseCategory for a second call.
@@ -129,7 +129,7 @@ # contents of subcategory are cached by calling # this method recursively; therefore, do not cache # them again - for item in subcat._getContents(newrecurse, purge, cache=cache): + for item in subcat._getAndCacheContents(newrecurse, purge, cache=cache): yield item else: for tag, page in self._parseCategory(purge, startFrom): @@ -147,11 +147,22 @@ # contents of subcategory are cached by calling # this method recursively; therefore, do not cache # them again - for item in page._getContents(newrecurse, purge, cache=cache): + for item in page._getAndCacheContents(newrecurse, purge, cache=cache): yield item if not startFrom: self.completelyCached = True
+ def _getContentsNaive(self, recurse=False, startFrom=None): + """ + Simple category content yielder. Naive, do not attempts to + cache anything + """ + for tag, page in self._parseCategory(startFrom=startFrom): + yield tag, page + if tag == SUBCATEGORY and recurse: + for item in page._getContentsNaive(recurse=True): + yield item + def _parseCategory(self, purge=False, startFrom=None): """ Yields all articles and subcategories that are in this category. @@ -259,7 +270,7 @@ else: break
- def subcategories(self, recurse=False, startFrom=None): + def subcategories(self, recurse=False, startFrom=None, cacheResults=False): """ Yields all subcategories of the current category.
@@ -269,9 +280,18 @@ equivalent to recurse = False, recurse = 1 gives first-level subcategories of subcategories but no deeper, etcetera).
+ cacheResults - cache the category contents: useful if you need to + do several passes on the category members list. The simple cache + system is *not* meant to be memory or cpu efficient for large + categories + Results a sorted (as sorted by MediaWiki), but need not be unique. """ - for tag, subcat in self._getContents(recurse, startFrom=startFrom): + if cacheResults: + gen = self._getAndCacheContents + else: + gen = self._getContentsNaive + for tag, subcat in gen(recurse=recurse, startFrom=startFrom): if tag == SUBCATEGORY: yield subcat
@@ -289,7 +309,7 @@ subcats.append(cat) return unique(subcats)
- def articles(self, recurse=False, startFrom=None): + def articles(self, recurse=False, startFrom=None, cacheResults=False): """ Yields all articles of the current category.
@@ -297,10 +317,19 @@ Recurse can be a number to restrict the depth at which subcategories are included.
+ cacheResults - cache the category contents: useful if you need to + do several passes on the category members list. The simple cache + system is *not* meant to be memory or cpu efficient for large + categories + Results are unsorted (except as sorted by MediaWiki), and need not be unique. """ - for tag, page in self._getContents(recurse, startFrom=startFrom): + if cacheResults: + gen = self._getAndCacheContents + else: + gen = self._getContentsNaive + for tag, page in gen(recurse=recurse, startFrom=startFrom): if tag == ARTICLE: yield page
@@ -342,7 +371,7 @@
def isEmpty(self): # TODO: rename; naming conflict with Page.isEmpty - for tag, title in self._getContents(purge = True): + for tag, title in self._parseCategory(): return False return True