Revision: 6756
Author:   nicdumz
Date:     2009-04-30 01:47:36 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Adding an experimental contents_on_disk feature: save the Page contents
on disk, in a Python shelve, and load them only when needed, instead of
keeping the contents in RAM.
Activating this option may slow the whole interwiki process down a bit:
fetching an entry from disk is slower than simply reading the attribute
from RAM. It should, however, greatly reduce memory consumption.
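For context, a shelve is the standard library's persistent, dictionary-like
store backed by a file on disk; values are pickled on assignment and
unpickled on access. A minimal sketch of the mechanism this feature relies
on (the path and key below are made up for illustration):

    import shelve

    store = shelve.open('/tmp/pagestore-demo')  # hypothetical path
    store['[[en:Example]]'] = u'wiki text'      # pickled to disk, not kept in RAM
    text = store['[[en:Example]]']              # unpickled from disk on demand
    del store['[[en:Example]]']
    store.close()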
Modified Paths:
--------------
    trunk/pywikipedia/config.py
    trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2009-04-30 00:26:14 UTC (rev 6755)
+++ trunk/pywikipedia/config.py 2009-04-30 01:47:36 UTC (rev 6756)
@@ -234,6 +234,11 @@
 # Save file with local articles without interwikis.
 without_interwiki = False
+# Experimental feature:
+# Store the page contents on disk (/cache/ directory) instead of loading
+# them in RAM.
+interwiki_contents_on_disk = False
+
 ############## SOLVE_DISAMBIGUATION SETTINGS ############
 #
 # Set disambiguation_comment[FAMILY][LANG] to a non-empty string to override
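Assuming the usual pywikipedia configuration mechanism, the new option can
be switched on by overriding it in user-config.py. Only the
interwiki_contents_on_disk line comes from this commit; the other settings
are just a typical configuration sketch:

    # user-config.py
    family = 'wikipedia'
    mylang = 'en'
    # experimental: keep interwiki Page contents in a shelve under cache/
    interwiki_contents_on_disk = True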
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-30 00:26:14 UTC (rev 6755)
+++ trunk/pywikipedia/interwiki.py 2009-04-30 01:47:36 UTC (rev 6756)
@@ -270,13 +270,14 @@
 # (C) Rob W.W. Hooft, 2003
 # (C) Daniel Herding, 2004
 # (C) Yuri Astrakhan, 2005-2006
+# (C) Pywikipedia bot team, 2007-2009
 #
 # Distributed under the terms of the MIT license.
 #
 __version__ = '$Id$'
 #
-import sys, copy, re
+import sys, copy, re, os
 import time
 import codecs
 import socket
@@ -501,7 +502,74 @@
     minsubjects = config.interwiki_min_subjects
     nobackonly = False
     hintsareright = False
+    contentsondisk = config.interwiki_contents_on_disk
+class StoredPage(wikipedia.Page):
+    """
+    Store the Page contents on disk to avoid using too much memory
+    when a large number of Page objects is loaded at the same time.
+    """
+
+    # Please prefix the class member names with SP
+    # to avoid possible name clashes with wikipedia.Page
+
+    # path to the shelve
+    SPpath = None
+    # shelve
+    SPstore = None
+
+    # attributes created by wikipedia.Page.__init__
+    SPcopy = [ '_editrestriction',
+               '_site',
+               '_namespace',
+               '_section',
+               '_title',
+               'editRestriction',
+               'moveRestriction',
+               '_permalink',
+               '_userName',
+               '_ipedit',
+               '_editTime',
+               '_startTime',
+               '_revisionId',
+               '_deletedRevs' ]
+
+    def SPdeleteStore():
+        del StoredPage.SPstore
+        os.unlink(StoredPage.SPpath)
+    SPdeleteStore = staticmethod(SPdeleteStore)
+
+    def __init__(self, page):
+        for attr in StoredPage.SPcopy:
+            setattr(self, attr, getattr(page, attr))
+
+        if not StoredPage.SPpath:
+            import shelve
+            index = 1
+            while True:
+                path = config.datafilepath('cache', 'pagestore' + str(index))
+                if not os.path.exists(path): break
+                index += 1
+            StoredPage.SPpath = path
+            StoredPage.SPstore = shelve.open(path)
+
+        self.SPkey = self.aslink().encode('utf-8')
+        self.SPcontentSet = False
+
+    def SPgetContents(self):
+        return StoredPage.SPstore[self.SPkey]
+
+    def SPsetContents(self, contents):
+        self.SPcontentSet = True
+        StoredPage.SPstore[self.SPkey] = contents
+
+    def SPdelContents(self):
+        if self.SPcontentSet:
+            del StoredPage.SPstore[self.SPkey]
+
+    _contents = property(SPgetContents, SPsetContents, SPdelContents)
+
 class PageTree(object):
     """
     Structure to manipulate a set of pages.
@@ -642,6 +710,10 @@
     def __init__(self, originPage, hints = None):
         """Constructor. Takes as arguments the Page on the home wiki
         plus optionally a list of hints for translation"""
+
+        if globalvar.contentsondisk:
+            originPage = StoredPage(originPage)
+
         # Remember the "origin page"
         self.originPage = originPage
         # todo is a list of all pages that still need to be analyzed.
@@ -784,6 +856,10 @@
             wikipedia.output("%s has a backlink from %s."%(page,linkingPage))
             self.makeForcedStop(counter)
             return False
+
+        if globalvar.contentsondisk:
+            page = StoredPage(page)
+
         if page in self.foundIn:
             # not new
             self.foundIn[page].append(linkingPage)
@@ -1333,6 +1409,22 @@
         if config.interwiki_backlink:
             self.reportBacklinks(new, updatedSites)
+ """ + Delete the contents that are stored on disk for this Subject. + + We cannot afford to define this in a StoredPage destructor because + StoredPage instances can get referenced cyclicly: that would stop the + garbage collector from destroying some of those objects. + + It's also not necessary to set theses line as a Subject destructor: + deleting all stored content one entry by one entry when bailing out + after a KeyboardInterrupt for example is redundant, because the + whole storage file will be eventually removed. + """ + if globalvar.contentsondisk: + for storedPage in self.foundIn: + storedPage.SPdelContents() + def replaceLinks(self, page, newPages, bot): """ Returns True if saving was successful. @@ -2034,6 +2126,9 @@ except: bot.dump() raise + finally: + if globalvar.contentsondisk: + StoredPage.SPdeleteStore()
     finally:
         wikipedia.stopme()
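The heart of StoredPage is the last line of the class: it rebinds
_contents, the attribute in which wikipedia.Page caches the wikitext, to a
property whose getter, setter and deleter go through a class-level shelve,
so existing code keeps using page._contents unchanged while the text
actually lives on disk. A stripped-down sketch of the same pattern, with
invented names (DiskBacked, value) for illustration:

    import shelve

    class DiskBacked(object):
        store = shelve.open('demo-store')      # one file-backed dict shared by all instances

        def __init__(self, key):
            self.key = key                     # shelve keys must be byte strings

        def getValue(self):
            return DiskBacked.store[self.key]  # unpickled from disk on access

        def setValue(self, value):
            DiskBacked.store[self.key] = value # pickled to disk, not kept in RAM

        def delValue(self):
            del DiskBacked.store[self.key]

        # callers keep plain attribute syntax: obj.value = '...'
        value = property(getValue, setValue, delValue)

Code that reads or writes obj.value never notices that the data is on
disk, which is why interwiki.py needs no changes at the points where it
touches page._contents.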
pywikipedia-svn@lists.wikimedia.org