Revision: 6756
Author: nicdumz
Date: 2009-04-30 01:47:36 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Adding an experimental contents_on_disk feature:
save the Page contents on disk, in a Python shelve database, and load them
only when needed, instead of keeping the contents in RAM.
Activating this option might slow down the whole interwiki
process a bit: fetching an entry from disk is slower than simply fetching
the attribute in RAM. However, this should greatly reduce memory consumption.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2009-04-30 00:26:14 UTC (rev 6755)
+++ trunk/pywikipedia/config.py 2009-04-30 01:47:36 UTC (rev 6756)
@@ -234,6 +234,11 @@
# Save file with local articles without interwikis.
without_interwiki = False
+# Experimental feature:
+# Store the page contents on disk (/cache/ directory) instead of loading
+# them in RAM.
+interwiki_contents_on_disk = False
+
############## SOLVE_DISAMBIGUATION SETTINGS ############
#
# Set disambiguation_comment[FAMILY][LANG] to a non-empty string to override
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-30 00:26:14 UTC (rev 6755)
+++ trunk/pywikipedia/interwiki.py 2009-04-30 01:47:36 UTC (rev 6756)
@@ -270,13 +270,14 @@
# (C) Rob W.W. Hooft, 2003
# (C) Daniel Herding, 2004
# (C) Yuri Astrakhan, 2005-2006
+# (C) Pywikipedia bot team, 2007-2009
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
#
-import sys, copy, re
+import sys, copy, re, os
import time
import codecs
import socket
@@ -501,7 +502,74 @@
minsubjects = config.interwiki_min_subjects
nobackonly = False
hintsareright = False
+ contentsondisk = config.interwiki_contents_on_disk
+class StoredPage(wikipedia.Page):
+ """
+    Store the Page contents on disk to avoid consuming too much
+    memory when a large number of Page objects are loaded
+    at the same time.
+ """
+
+ # Please prefix the class members names by SP
+ # to avoid possible name clashes with wikipedia.Page
+
+ # path to the shelve
+ SPpath = None
+ # shelve
+ SPstore = None
+
+ # attributes created by wikipedia.Page.__init__
+ SPcopy = [ '_editrestriction',
+ '_site',
+ '_namespace',
+ '_section',
+ '_title',
+ 'editRestriction',
+ 'moveRestriction',
+ '_permalink',
+ '_userName',
+ '_ipedit',
+ '_editTime',
+ '_startTime',
+ '_revisionId',
+ '_deletedRevs' ]
+
+ def SPdeleteStore():
+ del StoredPage.SPstore
+ os.unlink(StoredPage.SPpath)
+ SPdeleteStore = staticmethod(SPdeleteStore)
+
+ def __init__(self, page):
+ for attr in StoredPage.SPcopy:
+ setattr(self, attr, getattr(page, attr))
+
+ if not StoredPage.SPpath:
+ import shelve
+ index = 1
+ while True:
+ path = config.datafilepath('cache', 'pagestore' + str(index))
+ if not os.path.exists(path): break
+ index += 1
+ StoredPage.SPpath = path
+ StoredPage.SPstore = shelve.open(path)
+
+ self.SPkey = self.aslink().encode('utf-8')
+ self.SPcontentSet = False
+
+ def SPgetContents(self):
+ return StoredPage.SPstore[self.SPkey]
+
+ def SPsetContents(self, contents):
+ self.SPcontentSet = True
+ StoredPage.SPstore[self.SPkey] = contents
+
+ def SPdelContents(self):
+ if self.SPcontentSet:
+ del StoredPage.SPstore[self.SPkey]
+
+ _contents = property(SPgetContents, SPsetContents, SPdelContents)
+
class PageTree(object):
"""
Structure to manipulate a set of pages.
@@ -642,6 +710,10 @@
def __init__(self, originPage, hints = None):
"""Constructor. Takes as arguments the Page on the home wiki
plus optionally a list of hints for translation"""
+
+        if globalvar.contentsondisk:
+ originPage = StoredPage(originPage)
+
# Remember the "origin page"
self.originPage = originPage
# todo is a list of all pages that still need to be analyzed.
@@ -784,6 +856,10 @@
wikipedia.output("%s has a backlink from %s."%(page,linkingPage))
self.makeForcedStop(counter)
return False
+
+        if globalvar.contentsondisk:
+ page = StoredPage(page)
+
if page in self.foundIn:
# not new
self.foundIn[page].append(linkingPage)
@@ -1333,6 +1409,22 @@
if config.interwiki_backlink:
self.reportBacklinks(new, updatedSites)
+ """
+ Delete the contents that are stored on disk for this Subject.
+
+ We cannot afford to define this in a StoredPage destructor because
+        StoredPage instances can get referenced cyclically: that would stop the
+ garbage collector from destroying some of those objects.
+
+        It's also not necessary to put these lines in a Subject destructor:
+        deleting all stored content entry by entry when bailing out
+ after a KeyboardInterrupt for example is redundant, because the
+ whole storage file will be eventually removed.
+ """
+ if globalvar.contentsondisk:
+ for storedPage in self.foundIn:
+ storedPage.SPdelContents()
+
def replaceLinks(self, page, newPages, bot):
"""
Returns True if saving was successful.
@@ -2034,6 +2126,9 @@
except:
bot.dump()
raise
+ finally:
+ if globalvar.contentsondisk:
+ StoredPage.SPdeleteStore()
finally:
wikipedia.stopme()