Revision: 6760
Author: nicdumz
Date: 2009-04-30 04:52:44 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Adding the possibility to use "\n" (interpreted as a newline) in the -text command-line argument
Modified Paths:
--------------
trunk/pywikipedia/add_text.py
Modified: trunk/pywikipedia/add_text.py
===================================================================
--- trunk/pywikipedia/add_text.py 2009-04-30 04:41:00 UTC (rev 6759)
+++ trunk/pywikipedia/add_text.py 2009-04-30 04:52:44 UTC (rev 6760)
@@ -11,7 +11,7 @@
Furthermore, the following command line parameters are supported:
-page Use a page as generator
--text Define which text add
+-text Define which text add. "\n" are interpreted as newlines.
-summary Define the summary to use
-except Use a regex to understand if the template is already in the page
-excepturl Use the html page as text where you want to see if there's the text, not the wiki-page.
@@ -174,6 +174,8 @@
#nn got a message between the categories and the iw's and they want to keep it there, first remove it
if (site.language()==u'nn'):
newtext = newtext.replace(nn_iw_msg, '')
+ # Translating the \\n into binary \n
+ addText = addText.replace('\\n', '\n')
# Adding the text
newtext += u"\n%s" % addText
# Reputting the categories
Revision: 6758
Author: nicdumz
Date: 2009-04-30 01:51:08 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Typo globarvar -> globalvar
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-30 01:50:23 UTC (rev 6757)
+++ trunk/pywikipedia/interwiki.py 2009-04-30 01:51:08 UTC (rev 6758)
@@ -712,7 +712,7 @@
"""Constructor. Takes as arguments the Page on the home wiki
plus optionally a list of hints for translation"""
- if globarvar.contentsondisk:
+ if globalvar.contentsondisk:
originPage = StoredPage(originPage)
# Remember the "origin page"
@@ -858,7 +858,7 @@
self.makeForcedStop(counter)
return False
- if globarvar.contentsondisk:
+ if globalvar.contentsondisk:
page = StoredPage(page)
if page in self.foundIn:
Revision: 6756
Author: nicdumz
Date: 2009-04-30 01:47:36 +0000 (Thu, 30 Apr 2009)
Log Message:
-----------
Adding an experimental contents_on_disk feature:
save the Page contents on disk, in a Python shelf, and load them
only when needed, instead of keeping the contents in RAM.
Activating this option might slow down the whole interwiki process
a bit: fetching an entry from disk is slower than simply reading the
attribute from RAM. This should, however, greatly reduce memory consumption.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2009-04-30 00:26:14 UTC (rev 6755)
+++ trunk/pywikipedia/config.py 2009-04-30 01:47:36 UTC (rev 6756)
@@ -234,6 +234,11 @@
# Save file with local articles without interwikis.
without_interwiki = False
+# Experimental feature:
+# Store the page contents on disk (/cache/ directory) instead of loading
+# them in RAM.
+interwiki_contents_on_disk = False
+
############## SOLVE_DISAMBIGUATION SETTINGS ############
#
# Set disambiguation_comment[FAMILY][LANG] to a non-empty string to override
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-30 00:26:14 UTC (rev 6755)
+++ trunk/pywikipedia/interwiki.py 2009-04-30 01:47:36 UTC (rev 6756)
@@ -270,13 +270,14 @@
# (C) Rob W.W. Hooft, 2003
# (C) Daniel Herding, 2004
# (C) Yuri Astrakhan, 2005-2006
+# (C) Pywikipedia bot team, 2007-2009
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
#
-import sys, copy, re
+import sys, copy, re, os
import time
import codecs
import socket
@@ -501,7 +502,74 @@
minsubjects = config.interwiki_min_subjects
nobackonly = False
hintsareright = False
+ contentsondisk = config.interwiki_contents_on_disk
+class StoredPage(wikipedia.Page):
+ """
+ Store the Page contents on disk to avoid sucking too much
+ memory when a big number of Page objects will be loaded
+ at the same time.
+ """
+
+ # Please prefix the class members names by SP
+ # to avoid possible name clashes with wikipedia.Page
+
+ # path to the shelve
+ SPpath = None
+ # shelve
+ SPstore = None
+
+ # attributes created by wikipedia.Page.__init__
+ SPcopy = [ '_editrestriction',
+ '_site',
+ '_namespace',
+ '_section',
+ '_title',
+ 'editRestriction',
+ 'moveRestriction',
+ '_permalink',
+ '_userName',
+ '_ipedit',
+ '_editTime',
+ '_startTime',
+ '_revisionId',
+ '_deletedRevs' ]
+
+ def SPdeleteStore():
+ del StoredPage.SPstore
+ os.unlink(StoredPage.SPpath)
+ SPdeleteStore = staticmethod(SPdeleteStore)
+
+ def __init__(self, page):
+ for attr in StoredPage.SPcopy:
+ setattr(self, attr, getattr(page, attr))
+
+ if not StoredPage.SPpath:
+ import shelve
+ index = 1
+ while True:
+ path = config.datafilepath('cache', 'pagestore' + str(index))
+ if not os.path.exists(path): break
+ index += 1
+ StoredPage.SPpath = path
+ StoredPage.SPstore = shelve.open(path)
+
+ self.SPkey = self.aslink().encode('utf-8')
+ self.SPcontentSet = False
+
+ def SPgetContents(self):
+ return StoredPage.SPstore[self.SPkey]
+
+ def SPsetContents(self, contents):
+ self.SPcontentSet = True
+ StoredPage.SPstore[self.SPkey] = contents
+
+ def SPdelContents(self):
+ if self.SPcontentSet:
+ del StoredPage.SPstore[self.SPkey]
+
+ _contents = property(SPgetContents, SPsetContents, SPdelContents)
+
class PageTree(object):
"""
Structure to manipulate a set of pages.
@@ -642,6 +710,10 @@
def __init__(self, originPage, hints = None):
"""Constructor. Takes as arguments the Page on the home wiki
plus optionally a list of hints for translation"""
+
+ if globarvar.contentsondisk:
+ originPage = StoredPage(originPage)
+
# Remember the "origin page"
self.originPage = originPage
# todo is a list of all pages that still need to be analyzed.
@@ -784,6 +856,10 @@
wikipedia.output("%s has a backlink from %s."%(page,linkingPage))
self.makeForcedStop(counter)
return False
+
+ if globarvar.contentsondisk:
+ page = StoredPage(page)
+
if page in self.foundIn:
# not new
self.foundIn[page].append(linkingPage)
@@ -1333,6 +1409,22 @@
if config.interwiki_backlink:
self.reportBacklinks(new, updatedSites)
+ """
+ Delete the contents that are stored on disk for this Subject.
+
+ We cannot afford to define this in a StoredPage destructor because
+ StoredPage instances can get referenced cyclicly: that would stop the
+ garbage collector from destroying some of those objects.
+
+ It's also not necessary to set theses line as a Subject destructor:
+ deleting all stored content one entry by one entry when bailing out
+ after a KeyboardInterrupt for example is redundant, because the
+ whole storage file will be eventually removed.
+ """
+ if globalvar.contentsondisk:
+ for storedPage in self.foundIn:
+ storedPage.SPdelContents()
+
def replaceLinks(self, page, newPages, bot):
"""
Returns True if saving was successful.
@@ -2034,6 +2126,9 @@
except:
bot.dump()
raise
+ finally:
+ if globalvar.contentsondisk:
+ StoredPage.SPdeleteStore()
finally:
wikipedia.stopme()
Revision: 6753
Author: nicdumz
Date: 2009-04-29 21:41:21 +0000 (Wed, 29 Apr 2009)
Log Message:
-----------
Do not keep the diskcache file open in write mode:
once the data has been read, we only need to read from that file.
Modified Paths:
--------------
trunk/pywikipedia/diskcache.py
Modified: trunk/pywikipedia/diskcache.py
===================================================================
--- trunk/pywikipedia/diskcache.py 2009-04-29 21:39:55 UTC (rev 6752)
+++ trunk/pywikipedia/diskcache.py 2009-04-29 21:41:21 UTC (rev 6753)
@@ -58,6 +58,9 @@
self.cache_file.write('%02x%s%06x%s' % (len(key), key, len(value), value))
self.lookup = lookup
+
+ self.cache_file.close()
+ self.cache_file = open(self.cache_path, 'rb')
self.cache_file.seek(0)
self.cache = []
Revision: 6752
Author: nicdumz
Date: 2009-04-29 21:39:55 +0000 (Wed, 29 Apr 2009)
Log Message:
-----------
Do not check the key range again: that was already done two lines earlier
Modified Paths:
--------------
trunk/pywikipedia/diskcache.py
Modified: trunk/pywikipedia/diskcache.py
===================================================================
--- trunk/pywikipedia/diskcache.py 2009-04-29 16:21:41 UTC (rev 6751)
+++ trunk/pywikipedia/diskcache.py 2009-04-29 21:39:55 UTC (rev 6752)
@@ -80,12 +80,8 @@
raise KeyError(key)
if index < 'a':
- if index < '0' or index > '9':
- raise KeyError(key)
i = ord(index) - 48 + 26 # Numeric
else:
- if index > 'z':
- raise KeyError(key)
i = ord(index) - 97
for k, v in self.cache: