[Pywikipedia-l] SVN: [5466] trunk/pywikipedia
btongminh at svn.wikimedia.org
btongminh at svn.wikimedia.org
Thu May 29 21:34:40 UTC 2008
Revision: 5466
Author: btongminh
Date: 2008-05-29 21:34:39 +0000 (Thu, 29 May 2008)
Log Message:
-----------
Add a disk cache (disabled by default) for mediawiki_messages, which should decrease memory usage when working with many sites.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/wikipedia.py
Added Paths:
-----------
trunk/pywikipedia/cache/
trunk/pywikipedia/diskcache.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2008-05-29 14:53:40 UTC (rev 5465)
+++ trunk/pywikipedia/config.py 2008-05-29 21:34:39 UTC (rev 5466)
@@ -392,6 +392,9 @@
# foreign wiki, set cosmetic_changes_mylang_only to False, but be careful!
cosmetic_changes_mylang_only = True
+# Use the experimental disk cache to prevent huge memory usage
+use_diskcache = False
+
# End of configuration section
# ============================
# System-level and User-level changes.
Added: trunk/pywikipedia/diskcache.py
===================================================================
--- trunk/pywikipedia/diskcache.py (rev 0)
+++ trunk/pywikipedia/diskcache.py 2008-05-29 21:34:39 UTC (rev 5466)
@@ -0,0 +1,105 @@
+import os
+import random
+
+## Dictionary like disk caching module
+## (c) Copyright 2008 - Bryan Tong Minh / The Pywikipediabot team
+## Licensed under the terms of the MIT license
+
+class CachedReadOnlyDictI(object):
+ """A cached readonly dict with case insensitive keys."""
+ def __init__(self, data, max_size = 10, cache_base = 'cache'):
+ self.max_size = max_size
+ while True:
+ self.cache_path = os.path.join(cache_base, ''.join(
+ [random.choice('abcdefghijklmnopqrstuvwxyz')
+ for i in xrange(16)]))
+ if not os.path.exists(self.cache_path): break
+ self.cache_file = open(self.cache_path, 'wb+')
+
+ lookup = [-1 for i in xrange(36)]
+ data.sort(key = lambda i: i[0])
+ for key, value in data:
+ if type(key) is unicode:
+ key = key.encode('utf-8')
+ elif type(key) != str:
+ key = str(key)
+ key = key.lower()
+ index = key[0]
+ if not ((index >= 'a' and index <= 'z') or (index >= '0' and index <= '9')) or '\t' in key:
+ raise RuntimeError('Only alphabetic keys are supported', key)
+
+ if index < 'a':
+ index = ord(index) - 48 + 26# Numeric
+ else:
+ index = ord(index) - 97
+ if lookup[index] == -1:
+ lookup[index] = self.cache_file.tell()
+
+ if type(value) is unicode:
+ value = value.encode('utf-8')
+ elif type(value) != str:
+ value = str(value)
+
+ if len(key) > 0xFF:
+ raise RuntimeError('Key length must be smaller than %i' % 0xFF)
+ if len(value) > 0xFFFF:
+ raise RuntimeError('Value length must be smaller than %i' % 0xFFFF)
+
+ self.cache_file.write('%02x%s%04x%s' % (len(key), key, len(value), value))
+
+ self.lookup = lookup
+ self.cache_file.seek(0)
+ self.cache = []
+
+ def __del__(self):
+ self.cache_file.close()
+ import os
+ os.unlink(self.cache_path)
+ os = None
+
+ def __getitem__(self, key):
+ key = key.lower()
+ if type(key) is unicode:
+ key = key.encode('utf-8')
+
+ index = key[0]
+ if not ((index >= 'a' and index <= 'z') or (index >= '0' and index <= '9')):
+ raise KeyError(key)
+
+ if index < 'a':
+ if index < '0' or index > '9':
+ raise KeyError(key)
+ i = ord(index) - 48 + 26# Numeric
+ else:
+ if index > 'z':
+ raise KeyError(key)
+ i = ord(index) - 97
+
+ for k, v in self.cache:
+ if k == key:
+ self.cache.remove((k, v))
+ self.cache.append((k, v))
+
+ self.cache_file.seek(self.lookup[i])
+ while True:
+ length = int(self.read(2, key), 16)
+ k = self.read(length, key)
+ if k == key:
+ length = int(self.read(4, key), 16)
+ value = self.read(length, key).decode('utf-8')
+ if len(self.cache) > self.max_size:
+ del self.cache[0]
+ self.cache.append((key, value))
+ return value
+
+ elif k[0] != index:
+ raise KeyError(key)
+
+ length = int(self.read(4, key), 16)
+ self.cache_file.seek(length, os.SEEK_CUR)
+
+
+ def read(self, length, key = ''):
+ s = self.cache_file.read(length)
+ if not s: raise KeyError(key)
+ return s
Property changes on: trunk/pywikipedia/diskcache.py
___________________________________________________________________
Name: svn:eol-style
+ native
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2008-05-29 14:53:40 UTC (rev 5465)
+++ trunk/pywikipedia/wikipedia.py 2008-05-29 21:34:39 UTC (rev 5466)
@@ -127,6 +127,7 @@
import xmlreader
from BeautifulSoup import *
import simplejson
+import diskcache
# Set the locale to system default. This will ensure correct string
# handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no
@@ -4477,6 +4478,11 @@
output(u'Elementtree was not found, using BeautifulSoup instead')
elementtree = False
+ if config.use_diskcache:
+ _dict = diskcache.CachedReadOnlyDictI
+ else:
+ _dict = dict
+
retry_idle_time = 1
while True:
get_throttle()
@@ -4490,13 +4496,13 @@
if elementtree:
decode = xml.encode(self.encoding())
tree = XML(decode)
- self._mediawiki_messages = dict([(tag.get('name').lower(), tag.text)
- for tag in tree.getiterator('message')])
+ self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.text)
+ for tag in tree.getiterator('message')])
else:
tree = BeautifulStoneSoup(xml)
- self._mediawiki_messages = dict([(tag.get('name').lower(), tag.string)
- for tag in tree.findAll('message')])
-
+ self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.string)
+ for tag in tree.findAll('message')])
+
if not self._mediawiki_messages:
# No messages could be added.
# We assume that the server is down.
More information about the Pywikipedia-l
mailing list