Revision: 5466 Author: btongminh Date: 2008-05-29 21:34:39 +0000 (Thu, 29 May 2008)
Log Message: ----------- Add a disk cache (disabled by default) for mediawiki_message, which should decrease memory usage when working with many sites.
Modified Paths: -------------- trunk/pywikipedia/config.py trunk/pywikipedia/wikipedia.py
Added Paths: ----------- trunk/pywikipedia/cache/ trunk/pywikipedia/diskcache.py
Modified: trunk/pywikipedia/config.py =================================================================== --- trunk/pywikipedia/config.py 2008-05-29 14:53:40 UTC (rev 5465) +++ trunk/pywikipedia/config.py 2008-05-29 21:34:39 UTC (rev 5466) @@ -392,6 +392,9 @@ # foreign wiki, set cosmetic_changes_mylang_only to False, but be careful! cosmetic_changes_mylang_only = True
+# Use the experimental disk cache to prevent huge memory usage +use_diskcache = False + # End of configuration section # ============================ # System-level and User-level changes.
# Added: trunk/pywikipedia/diskcache.py (rev 5466, 2008-05-29 21:34:39 UTC)
import os
import random

## Dictionary like disk caching module
## (c) Copyright 2008 - Bryan Tong Minh / The Pywikipediabot team
## Licensed under the terms of the MIT license

# Text and byte string types, spelled so that the module runs unchanged on
# Python 2 (unicode / str) and Python 3 (str / bytes).
_TEXT_TYPE = type(u'')
_BYTES_TYPE = type(u''.encode('ascii'))
_TAB = u'\t'.encode('ascii')


def _to_bytes(obj):
    """Return obj as a byte string (text is UTF-8 encoded, others str()'d)."""
    if isinstance(obj, _BYTES_TYPE):
        return obj
    if not isinstance(obj, _TEXT_TYPE):
        obj = str(obj)
        if isinstance(obj, _BYTES_TYPE):
            # Python 2: str() already produced a byte string.
            return obj
    return obj.encode('utf-8')


def _normalise_key(key):
    """Return the lower-cased byte string form under which key is stored.

    The same function is used when writing and when looking up, so both
    sides always agree on the on-disk spelling of a key.
    """
    if not isinstance(key, (_TEXT_TYPE, _BYTES_TYPE)):
        key = str(key)
    key = key.lower()
    if isinstance(key, _TEXT_TYPE):
        key = key.encode('utf-8')
    return key


def _bucket(key):
    """Return the lookup slot (0-35) for a normalised key, or -1 if invalid.

    Slots 0-25 are the letters 'a'-'z', slots 26-35 the digits '0'-'9'.
    """
    if not key:
        return -1
    code = ord(key[:1])
    if 97 <= code <= 122:
        return code - 97
    if 48 <= code <= 57:
        return code - 48 + 26
    return -1


class CachedReadOnlyDictI(object):
    """A cached readonly dict with case insensitive keys.

    All items are written to a randomly named file below cache_base as
    records of the form

        <2 hex digits: key length><key><4 hex digits: value length><value>

    sorted by key.  self.lookup holds, for every possible first character
    of a key, the file offset of the first record starting with that
    character (or -1 if there is none).  The most recently read items are
    additionally kept in the in-memory list self.cache.

    Keys must start with a letter a-z or a digit 0-9 (after lower-casing),
    must not contain a tab, and are limited to 0xFF bytes; values are
    limited to 0xFFFF bytes.  Values are always returned as text, decoded
    from UTF-8; non-string values are stored as their str() form.
    """

    def __init__(self, data, max_size=10, cache_base='cache'):
        """Write data to a new cache file.

        data       -- iterable of (key, value) pairs; it is not modified
        max_size   -- maximum number of items kept in memory
        cache_base -- existing directory in which the cache file is created

        Raises RuntimeError for unsupported keys and for keys or values
        that are too long.
        """
        self.max_size = max_size
        self.cache = []
        while True:
            self.cache_path = os.path.join(cache_base, ''.join(
                [random.choice('abcdefghijklmnopqrstuvwxyz')
                 for i in range(16)]))
            if not os.path.exists(self.cache_path):
                break
        self.cache_file = open(self.cache_path, 'wb+')

        # Sort by the *normalised* key: the bucket offsets below and the
        # early exit in __getitem__ both rely on this order.
        entries = [(_normalise_key(key), value) for key, value in data]
        entries.sort(key=lambda entry: entry[0])

        lookup = [-1] * 36
        for key, value in entries:
            slot = _bucket(key)
            if slot < 0 or _TAB in key:
                raise RuntimeError('Only alphabetic keys are supported', key)

            value = _to_bytes(value)
            if len(key) > 0xFF:
                raise RuntimeError('Key length must be smaller than %i' % 0xFF)
            if len(value) > 0xFFFF:
                raise RuntimeError('Value length must be smaller than %i' % 0xFFFF)

            if lookup[slot] == -1:
                lookup[slot] = self.cache_file.tell()
            self.cache_file.write(('%02x' % len(key)).encode('ascii') + key +
                                  ('%04x' % len(value)).encode('ascii') + value)

        self.lookup = lookup
        self._size = len(entries)
        self.cache_file.seek(0)

    def __del__(self):
        """Close and remove the cache file (best effort)."""
        cache_file = getattr(self, 'cache_file', None)
        if cache_file is None:
            # open() in __init__ never succeeded; nothing to clean up.
            return
        cache_file.close()
        # Imported locally: module globals may already have been cleared
        # when this runs during interpreter shutdown.
        import os
        try:
            os.unlink(self.cache_path)
        except OSError:
            # Already removed, or not removable; nothing useful can be
            # done about it from a destructor.
            pass

    def __len__(self):
        """Return the number of stored items.

        This also makes an empty instance evaluate as false, like a dict.
        """
        return self._size

    def __contains__(self, key):
        """Return True if key (case insensitive) is stored."""
        try:
            self[key]
        except KeyError:
            return False
        return True

    def __getitem__(self, key):
        """Return the value stored for key (case insensitive) as text.

        Raises KeyError if the key is not stored.
        """
        wanted = _normalise_key(key)
        slot = _bucket(wanted)
        if slot < 0 or self.lookup[slot] == -1:
            raise KeyError(key)

        for position, (k, v) in enumerate(self.cache):
            if k == wanted:
                # Move the hit to the most-recently-used end of the list.
                self.cache.append(self.cache.pop(position))
                return v

        self.cache_file.seek(self.lookup[slot])
        while True:
            k = self.read(int(self.read(2, key), 16), key)
            length = int(self.read(4, key), 16)
            if k == wanted:
                value = self.read(length, key).decode('utf-8')
                self.cache.append((wanted, value))
                while len(self.cache) > self.max_size:
                    del self.cache[0]
                return value
            if k > wanted:
                # Records are sorted, so the key cannot follow later on.
                raise KeyError(key)
            # Not the wanted record: skip its value.
            self.cache_file.seek(length, os.SEEK_CUR)

    def read(self, length, key=''):
        """Read exactly length bytes from the cache file.

        Raises KeyError(key) if the file ends before length bytes could be
        read.  A length of 0 is valid and returns an empty byte string.
        """
        s = self.cache_file.read(length)
        if len(s) < length:
            raise KeyError(key)
        return s
Property changes on: trunk/pywikipedia/diskcache.py ___________________________________________________________________ Name: svn:eol-style + native
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-05-29 14:53:40 UTC (rev 5465) +++ trunk/pywikipedia/wikipedia.py 2008-05-29 21:34:39 UTC (rev 5466) @@ -127,6 +127,7 @@ import xmlreader from BeautifulSoup import * import simplejson +import diskcache
# Set the locale to system default. This will ensure correct string # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no @@ -4477,6 +4478,11 @@ output(u'Elementtree was not found, using BeautifulSoup instead') elementtree = False
+ if config.use_diskcache: + _dict = diskcache.CachedReadOnlyDictI + else: + _dict = dict + retry_idle_time = 1 while True: get_throttle() @@ -4490,13 +4496,13 @@ if elementtree: decode = xml.encode(self.encoding()) tree = XML(decode) - self._mediawiki_messages = dict([(tag.get('name').lower(), tag.text) - for tag in tree.getiterator('message')]) + self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.text) + for tag in tree.getiterator('message')]) else: tree = BeautifulStoneSoup(xml) - self._mediawiki_messages = dict([(tag.get('name').lower(), tag.string) - for tag in tree.findAll('message')]) - + self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.string) + for tag in tree.findAll('message')]) + if not self._mediawiki_messages: # No messages could be added. # We assume that the server is down.
pywikipedia-l@lists.wikimedia.org