[Pywikipedia-l] SVN: [5466] trunk/pywikipedia

btongminh at svn.wikimedia.org btongminh at svn.wikimedia.org
Thu May 29 21:34:40 UTC 2008


Revision: 5466
Author:   btongminh
Date:     2008-05-29 21:34:39 +0000 (Thu, 29 May 2008)

Log Message:
-----------
Add a disk cache (disabled by default) for mediawiki_messages, which should decrease memory usage when working with many sites

Modified Paths:
--------------
    trunk/pywikipedia/config.py
    trunk/pywikipedia/wikipedia.py

Added Paths:
-----------
    trunk/pywikipedia/cache/
    trunk/pywikipedia/diskcache.py

Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py	2008-05-29 14:53:40 UTC (rev 5465)
+++ trunk/pywikipedia/config.py	2008-05-29 21:34:39 UTC (rev 5466)
@@ -392,6 +392,9 @@
 # foreign wiki, set cosmetic_changes_mylang_only to False, but be careful!
 cosmetic_changes_mylang_only = True
 
+# Use the experimental disk cache to prevent huge memory usage
+use_diskcache = False
+
 # End of configuration section
 # ============================
 # System-level and User-level changes.

Added: trunk/pywikipedia/diskcache.py
===================================================================
--- trunk/pywikipedia/diskcache.py	                        (rev 0)
+++ trunk/pywikipedia/diskcache.py	2008-05-29 21:34:39 UTC (rev 5466)
@@ -0,0 +1,105 @@
+import os
+import random
+
+## Dictionary like disk caching module
+## (c) Copyright 2008 - Bryan Tong Minh / The Pywikipediabot team
+## Licensed under the terms of the MIT license
+
+class CachedReadOnlyDictI(object):
+    """A cached readonly dict with case insensitive keys."""
+    def __init__(self, data, max_size = 10, cache_base = 'cache'):
+        self.max_size = max_size
+        while True:
+            self.cache_path = os.path.join(cache_base, ''.join(
+                [random.choice('abcdefghijklmnopqrstuvwxyz') 
+                    for i in xrange(16)]))
+            if not os.path.exists(self.cache_path): break
+        self.cache_file = open(self.cache_path, 'wb+')
+        
+        lookup = [-1 for i in xrange(36)]
+        data.sort(key = lambda i: i[0])
+        for key, value in data:
+            if type(key) is unicode:
+                key = key.encode('utf-8')
+            elif type(key) != str:
+                key = str(key)
+            key = key.lower()
+            index = key[0]
+            if not ((index >= 'a' and index <= 'z') or (index >= '0' and index <= '9')) or '\t' in key:
+                raise RuntimeError('Only alphabetic keys are supported', key)
+            
+            if index < 'a':
+                index = ord(index) - 48 + 26# Numeric
+            else:
+                index = ord(index) - 97
+            if lookup[index] == -1:
+                lookup[index] = self.cache_file.tell()
+            
+            if type(value) is unicode:
+                value = value.encode('utf-8')
+            elif type(value) != str:
+                value = str(value)
+                
+            if len(key) > 0xFF:
+                raise RuntimeError('Key length must be smaller than %i' % 0xFF)
+            if len(value) > 0xFFFF:
+                raise RuntimeError('Value length must be smaller than %i' % 0xFFFF)
+                
+            self.cache_file.write('%02x%s%04x%s' % (len(key), key, len(value), value))
+            
+        self.lookup = lookup
+        self.cache_file.seek(0)
+        self.cache = []
+    
+    def __del__(self):
+        self.cache_file.close()
+        import os
+        os.unlink(self.cache_path)
+        os = None
+        
+    def __getitem__(self, key):
+        key = key.lower()
+        if type(key) is unicode:
+            key = key.encode('utf-8')
+            
+        index = key[0]
+        if not ((index >= 'a' and index <= 'z') or (index >= '0' and index <= '9')):
+            raise KeyError(key)
+        
+        if index < 'a':
+            if index < '0' or index > '9':
+                raise KeyError(key)
+            i = ord(index) - 48 + 26# Numeric
+        else:
+            if index > 'z': 
+                raise KeyError(key)
+            i = ord(index) - 97
+        
+        for k, v in self.cache:
+            if k == key:
+                self.cache.remove((k, v))
+                self.cache.append((k, v))
+        
+        self.cache_file.seek(self.lookup[i])
+        while True:
+            length = int(self.read(2, key), 16)
+            k = self.read(length, key)
+            if k == key:
+                length = int(self.read(4, key), 16)
+                value = self.read(length, key).decode('utf-8')
+                if len(self.cache) > self.max_size:
+                    del self.cache[0]
+                self.cache.append((key, value))
+                return value
+            
+            elif k[0] != index:
+                raise KeyError(key)
+            
+            length = int(self.read(4, key), 16)
+            self.cache_file.seek(length, os.SEEK_CUR)
+        
+        
+    def read(self, length, key = ''):
+        s = self.cache_file.read(length)
+        if not s: raise KeyError(key)
+        return s


Property changes on: trunk/pywikipedia/diskcache.py
___________________________________________________________________
Name: svn:eol-style
   + native

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2008-05-29 14:53:40 UTC (rev 5465)
+++ trunk/pywikipedia/wikipedia.py	2008-05-29 21:34:39 UTC (rev 5466)
@@ -127,6 +127,7 @@
 import xmlreader
 from BeautifulSoup import *
 import simplejson
+import diskcache
 
 # Set the locale to system default. This will ensure correct string
 # handling for non-latin characters on Python 2.3.x. For Python 2.4.x it's no
@@ -4477,6 +4478,11 @@
                     output(u'Elementtree was not found, using BeautifulSoup instead')
                 elementtree = False
 
+            if config.use_diskcache:
+                _dict = diskcache.CachedReadOnlyDictI
+            else:
+                _dict = dict
+
             retry_idle_time = 1
             while True:
                 get_throttle()
@@ -4490,13 +4496,13 @@
                 if elementtree:
                     decode = xml.encode(self.encoding())
                     tree = XML(decode)
-                    self._mediawiki_messages = dict([(tag.get('name').lower(), tag.text)
-                    for tag in tree.getiterator('message')])
+                    self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.text)
+                            for tag in tree.getiterator('message')])
                 else:
                     tree = BeautifulStoneSoup(xml)
-                    self._mediawiki_messages = dict([(tag.get('name').lower(), tag.string)
-                    for tag in tree.findAll('message')])
-
+                    self._mediawiki_messages = _dict([(tag.get('name').lower(), tag.string)
+                            for tag in tree.findAll('message')])
+                
                 if not self._mediawiki_messages:
                     # No messages could be added.
                     # We assume that the server is down.





More information about the Pywikipedia-l mailing list