[Pywikipedia-l] SVN: [4793] trunk

valhallasw at svn.wikimedia.org valhallasw at svn.wikimedia.org
Wed Jan 2 22:38:22 UTC 2008


Revision: 4793
Author:   valhallasw
Date:     2008-01-02 22:38:21 +0000 (Wed, 02 Jan 2008)

Log Message:
-----------
New: threadable http library, based on httplib2, with automatic (and thread-safe) cookie support. Includes easy_install-based setup.py.

Added Paths:
-----------
    trunk/threadedhttp/
    trunk/threadedhttp/epydoc.cfg
    trunk/threadedhttp/setup.py
    trunk/threadedhttp/threadedhttp/
    trunk/threadedhttp/threadedhttp/__init__.py
    trunk/threadedhttp/threadedhttp/connectionpool.py
    trunk/threadedhttp/threadedhttp/cookiejar.py
    trunk/threadedhttp/threadedhttp/dummy.py
    trunk/threadedhttp/threadedhttp/http.py
    trunk/threadedhttp/threadedhttp/threadedhttp.py

Added: trunk/threadedhttp/epydoc.cfg
===================================================================
--- trunk/threadedhttp/epydoc.cfg	                        (rev 0)
+++ trunk/threadedhttp/epydoc.cfg	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,84 @@
+[epydoc] # Epydoc section marker (required by ConfigParser)
+
+# modules
+#   The list of objects to document.  Objects can be named using
+#   dotted names, module filenames, or package directory names.
+#   Aliases for this option include "objects" and "values".
+modules: threadedhttp
+
+# output
+#   The type of output that should be generated.  Should be one
+#   of: html, text, latex, dvi, ps, pdf.
+output: html
+
+# target
+#   The path to the output directory.  May be relative or absolute.
+target: doc/
+
+# css
+#   The CSS stylesheet for HTML output.  Can be the name of a builtin
+#   stylesheet, or the name of a file.
+css: white
+
+# name
+#   The documented project's name.
+name: Threaded HTTP Library
+
+# url
+#   The documented project's URL.
+url: http://pywikipediabot.sourceforge.net
+
+# frames
+#   Whether or not to include a frames-based table of contents.
+frames: yes
+
+# private
+#   Whether or not to include private variables.  (Even if included,
+#   private variables will be hidden by default.)
+private: yes
+
+# imports
+#   Whether or not to list each module's imports.
+imports: yes
+
+# verbosity
+#   An integer indicating how verbose epydoc should be.  The default
+#   value is 0; negative values will suppress warnings and errors;
+#   positive values will give more verbose output.
+verbosity: 0
+
+# parse
+#   Whether or not parsing should be used to examine objects.
+parse: yes
+
+# introspect
+#   Whether or not introspection should be used to examine objects.
+introspect: yes
+
+# graph
+#   The list of graph types that should be automatically included
+#   in the output.  Graphs are generated using the Graphviz "dot"
+#   executable.  Graph types include: "classtree", "callgraph",
+#   "umlclass".  Use "all" to include all graph types
+graph: all
+
+# dotpath
+#   The path to the Graphviz "dot" executable, used to generate
+#   graphs.
+dotpath: /usr/bin/dot
+
+# sourcecode
+#   Whether or not to include syntax highlighted source code in
+#   the output (HTML only).
+sourcecode: no
+
+# pstat
+#   The name of one or more pstat files (generated by the profile
+#   or hotshot module).  These are used to generate call graphs.
+#pstat: profile.out
+
+# separate-classes
+#   Whether each class should be listed in its own section when
+#   generating LaTeX or PDF output.
+separate-classes: no
+

Added: trunk/threadedhttp/setup.py
===================================================================
--- trunk/threadedhttp/setup.py	                        (rev 0)
+++ trunk/threadedhttp/setup.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8  -*-
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+
+from setuptools import setup, find_packages
+# Package metadata. The classifiers list is built from the triple-quoted
+# block below: each line is stripped by map(str.strip, ...) and the empty
+# lines are then dropped by filter(None, ...).
+setup(
+    name = "threadedhttp",
+    version = "0.1",
+    packages = find_packages(exclude=['ez_setup']),
+    install_requires = ["httplib2"],
+    platforms=['any'],
+    author = "Merlijn van Deen",
+    author_email = "valhallasw at arctus.nl",
+    description = "httplib2-based HTTP library supporting cookies and threads",
+    classifiers = filter(None, map(str.strip,
+"""                 
+Intended Audience :: Developers
+License :: OSI Approved :: MIT License
+Programming Language :: Python
+Topic :: Software Development :: Libraries :: Python Modules
+""".splitlines())),
+    license = "MIT License",
+    keywords = "thread cookie httplib2",
+    url = "http://pywikipediabot.sourceforge.net",
+)
\ No newline at end of file


Property changes on: trunk/threadedhttp/setup.py
___________________________________________________________________
Name: svn:executable
   + *

Added: trunk/threadedhttp/threadedhttp/__init__.py
===================================================================
--- trunk/threadedhttp/threadedhttp/__init__.py	                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/__init__.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,8 @@
+# -*- coding: utf-8  -*-
+
+""" Thread-safe cookie-eating Http library based on httplib2
+    with simple threaded interface
+"""
+
+from http import Http
+from threadedhttp import HttpRequest, HttpProcessor
\ No newline at end of file

Added: trunk/threadedhttp/threadedhttp/connectionpool.py
===================================================================
--- trunk/threadedhttp/threadedhttp/connectionpool.py	                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/connectionpool.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,174 @@
+# -*- coding: utf-8  -*-
+""" Thread safe connection pools to share connections between threads. """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+import threading
+
+class ConnectionList(list):
+    """ List with BoundedSemaphore """
+    def __init__(self, maxnum, *args, **kwargs):
+        """ @param maxnum: BoundedSemaphores counter initialisation (maximum number of acquires)"""
+        list.__init__(self, *args, **kwargs)
+        # the semaphore limits how many connections for this identifier may
+        # be checked out of the pool simultaneously
+        self.max = threading.BoundedSemaphore(maxnum)
+
+class ConnectionPool(object):
+    def __init__(self, max_connections=25, max_connections_per_host=5):
+        """ @param max_connections: Global maximum number of connections for this pool
+            @param max_connections_per_host: maximum number of connections per host
+        """
+        self.max_connections_per_host = max_connections_per_host
+        self.global_max = threading.BoundedSemaphore(max_connections)
+        self.lock = threading.Lock()
+        self.connections = [None] * max_connections # fill known connections witn Nones
+        self.clists = {}         # 'id': (semaphore, lock, [connection1, connection2])
+        logging.log(1,'<%r>: initialized' % self)
+        
+    def __del__(self):
+        """ Destructor to close all connections in the pool.
+            Not completely thread-safe, as connections *could* return just
+            after this function. Make sure the pool is destructed only when
+            it is no longer in use!"""
+        self.lock.acquire() #prevents any connections from returning into the pool
+        try:
+            for connection in self.connections:
+                if connection is not None:
+                    connection.close()
+            del self.connections
+            del self.clists
+        finally:
+            self.lock.release()
+        
+    def pop_connection(self, identifier):
+        """ Gets a connection from identifiers connection pool
+            @param identifier The pool identifier
+            @returns A connection object if found, None otherwise
+        """
+        logging.log(1,'<%r>: acquiring global_max...' % self)
+        self.global_max.acquire()
+        logging.log(1,'<%r>: global_max acquired' % self)
+        try:
+            self.lock.acquire()
+            if identifier not in self.clists:
+                self.clists[identifier] = ConnectionList(self.max_connections_per_host)
+            clist = self.clists[identifier]
+            self.lock.release()
+            logging.log(1,'<%r>: acquiring clist.max...' % self)
+            if not clist.max.acquire(False): # acquire local lock, releasing global lock when waiting
+                logging.log(1,'<%r>: ...failed' % self)
+                self.global_max.release()
+                logging.log(logging.DEBUG,'<%r>: No host connections available, global_max released.' % self)
+                clist.max.acquire()
+                self.global_max.acquire()
+            try:
+                logging.log(1,'<%r>: ...acquired' % self)
+                # we hebben nu zowel toestemming voor een global als voor een local connection
+                # kijk eerst of er zo'n verbinding bestaat
+                self.lock.acquire()
+                try:
+                    if len(clist) > 0:
+                        connection = clist.pop()
+                        logging.log(1,'<%r>: using cached connection' % self)
+                        return connection
+                    else:
+                        # pop the oldest connection from the connection stack
+                        old_connection = self.connections.pop(0)
+                        logging.log(1,'<%r>: popped %r to make place for new connection' % (self,old_connection))
+                        if old_connection is not None:
+                            old_connection.close()
+                            for slist in self.clists.itervalues():
+                                if old_connection in clist:
+                                    clist.remove(old_connection)
+                                    break # a connection is in max one clist
+                        return None
+                finally:
+                    self.lock.release()
+            except Exception, e:
+                logging.log(20,'<%r>: Exception raised level 2 | %r' % (self, e))
+                clist.max.release()            
+                raise
+        except Exception, e:
+            logging.log(20,'<%r>: Exception raised level 1 | %r' % (self, e))
+            self.global_max.release()
+            raise
+        
+    def push_connection(self, identifier, connection):
+        """ Gets a connection from identifiers connection pool
+            @param identifier The pool identifier
+            @returns A connection object if found, None otherwise
+        """
+        self.lock.acquire()
+        try:
+            clist = self.clists[identifier]
+            clist.append(connection)
+            if connection not in self.connections:
+                self.connections.append(connection)
+            clist.max.release()
+            self.global_max.release()
+            logging.log(1, 'clist.max and global_max += 1')
+        finally:
+            self.lock.release()
+
+class BasicConnectionPool(object):
+    """ A thread-safe connection pool """
+    def __init__(self, maxnum=5):
+        """ @param maxnum: Maximum number of connections per identifier.
+                           The pool drops excessive connections added.
+        """
+        self.connections = {}
+        self.lock = threading.Lock()
+        self.maxnum = maxnum
+    
+    def __del__(self):
+        """ Destructor to close all connections in the pool """
+        self.lock.acquire()
+        try:
+            for connection in self.connections:
+                connection.close()
+            
+        finally:
+            self.lock.release()
+            
+    def __repr__(self):
+        return self.connections.__repr__()
+        
+    def pop_connection(self, identifier):
+        """ Gets a connection from identifiers connection pool
+            @param identifier: The pool identifier
+            @returns: A connection object if found, None otherwise
+        """
+        self.lock.acquire()
+        try:
+            if identifier in self.connections:
+                if len(self.connections[identifier]) > 0:
+                    return self.connections[identifier].pop()
+            return None
+        finally:
+            self.lock.release()
+            
+    def push_connection(self, identifier, connection):
+        """ Adds a connection to identifiers connection pool
+            @param identifier: The pool identifier
+            @param connection: The connection to add to the pool
+        """
+        self.lock.acquire()
+        try:
+            if identifier not in self.connections:
+                self.connections[identifier] = []
+            
+            if len(self.connections[identifier]) == self.maxnum:
+                logging.debug('closing %s connection %r' % (identifier, connection))
+                connection.close()
+                del connection
+            else:
+                self.connections[identifier].append(connection)
+        finally:
+            self.lock.release()
\ No newline at end of file

Added: trunk/threadedhttp/threadedhttp/cookiejar.py
===================================================================
--- trunk/threadedhttp/threadedhttp/cookiejar.py	                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/cookiejar.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,27 @@
+# -*- coding: utf-8  -*-
+""" Lockable CookieJar and FileCookieJar """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+import threading
+import cookielib
+
+class LockableCookieJar(cookielib.CookieJar):
+    """ CookieJar with integrated Lock object """
+    def __init__(self, *args, **kwargs):
+        cookielib.CookieJar.__init__(self, *args, **kwargs)
+        # callers are expected to acquire this lock around jar operations
+        # when the jar is shared between threads
+        self.lock = threading.Lock()
+        
+class LockableFileCookieJar(cookielib.FileCookieJar):
+    """ FileCookieJar with integrated Lock object """
+    def __init__(self, *args, **kwargs):
+        cookielib.FileCookieJar.__init__(self, *args, **kwargs)
+        # callers are expected to acquire this lock around jar operations
+        # when the jar is shared between threads
+        self.lock = threading.Lock()

Added: trunk/threadedhttp/threadedhttp/dummy.py
===================================================================
--- trunk/threadedhttp/threadedhttp/dummy.py	                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/dummy.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,116 @@
+# -*- coding: utf-8  -*-
+""" Dummy classes for CookieJar <-> httplib2 communication """
+
+#
+# (C) Metaweb Technologies, Inc., 2007
+#
+# Distributed under the terms of the license included below,
+# which is compatible with use in a MIT licensed project.
+#
+
+__version__ = '$Id$'
+
+# ========================================================================
+# Copyright (c) 2007, Metaweb Technologies, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+# 
+# THIS SOFTWARE IS PROVIDED BY METAWEB TECHNOLOGIES AND CONTRIBUTORS
+# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL METAWEB
+# TECHNOLOGIES OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# ========================================================================
+
+import logging
+import urllib
+import re
+import cookielib
+
+class DummyRequest(object):
+    """Simulated urllib2.Request object for httplib2
+       implements only what's necessary for cookielib.CookieJar to work
+    """
+    def __init__(self, url, headers=None):
+        self.url = url
+        self.headers = headers
+        self.origin_req_host = cookielib.request_host(self)
+        # split the url into scheme ('type') and remainder, then extract the
+        # host part, e.g. 'http://host/path' -> type 'http', host 'host'
+        self.type, r = urllib.splittype(url)
+        self.host, r = urllib.splithost(r)
+        if self.host:
+            self.host = urllib.unquote(self.host)
+
+    def get_full_url(self):
+        return self.url
+
+    def get_origin_req_host(self):
+        # TODO to match urllib2 this should be different for redirects
+        return self.origin_req_host
+
+    def get_type(self):
+        return self.type
+
+    def get_host(self):
+        return self.host
+
+    def get_header(self, key, default=None):
+        # header keys are stored lower-cased (see add_unredirected_header)
+        return self.headers.get(key.lower(), default)
+
+    def has_header(self, key):
+        return key in self.headers
+
+    def add_unredirected_header(self, key, val):
+        # TODO this header should not be sent on redirect
+        self.headers[key.lower()] = val
+
+    def is_unverifiable(self):
+        # TODO to match urllib2, this should be set to True when the
+        #  request is the result of a redirect
+        return False
+
+class DummyResponse(object):
+    """Simulated urllib2 response object for httplib2
+       implements only what's necessary for cookielib.CookieJar to work
+    """
+    def __init__(self, response):
+        self.response = response
+
+    def info(self):
+        # cookielib expects info() to return a mimetools.Message-like object
+        return DummyMessage(self.response)
+
+class DummyMessage(object):
+    """Simulated mimetools.Message object for httplib2
+       implements only what's necessary for cookielib.CookieJar to work
+    """
+    def __init__(self, response):
+        self.response = response
+
+    def getheaders(self, k):
+        k = k.lower()
+        #v = self.response.get(k.lower(), None)
+        if k not in self.response:
+            return []
+        #return self.response[k].split(re.compile(',\\s*'))
+
+        # httplib2 joins multiple values for the same header
+        #  using ','.  but the netscape cookie format uses ','
+        #  as part of the expires= date format.  so we have
+        #  to split carefully here - header.split(',') won't do it.
+        # the pattern treats a ',' followed by whitespace and a digit as
+        #  part of a date rather than as a value separator
+        HEADERVAL= re.compile(r'\s*(([^,]|(,\s*\d))+)')
+        return [h[0] for h in HEADERVAL.findall(self.response[k])]
\ No newline at end of file

Added: trunk/threadedhttp/threadedhttp/http.py
===================================================================
--- trunk/threadedhttp/threadedhttp/http.py	                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/http.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,145 @@
+# -*- coding: utf-8  -*-
+""" Thread-safe cookie-eating Http library based on httplib2 """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Indicated parts (C) Joe Gregorio et al, 2006 
+# Distributed under the terms of the MIT license
+#
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+
+# easy_install safeguarded dependencies
+import pkg_resources
+pkg_resources.require("httplib2")
+import httplib2
+
+# local package imports
+from cookiejar import LockableCookieJar
+from connectionpool import ConnectionPool
+from dummy import DummyRequest, DummyResponse
+
+class Http(httplib2.Http):
+    """ Subclass of httplib2.Http that uses a `LockableCookieJar` to store cookies.
+        Overrides httplib2s internal redirect support to prevent cookies 
+        being eaten by the wrong sites.
+    """
+    def __init__(self, *args, **kwargs):
+        """ @param cookiejar: (optional) CookieJar to use. A new one will be used when not supplied.
+            @param connection_pool: (optional) Connection pool to use. A new one will be used when not supplied.
+            @param max_redirects: (optional) The maximum number of redirects to follow. 5 is default.
+        """
+        self.cookiejar = kwargs.pop('cookiejar', LockableCookieJar())
+        self.connection_pool = kwargs.pop('connection_pool', ConnectionPool())
+        self.max_redirects = kwargs.pop('max_redirects', 5)
+        httplib2.Http.__init__(self, *args, **kwargs)
+
+    def request(self, uri, method="GET", body=None, headers=None, max_redirects=None, connection_type=None):
+        """ Starts an HTTP request.
+            @param uri: The uri to retrieve
+            @param method: (optional) The HTTP method to use. Default is 'GET'
+            @param body: (optional) The request body. Default is no body.
+            @param headers: (optional) Additional headers to send. Defaults include 
+                            C{connection: keep-alive}, C{user-agent} and C{content-type}.
+            @param max_redirects: (optional) The maximum number of redirects to use for this request.
+                                  The class instances max_redirects is default
+            @param connection_type: (optional) ?
+            @returns: (response, content) tuple
+        """ 
+        if max_redirects is None:
+            max_redirects = self.max_redirects
+        if headers is None:
+            headers = {}
+        # Prepare headers: any caller-supplied cookie header is discarded;
+        # cookies come exclusively from the shared cookiejar
+        headers.pop('cookie', None)
+        req = DummyRequest(uri, headers)
+        self.cookiejar.lock.acquire()
+        try:
+            self.cookiejar.add_cookie_header(req)
+        finally:
+            self.cookiejar.lock.release()
+        headers = req.headers
+        
+        # Wikimedia squids: add connection: keep-alive to request headers unless overridden
+        headers['connection'] = headers.pop('connection', 'keep-alive')
+        
+        # determine connection pool key and fetch connection
+        (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
+        conn_key = scheme+":"+authority
+        
+        connection = self.connection_pool.pop_connection(conn_key)
+        if connection is not None:
+            # hand the pooled connection to httplib2's internal connection
+            # dict so the base class reuses it for this request
+            self.connections[conn_key] = connection
+        
+        # Redirect hack: we want to regulate redirects ourselves, so disable
+        # httplib2's redirect handling for the duration of the base request
+        follow_redirects = self.follow_redirects
+        #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
+        self.follow_redirects = False
+        #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
+        logging.debug('%r' % ((uri, method, headers, max_redirects, connection_type),))
+        (response, content) = httplib2.Http.request(self, uri, method, body, headers, max_redirects, connection_type)
+        #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
+        self.follow_redirects = follow_redirects
+        #print 'follow_redirects: %r %r' % (self.follow_redirects, follow_redirects)
+        
+        
+        # return connection to pool
+        self.connection_pool.push_connection(conn_key, self.connections[conn_key])
+        del self.connections[conn_key]
+                
+        # First write cookies 
+        self.cookiejar.lock.acquire()
+        try:           
+            self.cookiejar.extract_cookies(DummyResponse(response), req)
+        finally:
+            self.cookiejar.lock.release()
+        
+        # Check for possible redirects
+        redirectable_response = ((response.status == 303) or
+                                 (response.status in [300, 301, 302, 307] and method in ["GET", "HEAD"]))
+        if self.follow_redirects and (max_redirects > 0) and redirectable_response:
+            (response, content) = self._follow_redirect(uri, method, body, headers, response, content, max_redirects)
+        return (response, content)
+    
+    # The _follow_redirect function is based on the redirect handling in the
+    # _request function of httplib2. The original function is (C) Joe Gregorio et al, 2006
+    # and licensed under the MIT license. Other contributers include
+    # Thomas Broyer (t.broyer at ltgt.net), James Antill, Xavier Verges Farrero, 
+    # Jonathan Feinberg, Blair Zajac, Sam Ruby and Louis Nyffenegger (httplib2.__contributers__)
+    def _follow_redirect(self, uri, method, body, headers, response, content, max_redirects):
+        """ Internal function to follow a redirect received by L{request} """
+        (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
+        if self.cache:
+            cachekey = defrag_uri
+        else:
+            cachekey = None
+
+        # Pick out the location header and basically start from the beginning
+        # remembering first to strip the ETag header and decrement our 'depth'
+        if not response.has_key('location') and response.status != 300:
+            raise httplib2.RedirectMissingLocation("Redirected but the response is missing a Location: header.", response, content)
+        # Fix-up relative redirects (which violate an RFC 2616 MUST)
+        if response.has_key('location'):
+            location = response['location']
+            (scheme, authority, path, query, fragment) = httplib2.parse_uri(location)
+            if authority == None:
+                response['location'] = httplib2.urlparse.urljoin(uri, location)
+                logging.debug('Relative redirect: changed [%s] to [%s]' % (location, response['location']))
+        if response.status == 301 and method in ["GET", "HEAD"]:
+            response['-x-permanent-redirect-url'] = response['location']
+            if not response.has_key('content-location'):
+                response['content-location'] = absolute_uri 
+            httplib2._updateCache(headers, response, content, self.cache, cachekey)
+        
+        headers.pop('if-none-match', None)
+        headers.pop('if-modified-since', None)
+        
+        if response.has_key('location'):
+            location = response['location']
+            # a 303 response to a non-GET/HEAD request is followed with GET
+            redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
+            return self.request(location, redirect_method, body=body, headers = headers, max_redirects = max_redirects - 1)
+        else:
+            raise httplib2.RedirectLimit("Redirected more times than redirection_limit allows.", response, content)
\ No newline at end of file

Added: trunk/threadedhttp/threadedhttp/threadedhttp.py
===================================================================
--- trunk/threadedhttp/threadedhttp/threadedhttp.py	                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/threadedhttp.py	2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,57 @@
+# -*- coding: utf-8  -*-
+""" Module containing HttpRequest wrapper class and HttpProcessor thread class """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+import threading
+
+from http import Http
+
+class HttpRequest(object):
+    """ Object wrapper for HTTP requests that need to block the requesters thread.
+        Usage:
+        >>> request = HttpRequest('http://www.google.com')
+        >>> queue.put(request)
+        >>> request.lock.acquire()
+        >>> print request.data
+        
+        C{request.lock.acquire()} will block until the data is available.
+    """
+    def __init__(self, *args, **kwargs):
+        # args/kwargs are passed through unchanged to Http.request()
+        self.args = args
+        self.kwargs = kwargs
+        self.data = None
+        # a Semaphore with initial value 0: acquire() blocks until the
+        # processing thread calls release() after filling in self.data
+        self.lock = threading.Semaphore(0)
+
+class HttpProcessor(threading.Thread):
+    """ Thread object to spawn multiple HTTP connection threads """
+    def __init__(self, queue, cookiejar, connection_pool):
+        """ @param queue: The C{Queue.Queue} object that contains L{HttpRequest} objects.
+            @param cookiejar: The C{LockableCookieJar} cookie object to share among requests.
+            @param connection_pool: The C{ConnectionPool} object which contains connections to share among requests.
+        """
+        threading.Thread.__init__(self)
+        self.queue = queue
+        self.http = Http(cookiejar=cookiejar, connection_pool=connection_pool)
+        
+    def run(self):
+        """ Process requests from the queue until a None item is received. """
+        # The Queue item is expected to be either an HttpRequest object
+        # or None (to shut down the thread)
+        logging.debug('Thread started, waiting for requests.')
+        while (True):
+            item = self.queue.get()
+            if item is None:
+                logging.debug('Shutting down thread.')
+                return
+            try:
+                item.data = self.http.request(*item.args, **item.kwargs)
+            finally:
+                # release even on failure so the requester is never blocked forever
+                if item.lock:
+                    item.lock.release()
\ No newline at end of file





More information about the Pywikipedia-l mailing list