Revision: 4793
Author:   valhallasw
Date:     2008-01-02 22:38:21 +0000 (Wed, 02 Jan 2008)
Log Message:
-----------
New: threadable http library, based on httplib2, with automatic (and thread-safe) cookie support. Includes easy_install-based setup.py.
Added Paths:
-----------
    trunk/threadedhttp/
    trunk/threadedhttp/epydoc.cfg
    trunk/threadedhttp/setup.py
    trunk/threadedhttp/threadedhttp/
    trunk/threadedhttp/threadedhttp/__init__.py
    trunk/threadedhttp/threadedhttp/connectionpool.py
    trunk/threadedhttp/threadedhttp/cookiejar.py
    trunk/threadedhttp/threadedhttp/dummy.py
    trunk/threadedhttp/threadedhttp/http.py
    trunk/threadedhttp/threadedhttp/threadedhttp.py
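A minimal usage sketch of the new threaded interface (hypothetical, not part of the commit; the URL is illustrative, and the modules are imported from the package paths added above). It mirrors the usage shown in the HttpRequest docstring further down:

    import Queue
    from threadedhttp import HttpRequest, HttpProcessor
    from threadedhttp.cookiejar import LockableCookieJar
    from threadedhttp.connectionpool import ConnectionPool

    queue = Queue.Queue()
    worker = HttpProcessor(queue, LockableCookieJar(), ConnectionPool())
    worker.start()

    request = HttpRequest('http://www.example.org/')  # wraps Http.request() arguments
    queue.put(request)
    request.lock.acquire()                            # blocks until the worker stores the result
    (response, content) = request.data
    print response.status

    queue.put(None)                                   # sentinel: tells the worker thread to exit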
Added: trunk/threadedhttp/epydoc.cfg
===================================================================
--- trunk/threadedhttp/epydoc.cfg                       (rev 0)
+++ trunk/threadedhttp/epydoc.cfg       2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,84 @@
+[epydoc] # Epydoc section marker (required by ConfigParser)
+
+# modules
+#   The list of objects to document. Objects can be named using
+#   dotted names, module filenames, or package directory names.
+#   Aliases for this option include "objects" and "values".
+modules: threadedhttp
+
+# output
+#   The type of output that should be generated. Should be one
+#   of: html, text, latex, dvi, ps, pdf.
+output: html
+
+# target
+#   The path to the output directory. May be relative or absolute.
+target: doc/
+
+# css
+#   The CSS stylesheet for HTML output. Can be the name of a builtin
+#   stylesheet, or the name of a file.
+css: white
+
+# name
+#   The documented project's name.
+name: Threaded HTTP Library
+
+# url
+#   The documented project's URL.
+url: http://pywikipediabot.sourceforge.net
+
+# frames
+#   Whether or not to include a frames-based table of contents.
+frames: yes
+
+# private
+#   Whether or not to include private variables. (Even if included,
+#   private variables will be hidden by default.)
+private: yes
+
+# imports
+#   Whether or not to list each module's imports.
+imports: yes
+
+# verbosity
+#   An integer indicating how verbose epydoc should be. The default
+#   value is 0; negative values will suppress warnings and errors;
+#   positive values will give more verbose output.
+verbosity: 0
+
+# parse
+#   Whether or not parsing should be used to examine objects.
+parse: yes
+
+# introspect
+#   Whether or not introspection should be used to examine objects.
+introspect: yes
+
+# graph
+#   The list of graph types that should be automatically included
+#   in the output. Graphs are generated using the Graphviz "dot"
+#   executable. Graph types include: "classtree", "callgraph",
+#   "umlclass". Use "all" to include all graph types.
+graph: all
+
+# dotpath
+#   The path to the Graphviz "dot" executable, used to generate
+#   graphs.
+dotpath: /usr/bin/dot
+
+# sourcecode
+#   Whether or not to include syntax highlighted source code in
+#   the output (HTML only).
+sourcecode: no
+
+# pstat
+#   The name of one or more pstat files (generated by the profile
+#   or hotshot module). These are used to generate call graphs.
+#pstat: profile.out
+
+# separate-classes
+#   Whether each class should be listed in its own section when
+#   generating LaTeX or PDF output.
+separate-classes: no
Added: trunk/threadedhttp/setup.py
===================================================================
--- trunk/threadedhttp/setup.py                         (rev 0)
+++ trunk/threadedhttp/setup.py 2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+
+from setuptools import setup, find_packages
+setup(
+    name = "threadedhttp",
+    version = "0.1",
+    packages = find_packages(exclude=['ez_setup']),
+    install_requires = ["httplib2"],
+    platforms = ['any'],
+    author = "Merlijn van Deen",
+    author_email = "valhallasw@arctus.nl",
+    description = "httplib2-based HTTP library supporting cookies and threads",
+    classifiers = filter(None, map(str.strip,
+"""
+Intended Audience :: Developers
+License :: OSI Approved :: MIT License
+Programming Language :: Python
+Topic :: Software Development :: Libraries :: Python Modules
+""".splitlines())),
+    license = "MIT License",
+    keywords = "thread cookie httplib2",
+    url = "http://pywikipediabot.sourceforge.net",
+)
\ No newline at end of file
Property changes on: trunk/threadedhttp/setup.py
___________________________________________________________________
Name: svn:executable
   + *
Added: trunk/threadedhttp/threadedhttp/__init__.py
===================================================================
--- trunk/threadedhttp/threadedhttp/__init__.py                         (rev 0)
+++ trunk/threadedhttp/threadedhttp/__init__.py 2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+
+""" Thread-safe cookie-eating Http library based on httplib2
+    with simple threaded interface
+"""
+
+from http import Http
+from threadedhttp import HttpRequest, HttpProcessor
\ No newline at end of file
Added: trunk/threadedhttp/threadedhttp/connectionpool.py
===================================================================
--- trunk/threadedhttp/threadedhttp/connectionpool.py                   (rev 0)
+++ trunk/threadedhttp/threadedhttp/connectionpool.py   2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+""" Thread-safe connection pools to share connections between threads. """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+import threading
+
+class ConnectionList(list):
+    """ List with BoundedSemaphore """
+    def __init__(self, maxnum, *args, **kwargs):
+        """ @param maxnum: BoundedSemaphore counter initialisation (maximum number of acquires) """
+        list.__init__(self, *args, **kwargs)
+        self.max = threading.BoundedSemaphore(maxnum)
+
+class ConnectionPool(object):
+    def __init__(self, max_connections=25, max_connections_per_host=5):
+        """ @param max_connections: Global maximum number of connections for this pool
+            @param max_connections_per_host: Maximum number of connections per host
+        """
+        self.max_connections_per_host = max_connections_per_host
+        self.global_max = threading.BoundedSemaphore(max_connections)
+        self.lock = threading.Lock()
+        self.connections = [None] * max_connections # fill known connections with Nones
+        self.clists = {} # maps pool identifier -> ConnectionList of available connections
+        logging.log(1, '<%r>: initialized' % self)
+
+    def __del__(self):
+        """ Destructor to close all connections in the pool.
+            Not completely thread-safe, as connections *could* return just
+            after this function. Make sure the pool is destructed only when
+            it is no longer in use!
+        """
+        self.lock.acquire() # prevents any connections from returning into the pool
+        try:
+            for connection in self.connections:
+                if connection is not None:
+                    connection.close()
+            del self.connections
+            del self.clists
+        finally:
+            self.lock.release()
+
+    def pop_connection(self, identifier):
+        """ Gets a connection from the identifier's connection pool
+            @param identifier: The pool identifier
+            @returns: A connection object if found, None otherwise
+        """
+        logging.log(1, '<%r>: acquiring global_max...' % self)
+        self.global_max.acquire()
+        logging.log(1, '<%r>: global_max acquired' % self)
+        try:
+            self.lock.acquire()
+            if identifier not in self.clists:
+                self.clists[identifier] = ConnectionList(self.max_connections_per_host)
+            clist = self.clists[identifier]
+            self.lock.release()
+            logging.log(1, '<%r>: acquiring clist.max...' % self)
+            if not clist.max.acquire(False): # acquire per-host semaphore, releasing global one while waiting
+                logging.log(1, '<%r>: ...failed' % self)
+                self.global_max.release()
+                logging.log(logging.DEBUG, '<%r>: No host connections available, global_max released.' % self)
+                clist.max.acquire()
+                self.global_max.acquire()
+            try:
+                logging.log(1, '<%r>: ...acquired' % self)
+                # we now hold both the global and the per-host semaphore;
+                # first check whether a cached connection exists
+                self.lock.acquire()
+                try:
+                    if len(clist) > 0:
+                        connection = clist.pop()
+                        logging.log(1, '<%r>: using cached connection' % self)
+                        return connection
+                    else:
+                        # pop the oldest connection from the connection stack
+                        old_connection = self.connections.pop(0)
+                        logging.log(1, '<%r>: popped %r to make place for new connection' % (self, old_connection))
+                        if old_connection is not None:
+                            old_connection.close()
+                            for slist in self.clists.itervalues():
+                                if old_connection in slist:
+                                    slist.remove(old_connection)
+                                    break # a connection is in at most one clist
+                        return None
+                finally:
+                    self.lock.release()
+            except Exception, e:
+                logging.log(20, '<%r>: Exception raised level 2 | %r' % (self, e))
+                clist.max.release()
+                raise
+        except Exception, e:
+            logging.log(20, '<%r>: Exception raised level 1 | %r' % (self, e))
+            self.global_max.release()
+            raise
+
+    def push_connection(self, identifier, connection):
+        """ Returns a connection to the identifier's connection pool
+            @param identifier: The pool identifier
+            @param connection: The connection to return to the pool
+        """
+        self.lock.acquire()
+        try:
+            clist = self.clists[identifier]
+            clist.append(connection)
+            if connection not in self.connections:
+                self.connections.append(connection)
+            clist.max.release()
+            self.global_max.release()
+            logging.log(1, 'clist.max and global_max += 1')
+        finally:
+            self.lock.release()
+
+class BasicConnectionPool(object):
+    """ A thread-safe connection pool """
+    def __init__(self, maxnum=5):
+        """ @param maxnum: Maximum number of connections per identifier.
+            The pool drops excessive connections added.
+        """
+        self.connections = {}
+        self.lock = threading.Lock()
+        self.maxnum = maxnum
+
+    def __del__(self):
+        """ Destructor to close all connections in the pool """
+        self.lock.acquire()
+        try:
+            for clist in self.connections.itervalues():
+                for connection in clist:
+                    connection.close()
+        finally:
+            self.lock.release()
+
+    def __repr__(self):
+        return self.connections.__repr__()
+
+    def pop_connection(self, identifier):
+        """ Gets a connection from the identifier's connection pool
+            @param identifier: The pool identifier
+            @returns: A connection object if found, None otherwise
+        """
+        self.lock.acquire()
+        try:
+            if identifier in self.connections:
+                if len(self.connections[identifier]) > 0:
+                    return self.connections[identifier].pop()
+            return None
+        finally:
+            self.lock.release()
+
+    def push_connection(self, identifier, connection):
+        """ Adds a connection to the identifier's connection pool
+            @param identifier: The pool identifier
+            @param connection: The connection to add to the pool
+        """
+        self.lock.acquire()
+        try:
+            if identifier not in self.connections:
+                self.connections[identifier] = []
+            if len(self.connections[identifier]) == self.maxnum:
+                logging.debug('closing %s connection %r' % (identifier, connection))
+                connection.close()
+                del connection
+            else:
+                self.connections[identifier].append(connection)
+        finally:
+            self.lock.release()
\ No newline at end of file
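A hedged sketch of the pop/push discipline both pools expect from callers (in http.py the identifier is the "scheme:authority" string; `open_connection` below is a hypothetical factory standing in for httplib2's connection setup, not part of this commit):

    from threadedhttp.connectionpool import BasicConnectionPool

    pool = BasicConnectionPool(maxnum=2)
    conn_key = 'http:www.example.org'

    connection = pool.pop_connection(conn_key)   # None when the pool has no spare connection
    if connection is None:
        connection = open_connection(conn_key)   # hypothetical: create a fresh connection
    try:
        pass  # ... use the connection ...
    finally:
        pool.push_connection(conn_key, connection)  # return (or donate) it to the pool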
Added: trunk/threadedhttp/threadedhttp/cookiejar.py
===================================================================
--- trunk/threadedhttp/threadedhttp/cookiejar.py                        (rev 0)
+++ trunk/threadedhttp/threadedhttp/cookiejar.py        2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+""" Lockable CookieJar and FileCookieJar """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+import threading
+import cookielib
+
+class LockableCookieJar(cookielib.CookieJar):
+    """ CookieJar with integrated Lock object """
+    def __init__(self, *args, **kwargs):
+        cookielib.CookieJar.__init__(self, *args, **kwargs)
+        self.lock = threading.Lock()
+
+class LockableFileCookieJar(cookielib.FileCookieJar):
+    """ FileCookieJar with integrated Lock object """
+    def __init__(self, *args, **kwargs):
+        cookielib.FileCookieJar.__init__(self, *args, **kwargs)
+        self.lock = threading.Lock()
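A short sketch of the intended lock discipline (this mirrors what http.py does below: every read or write of the jar happens between acquire() and release()):

    from threadedhttp.cookiejar import LockableCookieJar

    jar = LockableCookieJar()
    jar.lock.acquire()
    try:
        stored = list(jar)  # e.g. snapshot the cookies currently held
    finally:
        jar.lock.release()
    print '%d cookie(s) stored' % len(stored)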
Added: trunk/threadedhttp/threadedhttp/dummy.py
===================================================================
--- trunk/threadedhttp/threadedhttp/dummy.py                    (rev 0)
+++ trunk/threadedhttp/threadedhttp/dummy.py    2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+""" Dummy classes for CookieJar <-> httplib2 communication """
+
+#
+# (C) Metaweb Technologies, Inc., 2007
+#
+# Distributed under the terms of the license included below,
+# which is compatible with use in a MIT licensed project.
+#
+
+__version__ = '$Id$'
+
+# ========================================================================
+# Copyright (c) 2007, Metaweb Technologies, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+#       copyright notice, this list of conditions and the following
+#       disclaimer in the documentation and/or other materials provided
+#       with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY METAWEB TECHNOLOGIES AND CONTRIBUTORS
+# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL METAWEB
+# TECHNOLOGIES OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# ========================================================================
+
+import logging
+import urllib
+import re
+import cookielib
+
+class DummyRequest(object):
+    """ Simulated urllib2.Request object for httplib2;
+        implements only what's necessary for cookielib.CookieJar to work.
+    """
+    def __init__(self, url, headers=None):
+        self.url = url
+        self.headers = headers
+        self.origin_req_host = cookielib.request_host(self)
+        self.type, r = urllib.splittype(url)
+        self.host, r = urllib.splithost(r)
+        if self.host:
+            self.host = urllib.unquote(self.host)
+
+    def get_full_url(self):
+        return self.url
+
+    def get_origin_req_host(self):
+        # TODO: to match urllib2 this should be different for redirects
+        return self.origin_req_host
+
+    def get_type(self):
+        return self.type
+
+    def get_host(self):
+        return self.host
+
+    def get_header(self, key, default=None):
+        return self.headers.get(key.lower(), default)
+
+    def has_header(self, key):
+        return key in self.headers
+
+    def add_unredirected_header(self, key, val):
+        # TODO: this header should not be sent on redirect
+        self.headers[key.lower()] = val
+
+    def is_unverifiable(self):
+        # TODO: to match urllib2, this should be set to True when the
+        # request is the result of a redirect
+        return False
+
+class DummyResponse(object):
+    """ Simulated urllib2 response object (as returned by urlopen) for httplib2;
+        implements only what's necessary for cookielib.CookieJar to work.
+    """
+    def __init__(self, response):
+        self.response = response
+
+    def info(self):
+        return DummyMessage(self.response)
+
+class DummyMessage(object):
+    """ Simulated mimetools.Message object for httplib2;
+        implements only what's necessary for cookielib.CookieJar to work.
+    """
+    def __init__(self, response):
+        self.response = response
+
+    def getheaders(self, k):
+        k = k.lower()
+        if k not in self.response:
+            return []
+        # httplib2 joins multiple values for the same header using ','.
+        # But the netscape cookie format uses ',' as part of the expires=
+        # date format, so we have to split carefully here:
+        # header.split(',') won't do it.
+        HEADERVAL = re.compile(r'\s*(([^,]|(,\s*\d))+)')
+        return [h[0] for h in HEADERVAL.findall(self.response[k])]
\ No newline at end of file
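A sketch of how these shims are used (compare http.py below): cookielib believes it is talking to urllib2, while the response is really httplib2's dict-like object of lower-cased header names. A plain dict stands in for it here, and the cookie value is made up:

    import cookielib
    from threadedhttp.dummy import DummyRequest, DummyResponse

    jar = cookielib.CookieJar()
    req = DummyRequest('http://www.example.org/', {})

    response = {'set-cookie': 'session=abc123; Path=/'}  # stand-in for an httplib2 response
    jar.extract_cookies(DummyResponse(response), req)    # store the cookie in the jar

    jar.add_cookie_header(req)   # writes the 'cookie' header back into req.headers
    print req.headers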
Added: trunk/threadedhttp/threadedhttp/http.py
===================================================================
--- trunk/threadedhttp/threadedhttp/http.py                     (rev 0)
+++ trunk/threadedhttp/threadedhttp/http.py     2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+""" Thread-safe cookie-eating Http library based on httplib2 """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Indicated parts (C) Joe Gregorio et al, 2006
+# Distributed under the terms of the MIT license
+#
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+
+# easy_install safeguarded dependencies
+import pkg_resources
+pkg_resources.require("httplib2")
+import httplib2
+
+# local package imports
+from cookiejar import LockableCookieJar
+from connectionpool import ConnectionPool
+from dummy import DummyRequest, DummyResponse
+
+class Http(httplib2.Http):
+    """ Subclass of httplib2.Http that uses a L{LockableCookieJar} to store cookies.
+        Overrides httplib2's internal redirect support to prevent cookies
+        being eaten by the wrong sites.
+    """
+    def __init__(self, *args, **kwargs):
+        """ @param cookiejar: (optional) CookieJar to use. A new one will be used when not supplied.
+            @param connection_pool: (optional) Connection pool to use. A new one will be used when not supplied.
+            @param max_redirects: (optional) The maximum number of redirects to follow. 5 is default.
+        """
+        self.cookiejar = kwargs.pop('cookiejar', LockableCookieJar())
+        self.connection_pool = kwargs.pop('connection_pool', ConnectionPool())
+        self.max_redirects = kwargs.pop('max_redirects', 5)
+        httplib2.Http.__init__(self, *args, **kwargs)
+
+    def request(self, uri, method="GET", body=None, headers=None, max_redirects=None, connection_type=None):
+        """ Starts an HTTP request.
+            @param uri: The uri to retrieve
+            @param method: (optional) The HTTP method to use. Default is 'GET'
+            @param body: (optional) The request body. Default is no body.
+            @param headers: (optional) Additional headers to send. Defaults include
+                            C{connection: keep-alive}, C{user-agent} and C{content-type}.
+            @param max_redirects: (optional) The maximum number of redirects to use for this request.
+                                  The class instance's max_redirects is the default.
+            @param connection_type: (optional) The connection class to use; passed through to httplib2.
+            @returns: (response, content) tuple
+        """
+        if max_redirects is None:
+            max_redirects = self.max_redirects
+        if headers is None:
+            headers = {}
+        # Prepare headers
+        headers.pop('cookie', None)
+        req = DummyRequest(uri, headers)
+        self.cookiejar.lock.acquire()
+        try:
+            self.cookiejar.add_cookie_header(req)
+        finally:
+            self.cookiejar.lock.release()
+        headers = req.headers
+
+        # Wikimedia squids: add connection: keep-alive to request headers unless overridden
+        headers['connection'] = headers.pop('connection', 'keep-alive')
+
+        # determine connection pool key and fetch connection
+        (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
+        conn_key = scheme + ":" + authority
+
+        connection = self.connection_pool.pop_connection(conn_key)
+        if connection is not None:
+            self.connections[conn_key] = connection
+
+        # Redirect hack: we want to regulate redirects ourselves, so disable
+        # httplib2's redirect handling for the duration of this request
+        follow_redirects = self.follow_redirects
+        self.follow_redirects = False
+        logging.debug('%r' % ((uri, method, headers, max_redirects, connection_type),))
+        (response, content) = httplib2.Http.request(self, uri, method, body, headers, max_redirects, connection_type)
+        self.follow_redirects = follow_redirects
+
+        # return connection to pool
+        self.connection_pool.push_connection(conn_key, self.connections[conn_key])
+        del self.connections[conn_key]
+
+        # First write cookies
+        self.cookiejar.lock.acquire()
+        try:
+            self.cookiejar.extract_cookies(DummyResponse(response), req)
+        finally:
+            self.cookiejar.lock.release()
+
+        # Check for possible redirects
+        redirectable_response = ((response.status == 303) or
+                                 (response.status in [300, 301, 302, 307] and method in ["GET", "HEAD"]))
+        if self.follow_redirects and (max_redirects > 0) and redirectable_response:
+            (response, content) = self._follow_redirect(uri, method, body, headers, response, content, max_redirects)
+
+        return (response, content)
+
+    # The _follow_redirect function is based on the redirect handling in the
+    # _request function of httplib2. The original function is (C) Joe Gregorio et al, 2006
+    # and licensed under the MIT license. Other contributors include
+    # Thomas Broyer (t.broyer@ltgt.net), James Antill, Xavier Verges Farrero,
+    # Jonathan Feinberg, Blair Zajac, Sam Ruby and Louis Nyffenegger (httplib2.__contributors__)
+    def _follow_redirect(self, uri, method, body, headers, response, content, max_redirects):
+        """ Internal function to follow a redirect received by L{request} """
+        (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
+        if self.cache:
+            cachekey = defrag_uri
+        else:
+            cachekey = None
+
+        # Pick out the location header and basically start from the beginning,
+        # remembering first to strip the ETag header and decrement our 'depth'
+        if not response.has_key('location') and response.status != 300:
+            raise httplib2.RedirectMissingLocation("Redirected but the response is missing a Location: header.", response, content)
+        # Fix-up relative redirects (which violate an RFC 2616 MUST)
+        if response.has_key('location'):
+            location = response['location']
+            (scheme, authority, path, query, fragment) = httplib2.parse_uri(location)
+            if authority is None:
+                response['location'] = httplib2.urlparse.urljoin(uri, location)
+                logging.debug('Relative redirect: changed [%s] to [%s]' % (location, response['location']))
+        if response.status == 301 and method in ["GET", "HEAD"]:
+            response['-x-permanent-redirect-url'] = response['location']
+            if not response.has_key('content-location'):
+                response['content-location'] = absolute_uri
+            httplib2._updateCache(headers, response, content, self.cache, cachekey)
+
+        headers.pop('if-none-match', None)
+        headers.pop('if-modified-since', None)
+
+        if response.has_key('location'):
+            location = response['location']
+            redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
+            return self.request(location, redirect_method, body=body, headers=headers, max_redirects=max_redirects - 1)
+        else:
+            raise httplib2.RedirectLimit("Redirected more times than redirection_limit allows.", response, content)
\ No newline at end of file
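The non-threaded entry point, for comparison, is the Http class itself. A minimal sketch (the URL is illustrative): cookies extracted from the first response are sent automatically with the second request, and the underlying connection is reused through the pool:

    from threadedhttp import Http

    client = Http()
    (response, content) = client.request('http://www.example.org/')
    print response.status, len(content)
    (response, content) = client.request('http://www.example.org/')  # cookies and pooled connection reused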
Added: trunk/threadedhttp/threadedhttp/threadedhttp.py
===================================================================
--- trunk/threadedhttp/threadedhttp/threadedhttp.py                     (rev 0)
+++ trunk/threadedhttp/threadedhttp/threadedhttp.py     2008-01-02 22:38:21 UTC (rev 4793)
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+""" Module containing HttpRequest wrapper class and HttpProcessor thread class """
+
+#
+# (C) Merlijn van Deen, 2007
+#
+# Distributed under the terms of the MIT license
+#
+__version__ = '$Id$'
+__docformat__ = 'epytext'
+
+import logging
+import threading
+
+from http import Http
+
+class HttpRequest(object):
+    """ Object wrapper for HTTP requests that need to block the requester's thread.
+        Usage:
+        >>> request = HttpRequest('http://www.google.com')
+        >>> queue.put(request)
+        >>> request.lock.acquire()
+        >>> print request.data
+
+        C{request.lock.acquire()} will block until the data is available.
+    """
+    def __init__(self, *args, **kwargs):
+        self.args = args
+        self.kwargs = kwargs
+        self.data = None
+        self.lock = threading.Semaphore(0)
+
+class HttpProcessor(threading.Thread):
+    """ Thread object to spawn multiple HTTP connection threads """
+    def __init__(self, queue, cookiejar, connection_pool):
+        """ @param queue: The C{Queue.Queue} object that contains L{HttpRequest} objects.
+            @param cookiejar: The C{LockableCookieJar} cookie object to share among requests.
+            @param connection_pool: The C{ConnectionPool} object which contains connections to share among requests.
+        """
+        threading.Thread.__init__(self)
+        self.queue = queue
+        self.http = Http(cookiejar=cookiejar, connection_pool=connection_pool)
+
+    def run(self):
+        # The Queue item is expected to be either an HttpRequest object
+        # or None (to shut down the thread)
+        logging.debug('Thread started, waiting for requests.')
+        while True:
+            item = self.queue.get()
+            if item is None:
+                logging.debug('Shutting down thread.')
+                return
+            try:
+                item.data = self.http.request(*item.args, **item.kwargs)
+            finally:
+                if item.lock:
+                    item.lock.release()
\ No newline at end of file
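Scaling up the single-worker sketch shown earlier: several HttpProcessor workers can share one jar and one pool. Note that shutdown requires one None sentinel per thread, since each worker exits after consuming one. Hypothetical sketch, with illustrative URLs:

    import Queue
    from threadedhttp import HttpRequest, HttpProcessor
    from threadedhttp.cookiejar import LockableCookieJar
    from threadedhttp.connectionpool import ConnectionPool

    queue = Queue.Queue()
    jar, pool = LockableCookieJar(), ConnectionPool()
    threads = [HttpProcessor(queue, jar, pool) for i in range(4)]
    for thread in threads:
        thread.start()

    requests = [HttpRequest('http://www.example.org/page%d' % i) for i in range(10)]
    for request in requests:
        queue.put(request)
    for request in requests:
        request.lock.acquire()              # wait for this request to complete
        (response, content) = request.data

    for thread in threads:
        queue.put(None)                     # one sentinel per worker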