Revision: 5230 Author: russblau Date: 2008-04-17 17:54:39 +0000 (Thu, 17 Apr 2008)
Log Message: ----------- Moved comm protocol classes to a separate directory, simplified the throttle, and made other behind-the-scenes changes
Modified Paths: -------------- branches/rewrite/pywikibot/__init__.py branches/rewrite/pywikibot/config.py branches/rewrite/pywikibot/data/api.py branches/rewrite/pywikibot/login.py branches/rewrite/pywikibot/page.py branches/rewrite/pywikibot/site.py branches/rewrite/pywikibot/throttle.py
Added Paths: ----------- branches/rewrite/pywikibot/comms/ branches/rewrite/pywikibot/comms/__init__.py branches/rewrite/pywikibot/comms/http.py branches/rewrite/pywikibot/comms/threadedhttp.py
Removed Paths: ------------- branches/rewrite/pywikibot/data/http.py branches/rewrite/pywikibot/data/threadedhttp.py
Modified: branches/rewrite/pywikibot/__init__.py =================================================================== --- branches/rewrite/pywikibot/__init__.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/__init__.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -9,11 +9,11 @@ # __version__ = '$Id: $'
+import logging +logging.getLogger().setLevel(logging.DEBUG)
from exceptions import *
-from page import Page, ImagePage, Category - import config
_sites = {} @@ -60,6 +60,20 @@
getSite = Site # alias for backwards-compability
+from page import Page, ImagePage, Category + +##def Page(*args, **kwargs): +## from page import Page as _Page +## return _Page(*args, **kwargs) +## +##def ImagePage(*args, **kwargs): +## from page import ImagePage as _ImagePage +## return _ImagePage(*args, **kwargs) +## +##def Category(*args, **kwargs): +## from page import Category as _Category +## return _Category(*args, **kwargs) + # DEBUG
def output(text): @@ -71,9 +85,6 @@ return getpass.getpass(prompt) return raw_input(prompt)
-import logging -logging.getLogger().setLevel(logging.DEBUG) - def stopme(): """Drop this process from the throttle log.
@@ -82,7 +93,11 @@
""" # only need one drop() call because all throttles use the same global pid - Site().get_throttle.drop() + try: + _sites[_sites.keys()[0]].throttle.drop() + logging.info("Dropped throttle(s).") + except IndexError: + pass
import atexit atexit.register(stopme)
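For context, the revised stopme() no longer constructs a Site just to reach a throttle; it looks in the module-level _sites cache and, if any Site exists, drops that site's throttle entry from the shared process log. The wiring, reduced to its essentials (Python 2, as in the codebase; _sites here is a local stand-in and no real Site is created):

    import atexit
    import logging

    _sites = {}   # stand-in for pywikibot's module-level Site cache

    def stopme():
        """Drop this process from the throttle log (all throttles share one pid)."""
        try:
            # any cached Site will do, because every Throttle writes to the same file
            _sites[_sites.keys()[0]].throttle.drop()
            logging.info("Dropped throttle(s).")
        except IndexError:
            pass   # no Site was ever instantiated, so there is nothing to drop

    atexit.register(stopme)   # runs once, automatically, at interpreter exit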
Added: branches/rewrite/pywikibot/comms/__init__.py ===================================================================
Added: branches/rewrite/pywikibot/comms/http.py =================================================================== --- branches/rewrite/pywikibot/comms/http.py (rev 0) +++ branches/rewrite/pywikibot/comms/http.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +""" +Basic HTTP access interface. + +This module handles communication between the bot and the HTTP threads. + +This module is responsible for + - Setting up a connection pool + - Providing a (blocking) interface for HTTP requests + - Translate site objects with query strings into urls + - Urlencoding all data + - Basic HTTP error handling +""" + +# +# (C) Pywikipedia bot team, 2007 +# +# Distributed under the terms of the MIT license. +# + +__version__ = '$Id: $' +__docformat__ = 'epytext' + +import Queue +import urllib +import urlparse +import logging +import atexit + +import config +import cookielib +import threadedhttp + + +# global variables + +useragent = 'Pywikipediabot/2.0' # This should include some global version string +numthreads = 1 +threads = [] + +connection_pool = threadedhttp.ConnectionPool() +http_queue = Queue.Queue() + +cookie_jar = threadedhttp.LockableCookieJar( + config.datafilepath("%s-%s-%s.lwp" + % (config.family, + config.mylang, + config.usernames[config.family][config.mylang]))) +try: + cookie_jar.load() +except (IOError, cookielib.LoadError): + logging.debug("Loading cookies failed.") +else: + logging.debug("Loaded cookies from file.") + + +# Build up HttpProcessors +logging.info('Starting %i threads...' % numthreads) +for i in range(numthreads): + proc = threadedhttp.HttpProcessor(http_queue, cookie_jar, connection_pool) + proc.setDaemon(True) + threads.append(proc) + proc.start() + +# Prepare flush on quit +def _flush(): + for i in threads: + http_queue.put(None) + logging.info('Waiting for threads to finish... ') + for i in threads: + i.join() + logging.debug('All threads finished.') +atexit.register(_flush) + +# export cookie_jar to global namespace +import pywikibot +pywikibot.cookie_jar = cookie_jar + +def request(site, uri, *args, **kwargs): + """Queue a request to be submitted to Site. + + All parameters not listed below are the same as + L{httplib2.Http.request}, but the uri is relative + + @param site: The Site to connect to + @return: The received data (a unicode string). + """ + baseuri = "%s://%s/" % (site.protocol(), site.hostname()) + uri = urlparse.urljoin(baseuri, uri) + + request = threadedhttp.HttpRequest(uri, *args, **kwargs) + http_queue.put(request) + request.lock.acquire() + + #do some error correcting stuff + + #if all else fails + if isinstance(request.data, Exception): + raise request.data + + if request.data[0].status != 200: + logging.warning("Http response status %s" % request.data[0].status) + + return request.data[1]
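request() above implements a blocking hand-off: the caller wraps its arguments in an HttpRequest whose lock is a Semaphore(0), puts it on the shared queue, and then acquire() blocks until an HttpProcessor thread fills in request.data and releases the semaphore. A self-contained sketch of that rendezvous, with a fake worker in place of the real HTTP machinery (Python 2):

    import threading
    import Queue

    class Job(object):
        """Minimal analogue of threadedhttp.HttpRequest."""
        def __init__(self, uri):
            self.uri = uri
            self.data = None
            self.lock = threading.Semaphore(0)   # starts at zero: acquire() will block

    work_queue = Queue.Queue()

    def worker():
        """Minimal analogue of HttpProcessor.run(); None is the shutdown sentinel."""
        while True:
            job = work_queue.get()
            if job is None:
                return
            try:
                job.data = ("200 OK", u"<pretend page content>")   # fake fetch
            finally:
                job.lock.release()                # wake up the waiting caller

    proc = threading.Thread(target=worker)
    proc.setDaemon(True)
    proc.start()

    job = Job("http://en.wikipedia.org/w/index.php")
    work_queue.put(job)
    job.lock.acquire()        # blocks here until the worker has stored job.data
    print job.data[1]

    work_queue.put(None)      # tell the worker to exit, as _flush() does at interpreter exit
    proc.join()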
Added: branches/rewrite/pywikibot/comms/threadedhttp.py =================================================================== --- branches/rewrite/pywikibot/comms/threadedhttp.py (rev 0) +++ branches/rewrite/pywikibot/comms/threadedhttp.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- +""" Httplib2 threaded cookie layer + +This class extends httplib2, adding support for: + - Cookies, guarded for cross-site redirects + - Thread safe ConnectionPool and LockableCookieJar classes + - HttpProcessor thread class + - HttpRequest object + +""" + +# (C) 2007 Pywikipedia bot team, 2007 +# (C) 2006 Httplib 2 team, 2006 +# (C) 2007 Metaweb Technologies, Inc. +# +# Partially distributed under the MIT license +# Partially distributed under Metaweb Technologies, Incs license +# which is compatible with the MIT license + +__version__ = '$Id: threadedhttp.py 4984 2008-02-06 16:21:18Z russblau $' +__docformat__ = 'epytext' + +# standard python libraries +import re +import threading +import time +import logging + +import urllib +import cookielib + +# easy_install safeguarded dependencies +import pkg_resources +pkg_resources.require("httplib2") +import httplib2 + + +class ConnectionPool(object): + """A thread-safe connection pool.""" + + def __init__(self, maxnum=5): + """ + @param maxnum: Maximum number of connections per identifier. + The pool drops excessive connections added. + + """ + logging.debug("Creating connection pool.") + self.connections = {} + self.lock = threading.Lock() + self.maxnum = maxnum + + def __del__(self): + """Destructor to close all connections in the pool.""" + self.lock.acquire() + try: + logging.debug("Closing connection pool (%s connections)" + % len(self.connections)) + for key in self.connections: + for connection in self.connections[key]: + connection.close() + finally: + self.lock.release() + + def __repr__(self): + return self.connections.__repr__() + + def pop_connection(self, identifier): + """Get a connection from identifier's connection pool. + + @param identifier: The pool identifier + @return: A connection object if found, None otherwise + + """ + self.lock.acquire() + try: + if identifier in self.connections: + if len(self.connections[identifier]) > 0: + logging.debug("Retrieved connection from '%s' pool." + % identifier) + return self.connections[identifier].pop() + return None + finally: + self.lock.release() + + def push_connection(self, identifier, connection): + """Add a connection to identifier's connection pool. + + @param identifier: The pool identifier + @param connection: The connection to add to the pool + + """ + self.lock.acquire() + try: + if identifier not in self.connections: + self.connections[identifier] = [] + + if len(self.connections[identifier]) == self.maxnum: + logging.debug('closing %s connection %r' + % (identifier, connection)) + connection.close() + del connection + else: + self.connections[identifier].append(connection) + finally: + self.lock.release() + + +class LockableCookieJar(cookielib.LWPCookieJar): + """CookieJar with integrated Lock object.""" + def __init__(self, *args, **kwargs): + cookielib.LWPCookieJar.__init__(self, *args, **kwargs) + self.lock = threading.Lock() + + +class Http(httplib2.Http): + """Subclass of httplib2.Http that stores cookies. + + Overrides httplib2's internal redirect support to prevent cookies being + eaten by the wrong sites. + + """ + def __init__(self, *args, **kwargs): + """ + @param cookiejar: (optional) CookieJar to use. A new one will be + used when not supplied. 
+ @param connection_pool: (optional) Connection pool to use. A new one + will be used when not supplied. + @param max_redirects: (optional) The maximum number of redirects to + follow. 5 is default. + + """ + try: + self.cookiejar = kwargs.pop('cookiejar') + except KeyError: + self.cookiejar = LockableCookieJar() + try: + self.connection_pool = kwargs.pop('connection_pool') + except KeyError: + self.connection_pool = ConnectionPool() + self.max_redirects = kwargs.pop('max_redirects', 5) + httplib2.Http.__init__(self, *args, **kwargs) + + def request(self, uri, method="GET", body=None, headers=None, + max_redirects=None, connection_type=None): + """Start an HTTP request. + + @param uri: The uri to retrieve + @param method: (optional) The HTTP method to use. Default is 'GET' + @param body: (optional) The request body. Default is no body. + @param headers: (optional) Additional headers to send. Defaults + include C{connection: keep-alive}, C{user-agent} and + C{content-type}. + @param max_redirects: (optional) The maximum number of redirects to + use for this request. The class instance's max_redirects is + default + @param connection_type: (optional) see L{httplib2.Http.request} + + @return: (response, content) tuple + + """ + if max_redirects is None: + max_redirects = self.max_redirects + if headers is None: + headers = {} + # Prepare headers + headers.pop('cookie', None) + req = DummyRequest(uri, headers) + self.cookiejar.lock.acquire() + try: + self.cookiejar.add_cookie_header(req) + finally: + self.cookiejar.lock.release() + headers = req.headers + + # Wikimedia squids: add connection: keep-alive to request headers + # unless overridden + headers['connection'] = headers.pop('connection', 'keep-alive') + + # determine connection pool key and fetch connection + (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm( + httplib2.iri2uri(uri)) + conn_key = scheme+":"+authority + + connection = self.connection_pool.pop_connection(conn_key) + if connection is not None: + self.connections[conn_key] = connection + + # Redirect hack: we want to regulate redirects + follow_redirects = self.follow_redirects + self.follow_redirects = False + logging.debug('%r' % ( + (uri, method, body, headers, max_redirects, connection_type),)) + try: + (response, content) = httplib2.Http.request( + self, uri, method, body, headers, + max_redirects, connection_type) + except Exception, e: # what types? 
+ # return exception instance to be retrieved by the calling thread + return e + self.follow_redirects = follow_redirects + + # return connection to pool + self.connection_pool.push_connection(conn_key, + self.connections[conn_key]) + del self.connections[conn_key] + + # First write cookies + self.cookiejar.lock.acquire() + try: + self.cookiejar.extract_cookies(DummyResponse(response), req) + finally: + self.cookiejar.lock.release() + + # Check for possible redirects + redirectable_response = ((response.status == 303) or + (response.status in [300, 301, 302, 307] and + method in ["GET", "HEAD"])) + if self.follow_redirects and (max_redirects > 0) \ + and redirectable_response: + (response, content) = self._follow_redirect( + uri, method, body, headers, response, content, max_redirects) + + return (response, content) + + def _follow_redirect(self, uri, method, body, headers, response, + content, max_redirects): + """Internal function to follow a redirect recieved by L{request}""" + (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm( + httplib2.iri2uri(uri)) + if self.cache: + cachekey = defrag_uri + else: + cachekey = None + + # Pick out the location header and basically start from the beginning + # remembering first to strip the ETag header and decrement our 'depth' + if not response.has_key('location') and response.status != 300: + raise httplib2.RedirectMissingLocation( + "Redirected but the response is missing a Location: header.", + response, content) + # Fix-up relative redirects (which violate an RFC 2616 MUST) + if response.has_key('location'): + location = response['location'] + (scheme, authority, path, query, fragment) = httplib2.parse_uri( + location) + if authority == None: + response['location'] = httplib2.urlparse.urljoin(uri, location) + logging.debug('Relative redirect: changed [%s] to [%s]' + % (location, response['location'])) + if response.status == 301 and method in ["GET", "HEAD"]: + response['-x-permanent-redirect-url'] = response['location'] + if not response.has_key('content-location'): + response['content-location'] = absolute_uri + httplib2._updateCache(headers, response, content, self.cache, + cachekey) + + headers.pop('if-none-match', None) + headers.pop('if-modified-since', None) + + if response.has_key('location'): + location = response['location'] + redirect_method = ((response.status == 303) and + (method not in ["GET", "HEAD"]) + ) and "GET" or method + return self.request(location, redirect_method, body=body, + headers=headers, + max_redirects=max_redirects - 1) + else: + raise RedirectLimit( + "Redirected more times than redirection_limit allows.", + response, content) + + +class HttpRequest(object): + """Object wrapper for HTTP requests that need to block origin thread. + + Usage: + + >>> request = HttpRequest('http://www.google.com') + >>> queue.put(request) + >>> request.lock.acquire() + >>> print request.data + + C{request.lock.acquire()} will block until the data is available. + + """ + def __init__(self, *args, **kwargs): + """See C{Http.request} for parameters.""" + self.args = args + self.kwargs = kwargs + self.data = None + self.lock = threading.Semaphore(0) + + +class HttpProcessor(threading.Thread): + """Thread object to spawn multiple HTTP connection threads.""" + def __init__(self, queue, cookiejar, connection_pool): + """ + @param queue: The C{Queue.Queue} object that contains L{HttpRequest} + objects. + @param cookiejar: The C{LockableCookieJar} cookie object to share among + requests. 
+ @param connection_pool: The C{ConnectionPool} object which contains + connections to share among requests. + + """ + threading.Thread.__init__(self) + self.queue = queue + self.http = Http(cookiejar=cookiejar, connection_pool=connection_pool) + + def run(self): + # The Queue item is expected to either an HttpRequest object + # or None (to shut down the thread) + logging.debug('Thread started, waiting for requests.') + while (True): + item = self.queue.get() + if item is None: + logging.debug('Shutting down thread.') + return + try: + item.data = self.http.request(*item.args, **item.kwargs) + finally: + if item.lock: + item.lock.release() + + +# Metaweb Technologies, Inc. License: + # ======================================================================== + # The following dummy classes are: + # ======================================================================== + # Copyright (c) 2007, Metaweb Technologies, Inc. + # All rights reserved. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions + # are met: + # * Redistributions of source code must retain the above copyright + # notice, this list of conditions and the following disclaimer. + # * Redistributions in binary form must reproduce the above + # copyright notice, this list of conditions and the following + # disclaimer in the documentation and/or other materials provided + # with the distribution. + # + # THIS SOFTWARE IS PROVIDED BY METAWEB TECHNOLOGIES AND CONTRIBUTORS + # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL METAWEB + # TECHNOLOGIES OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + # POSSIBILITY OF SUCH DAMAGE. 
+ # ======================================================================== + +class DummyRequest(object): + """Simulated urllib2.Request object for httplib2 + implements only what's necessary for cookielib.CookieJar to work + """ + def __init__(self, url, headers=None): + self.url = url + self.headers = headers + self.origin_req_host = cookielib.request_host(self) + self.type, r = urllib.splittype(url) + self.host, r = urllib.splithost(r) + if self.host: + self.host = urllib.unquote(self.host) + + def get_full_url(self): + return self.url + + def get_origin_req_host(self): + # TODO to match urllib2 this should be different for redirects + return self.origin_req_host + + def get_type(self): + return self.type + + def get_host(self): + return self.host + + def get_header(self, key, default=None): + return self.headers.get(key.lower(), default) + + def has_header(self, key): + return key in self.headers + + def add_unredirected_header(self, key, val): + # TODO this header should not be sent on redirect + self.headers[key.lower()] = val + + def is_unverifiable(self): + # TODO to match urllib2, this should be set to True when the + # request is the result of a redirect + return False + +class DummyResponse(object): + """Simulated urllib2.Request object for httplib2 + implements only what's necessary for cookielib.CookieJar to work + """ + def __init__(self, response): + self.response = response + + def info(self): + return DummyMessage(self.response) + +class DummyMessage(object): + """Simulated mimetools.Message object for httplib2 + implements only what's necessary for cookielib.CookieJar to work + """ + def __init__(self, response): + self.response = response + + def getheaders(self, k): + k = k.lower() + v = self.response.get(k.lower(), None) + if k not in self.response: + return [] + #return self.response[k].split(re.compile(',\s*')) + + # httplib2 joins multiple values for the same header + # using ','. but the netscape cookie format uses ',' + # as part of the expires= date format. so we have + # to split carefully here - header.split(',') won't do it. + HEADERVAL= re.compile(r'\s*(([^,]|(,\s*\d))+)') + return [h[0] for h in HEADERVAL.findall(self.response[k])]
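ConnectionPool keys idle connections by "scheme:authority" and silently closes any connection pushed once a pool already holds maxnum entries. How Http.request() is meant to drive it, in outline; this assumes the module is importable as pywikibot.comms.threadedhttp and that httplib2 is installed (the module requires it at import time), and the connection object here is a placeholder rather than a live socket:

    from pywikibot.comms import threadedhttp

    class FakeConnection(object):
        """Placeholder providing the only method the pool needs."""
        def close(self):
            pass

    pool = threadedhttp.ConnectionPool(maxnum=2)
    key = "http:en.wikipedia.org"          # scheme + ":" + authority, as in Http.request()

    conn = pool.pop_connection(key)        # nothing pooled yet, so this is None
    if conn is None:
        conn = FakeConnection()            # the caller would open a real connection here

    # ... perform the request on conn ...

    pool.push_connection(key, conn)           # hand it back for reuse
    assert pool.pop_connection(key) is conn   # and it comes back on the next request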
Modified: branches/rewrite/pywikibot/config.py =================================================================== --- branches/rewrite/pywikibot/config.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/config.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -282,16 +282,16 @@ # 'minthrottle' seconds. This can be lengthened if the server is slow, # but never more than 'maxthrottle' seconds. However - if you are running # more than one bot in parallel the times are lengthened. -minthrottle = 1 +# By default, the get_throttle is turned off, and 'maxlag' is used to +# control the rate of server access. Set minthrottle to non-zero to use a +# throttle on read access. +minthrottle = 0 maxthrottle = 10
-# Slow down the robot such that it never makes a second change within +# Slow down the robot such that it never makes a second page edit within # 'put_throttle' seconds. put_throttle = 10 -# By default, the get_throttle is turned off, and 'maxlag' is used to -# control the rate of server access. Set this to non-zero to use a throttle -# on read access. -get_throttle = 0 + # Sometimes you want to know when a delay is inserted. If a delay is larger # than 'noisysleep' seconds, it is logged on the screen. noisysleep = 3.0 @@ -299,9 +299,11 @@ # Defer bot edits during periods of database server lag. For details, see # http://www.mediawiki.org/wiki/Maxlag_parameter # You can set this variable to a number of seconds, or to None (or 0) to -# disable this behavior. -# It is recommended that you do not change this parameter unless you know -# what you are doing and have a good reason for it! +# disable this behavior. Higher values are more aggressive in seeking +# access to the wiki. +# Non-Wikimedia wikis may or may not support this feature; for families +# that do not use it, it is recommended to set minthrottle (above) to +# at least 1 second. maxlag = 5
# Maximum of pages which can be retrieved by special pages. Increase this if
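In short, the new defaults rely on the server-side maxlag mechanism for read pacing (minthrottle = 0) and keep a local delay only for edits (put_throttle). A plausible user-config.py override for a slow or non-Wikimedia wiki without maxlag support, using the variables defined above (the values themselves are only illustrative):

    # user-config.py (illustrative values)
    minthrottle = 1      # reinstate a small local delay between read requests
    maxthrottle = 10     # upper bound on any computed delay
    put_throttle = 15    # wait at least 15 seconds between page edits
    maxlag = 0           # 0 (or None) disables maxlag for wikis that do not support it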
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/data/api.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -11,7 +11,6 @@
from UserDict import DictMixin from datetime import datetime, timedelta -import http import simplejson as json import logging import re @@ -65,7 +64,7 @@
>>> r = Request(site=mysite, action="query", meta="userinfo") >>> # This is equivalent to - >>> # http://%5Bpath%5D/api.php?action=query&meta=userinfo&format=json + >>> # http://%7Bpath%7D/api.php?action=query&meta=userinfo&format=json >>> # r.data is undefined until request is submitted >>> print r.data Traceback (most recent call last): @@ -75,8 +74,9 @@ >>> r['meta'] = "userinfo|siteinfo" >>> # add a new parameter >>> r['siprop'] = "namespaces" + >>> # note that "uiprop" param gets added automatically >>> r.params - {'action': 'query', 'meta': 'userinfo|siteinfo', 'maxlag': '5', 'siprop': 'namespaces', 'format': 'json'} + {'maxlag': '5', 'format': 'json', 'meta': 'userinfo|siteinfo', 'action': 'query', 'siprop': 'namespaces', 'uiprop': 'blockinfo|hasmsg'} >>> data = r.submit() >>> type(data) <type 'dict'> @@ -137,6 +137,7 @@ @return: The data retrieved from api.php (a dict)
""" + from pywikibot.comms import http if self.params['format'] != 'json': raise TypeError("Query format '%s' cannot be parsed." % self.params['format']) @@ -292,7 +293,7 @@ # FIXME: this won't handle generators with <redirlinks> subelements # correctly yet while True: - self.site.get_throttle() + self.site.throttle() self.data = self.request.submit() if not self.data or not isinstance(self.data, dict): raise StopIteration @@ -394,7 +395,7 @@ """Iterate objects for elements found in response.""" # this looks for the resultkey ''inside'' a <page> entry while True: - self.site.get_throttle() + self.site.throttle() self.data = self.request.submit() if not self.data or not isinstance(self.data, dict): raise StopIteration @@ -459,7 +460,10 @@ self.username = login_result['login']['lgusername'] return "\n".join(cookies)
+ def storecookiedata(self, data): + pywikibot.cookie_jar.save()
+ if __name__ == "__main__": from pywikibot import Site mysite = Site("en", "wikipedia") @@ -468,5 +472,5 @@ import doctest doctest.testmod() _test() + pywikibot.stopme()
-
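One small but deliberate change in api.py: the module-level `import http` becomes `from pywikibot.comms import http` inside Request.submit(). Deferring the import most likely avoids a circular dependency (comms/http.py imports pywikibot at load time to publish cookie_jar) and postpones the thread and cookie-jar setup that comms/http.py performs on import until a request is actually submitted. The general pattern, with two hypothetical modules standing in for the real ones:

    # first.py (hypothetical; plays the role of the pywikibot package)
    import second          # safe, because second defers its import of first

    COOKIE_JAR = object()  # attribute that second needs, defined after the import above

    # second.py (hypothetical; plays the role of data/api.py)
    def submit():
        import first       # resolved at call time, when first is fully initialised
        return first.COOKIE_JAR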
Deleted: branches/rewrite/pywikibot/data/http.py =================================================================== --- branches/rewrite/pywikibot/data/http.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/data/http.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Basic HTTP access interface. - -This module handles communication between the bot and the HTTP threads. - -This module is responsible for - - Setting up a connection pool - - Providing a (blocking) interface for HTTP requests - - Translate site objects with query strings into urls - - Urlencoding all data - - Basic HTTP error handling -""" - -# -# (C) Pywikipedia bot team, 2007 -# -# Distributed under the terms of the MIT license. -# - -__version__ = '$Id: $' -__docformat__ = 'epytext' - -import Queue -import urllib -import urlparse -import logging -import atexit - -import threadedhttp - - -# global variables - -useragent = 'Pywikipediabot/2.0' # This should include some global version string -numthreads = 1 -threads = [] - -connection_pool = threadedhttp.ConnectionPool() -cookie_jar = threadedhttp.LockableCookieJar() -http_queue = Queue.Queue() - -# Build up HttpProcessors -logging.info('Starting %i threads...' % numthreads) -for i in range(numthreads): - proc = threadedhttp.HttpProcessor(http_queue, cookie_jar, connection_pool) - threads.append(proc) - proc.start() - -# Prepare flush on quit -def _flush(): - for i in threads: - http_queue.put(None) - logging.info('Waiting for threads to finish... ') - for i in threads: - i.join() -atexit.register(_flush) - -def request(site, uri, *args, **kwargs): - """Queue a request to be submitted to Site. - - All parameters not listed below are the same as - L{httplib2.Http.request}, but the uri is relative - - @param site: The Site to connect to - @return: The received data (a unicode string). - """ - baseuri = "%s://%s/" % (site.protocol(), site.hostname()) - uri = urlparse.urljoin(baseuri, uri) - - request = threadedhttp.HttpRequest(uri, *args, **kwargs) - http_queue.put(request) - request.lock.acquire() - - #do some error correcting stuff - - #if all else fails - if isinstance(request.data, Exception): - raise request.data - - if request.data[0].status != 200: - logging.warning("Http response status %s" % request.data[0].status) - - return request.data[1]
Deleted: branches/rewrite/pywikibot/data/threadedhttp.py =================================================================== --- branches/rewrite/pywikibot/data/threadedhttp.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/data/threadedhttp.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -1,414 +0,0 @@ -# -*- coding: utf-8 -*- -""" Httplib2 threaded cookie layer - -This class extends httplib2, adding support for: - - Cookies, guarded for cross-site redirects - - Thread safe ConnectionPool and LockableCookieJar classes - - HttpProcessor thread class - - HttpRequest object - -""" - -# (C) 2007 Pywikipedia bot team, 2007 -# (C) 2006 Httplib 2 team, 2006 -# (C) 2007 Metaweb Technologies, Inc. -# -# Partially distributed under the MIT license -# Partially distributed under Metaweb Technologies, Incs license -# which is compatible with the MIT license - -__version__ = '$Id$' -__docformat__ = 'epytext' - -# standard python libraries -import re -import threading -import time -import logging - -import urllib -import cookielib - -# easy_install safeguarded dependencies -import pkg_resources -pkg_resources.require("httplib2") -import httplib2 - - -class ConnectionPool(object): - """A thread-safe connection pool.""" - - def __init__(self, maxnum=5): - """ - @param maxnum: Maximum number of connections per identifier. - The pool drops excessive connections added. - - """ - self.connections = {} - self.lock = threading.Lock() - self.maxnum = maxnum - - def __del__(self): - """Destructor to close all connections in the pool.""" - self.lock.acquire() - try: - for key in self.connections: - for connection in self.connections[key]: - connection.close() - finally: - self.lock.release() - - def __repr__(self): - return self.connections.__repr__() - - def pop_connection(self, identifier): - """Get a connection from identifier's connection pool. - - @param identifier: The pool identifier - @return: A connection object if found, None otherwise - - """ - self.lock.acquire() - try: - if identifier in self.connections: - if len(self.connections[identifier]) > 0: - return self.connections[identifier].pop() - return None - finally: - self.lock.release() - - def push_connection(self, identifier, connection): - """Add a connection to identifier's connection pool. - - @param identifier: The pool identifier - @param connection: The connection to add to the pool - - """ - self.lock.acquire() - try: - if identifier not in self.connections: - self.connections[identifier] = [] - - if len(self.connections[identifier]) == self.maxnum: - logging.debug('closing %s connection %r' % (identifier, connection)) - connection.close() - del connection - else: - self.connections[identifier].append(connection) - finally: - self.lock.release() - -class LockableCookieJar(cookielib.CookieJar): - """CookieJar with integrated Lock object.""" - def __init__(self, *args, **kwargs): - cookielib.CookieJar.__init__(self, *args, **kwargs) - self.lock = threading.Lock() - -class Http(httplib2.Http): - """Subclass of httplib2.Http that stores cookies. - - Overrides httplib2's internal redirect support to prevent cookies being - eaten by the wrong sites. - - """ - def __init__(self, *args, **kwargs): - """ - @param cookiejar: (optional) CookieJar to use. A new one will be - used when not supplied. - @param connection_pool: (optional) Connection pool to use. A new one - will be used when not supplied. - @param max_redirects: (optional) The maximum number of redirects to - follow. 5 is default. 
- - """ - self.cookiejar = kwargs.pop('cookiejar', LockableCookieJar()) - self.connection_pool = kwargs.pop('connection_pool', ConnectionPool()) - self.max_redirects = kwargs.pop('max_redirects', 5) - httplib2.Http.__init__(self, *args, **kwargs) - - def request(self, uri, method="GET", body=None, headers=None, - max_redirects=None, connection_type=None): - """Start an HTTP request. - - @param uri: The uri to retrieve - @param method: (optional) The HTTP method to use. Default is 'GET' - @param body: (optional) The request body. Default is no body. - @param headers: (optional) Additional headers to send. Defaults - include C{connection: keep-alive}, C{user-agent} and - C{content-type}. - @param max_redirects: (optional) The maximum number of redirects to - use for this request. The class instance's max_redirects is - default - @param connection_type: (optional) see L{httplib2.Http.request} - - @return: (response, content) tuple - - """ - if max_redirects is None: - max_redirects = self.max_redirects - if headers is None: - headers = {} - # Prepare headers - headers.pop('cookie', None) - req = DummyRequest(uri, headers) - self.cookiejar.lock.acquire() - try: - self.cookiejar.add_cookie_header(req) - finally: - self.cookiejar.lock.release() - headers = req.headers - - # Wikimedia squids: add connection: keep-alive to request headers - # unless overridden - headers['connection'] = headers.pop('connection', 'keep-alive') - - # determine connection pool key and fetch connection - (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm( - httplib2.iri2uri(uri)) - conn_key = scheme+":"+authority - - connection = self.connection_pool.pop_connection(conn_key) - if connection is not None: - self.connections[conn_key] = connection - - # Redirect hack: we want to regulate redirects - follow_redirects = self.follow_redirects - self.follow_redirects = False - logging.debug('%r' % ( - (uri, method, body, headers, max_redirects, connection_type),)) - try: - (response, content) = httplib2.Http.request( - self, uri, method, body, headers, - max_redirects, connection_type) - except Exception, e: # what types? 
- # return exception instance to be retrieved by the calling thread - return e - self.follow_redirects = follow_redirects - - # return connection to pool - self.connection_pool.push_connection(conn_key, - self.connections[conn_key]) - del self.connections[conn_key] - - # First write cookies - self.cookiejar.lock.acquire() - try: - self.cookiejar.extract_cookies(DummyResponse(response), req) - finally: - self.cookiejar.lock.release() - - # Check for possible redirects - redirectable_response = ((response.status == 303) or - (response.status in [300, 301, 302, 307] and - method in ["GET", "HEAD"])) - if self.follow_redirects and (max_redirects > 0) \ - and redirectable_response: - (response, content) = self._follow_redirect( - uri, method, body, headers, response, content, max_redirects) - - return (response, content) - - def _follow_redirect(self, uri, method, body, headers, response, - content, max_redirects): - """Internal function to follow a redirect recieved by L{request}""" - (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm( - httplib2.iri2uri(uri)) - if self.cache: - cachekey = defrag_uri - else: - cachekey = None - - # Pick out the location header and basically start from the beginning - # remembering first to strip the ETag header and decrement our 'depth' - if not response.has_key('location') and response.status != 300: - raise httplib2.RedirectMissingLocation( - "Redirected but the response is missing a Location: header.", - response, content) - # Fix-up relative redirects (which violate an RFC 2616 MUST) - if response.has_key('location'): - location = response['location'] - (scheme, authority, path, query, fragment) = httplib2.parse_uri( - location) - if authority == None: - response['location'] = httplib2.urlparse.urljoin(uri, location) - logging.debug('Relative redirect: changed [%s] to [%s]' - % (location, response['location'])) - if response.status == 301 and method in ["GET", "HEAD"]: - response['-x-permanent-redirect-url'] = response['location'] - if not response.has_key('content-location'): - response['content-location'] = absolute_uri - httplib2._updateCache(headers, response, content, self.cache, - cachekey) - - headers.pop('if-none-match', None) - headers.pop('if-modified-since', None) - - if response.has_key('location'): - location = response['location'] - redirect_method = ((response.status == 303) and - (method not in ["GET", "HEAD"]) - ) and "GET" or method - return self.request(location, redirect_method, body=body, - headers=headers, - max_redirects=max_redirects - 1) - else: - raise RedirectLimit( - "Redirected more times than redirection_limit allows.", - response, content) - - -class HttpRequest(object): - """Object wrapper for HTTP requests that need to block origin thread. - - Usage: - - >>> request = HttpRequest('http://www.google.com') - >>> queue.put(request) - >>> request.lock.acquire() - >>> print request.data - - C{request.lock.acquire()} will block until the data is available. - - """ - def __init__(self, *args, **kwargs): - """See C{Http.request} for parameters.""" - self.args = args - self.kwargs = kwargs - self.data = None - self.lock = threading.Semaphore(0) - - -class HttpProcessor(threading.Thread): - """Thread object to spawn multiple HTTP connection threads.""" - def __init__(self, queue, cookiejar, connection_pool): - """ - @param queue: The C{Queue.Queue} object that contains L{HttpRequest} - objects. - @param cookiejar: The C{LockableCookieJar} cookie object to share among - requests. 
- @param connection_pool: The C{ConnectionPool} object which contains - connections to share among requests. - - """ - threading.Thread.__init__(self) - self.queue = queue - self.http = Http(cookiejar=cookiejar, connection_pool=connection_pool) - - def run(self): - # The Queue item is expected to either an HttpRequest object - # or None (to shut down the thread) - logging.debug('Thread started, waiting for requests.') - while (True): - item = self.queue.get() - if item is None: - logging.debug('Shutting down thread.') - return - try: - item.data = self.http.request(*item.args, **item.kwargs) - finally: - if item.lock: - item.lock.release() - - -# Metaweb Technologies, Inc. License: - # ======================================================================== - # The following dummy classes are: - # ======================================================================== - # Copyright (c) 2007, Metaweb Technologies, Inc. - # All rights reserved. - # - # Redistribution and use in source and binary forms, with or without - # modification, are permitted provided that the following conditions - # are met: - # * Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # * Redistributions in binary form must reproduce the above - # copyright notice, this list of conditions and the following - # disclaimer in the documentation and/or other materials provided - # with the distribution. - # - # THIS SOFTWARE IS PROVIDED BY METAWEB TECHNOLOGIES AND CONTRIBUTORS - # ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL METAWEB - # TECHNOLOGIES OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - # POSSIBILITY OF SUCH DAMAGE. 
- # ======================================================================== - -class DummyRequest(object): - """Simulated urllib2.Request object for httplib2 - implements only what's necessary for cookielib.CookieJar to work - """ - def __init__(self, url, headers=None): - self.url = url - self.headers = headers - self.origin_req_host = cookielib.request_host(self) - self.type, r = urllib.splittype(url) - self.host, r = urllib.splithost(r) - if self.host: - self.host = urllib.unquote(self.host) - - def get_full_url(self): - return self.url - - def get_origin_req_host(self): - # TODO to match urllib2 this should be different for redirects - return self.origin_req_host - - def get_type(self): - return self.type - - def get_host(self): - return self.host - - def get_header(self, key, default=None): - return self.headers.get(key.lower(), default) - - def has_header(self, key): - return key in self.headers - - def add_unredirected_header(self, key, val): - # TODO this header should not be sent on redirect - self.headers[key.lower()] = val - - def is_unverifiable(self): - # TODO to match urllib2, this should be set to True when the - # request is the result of a redirect - return False - -class DummyResponse(object): - """Simulated urllib2.Request object for httplib2 - implements only what's necessary for cookielib.CookieJar to work - """ - def __init__(self, response): - self.response = response - - def info(self): - return DummyMessage(self.response) - -class DummyMessage(object): - """Simulated mimetools.Message object for httplib2 - implements only what's necessary for cookielib.CookieJar to work - """ - def __init__(self, response): - self.response = response - - def getheaders(self, k): - k = k.lower() - v = self.response.get(k.lower(), None) - if k not in self.response: - return [] - #return self.response[k].split(re.compile(',\s*')) - - # httplib2 joins multiple values for the same header - # using ','. but the netscape cookie format uses ',' - # as part of the expires= date format. so we have - # to split carefully here - header.split(',') won't do it. - HEADERVAL= re.compile(r'\s*(([^,]|(,\s*\d))+)') - return [h[0] for h in HEADERVAL.findall(self.response[k])]
Modified: branches/rewrite/pywikibot/login.py =================================================================== --- branches/rewrite/pywikibot/login.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/login.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -26,15 +26,15 @@
-force Ignores if the user is already logged in, and tries to log in.
-If not given as parameter, the script will ask for your username and password -(password entry will be hidden), log in to your home wiki using this -combination, and store the resulting cookies (containing your password hash, -so keep it secured!) in a file in the login-data subdirectory. +If not given as parameter, the script will ask for your username and +password (password entry will be hidden), log in to your home wiki using +this combination, and store the resulting cookies (containing your password +hash, so keep it secured!) in a file in the data subdirectory.
-All scripts in this library will be looking for this cookie file and will use the -login information if it is present. +All scripts in this library will be looking for this cookie file and will +use the login information if it is present.
-To log out, throw away the XX-login.data file that is created in the login-data +To log out, throw away the *.lwp file that is created in the data subdirectory. """ # @@ -169,7 +169,7 @@ if match: id = match.group('id') if not config.solve_captcha: - raise wikipedia.CaptchaError(id) + raise pywikibot.CaptchaError(id) url = self.site.protocol() + '://' + self.site.hostname() + self.site.captcha_image_address(id) answer = wikipedia.ui.askForCaptcha(url) return self.getCookie(remember = remember, captchaId = id, captchaAnswer = answer) @@ -255,7 +255,7 @@ for arg in wikipedia.handleArgs(): if arg.startswith("-pass"): if len(arg) == 5: - password = wikipedia.input(u'Password for all accounts:', password = True) + password = pywikibot.input(u'Password for all accounts:', password = True) else: password = arg[6:] elif arg == "-sysop": @@ -274,9 +274,9 @@ namedict = config.usernames for familyName in namedict.iterkeys(): for lang in namedict[familyName].iterkeys(): - site = wikipedia.getSite(code=lang, fam=familyName) + site = pywikibot.getSite(code=lang, fam=familyName) if not forceLogin and site.loggedInAs(sysop = sysop) != None: - wikipedia.output(u'Already logged in on %s' % site) + pywikibot.output(u'Already logged in on %s' % site) else: loginMan = LoginManager(password, sysop = sysop, site = site) loginMan.login() @@ -288,4 +288,4 @@ try: main() finally: - wikipedia.stopme() + pywikibot.stopme()
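The docstring now points users at *.lwp files because the cookie store is an LWPCookieJar created in comms/http.py (named "<family>-<lang>-<username>.lwp" under the data directory) and saved by LoginManager.storecookiedata(), which simply calls pywikibot.cookie_jar.save(). The underlying mechanics, with an illustrative filename (Python 2):

    import cookielib

    # filename follows the pattern used in comms/http.py; this particular name is made up
    jar = cookielib.LWPCookieJar("wikipedia-en-ExampleBot.lwp")

    try:
        jar.load()                        # pick up cookies left by a previous session
    except (IOError, cookielib.LoadError):
        pass                              # first run: no cookie file exists yet

    # ... a successful login adds session cookies to the jar ...

    jar.save()                            # what storecookiedata() ultimately triggers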
Modified: branches/rewrite/pywikibot/page.py =================================================================== --- branches/rewrite/pywikibot/page.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/page.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -93,10 +93,10 @@ # copy all of source's attributes to this object self.__dict__ = source.__dict__ elif isinstance(source, Link): - self._site = link.site - self._section = link.section - self._ns = link.namespace - self._title = link.title + self._site = source.site + self._section = source.section + self._ns = source.namespace + self._title = source.title # reassemble the canonical title from components if self._ns: self._title = "%s:%s" % (self.site().namespace(self._ns),
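The page.py hunk is a straight bug fix: the Link branch of the Page constructor referenced an undefined name `link` instead of the `source` parameter, so building a Page from a Link raised NameError. What the corrected branch does, shown with skeletal stand-ins rather than the real classes:

    class Link(object):
        """Skeletal stand-in for pywikibot's Link."""
        def __init__(self, site, title, namespace=0, section=None):
            self.site, self.title = site, title
            self.namespace, self.section = namespace, section

    class Page(object):
        """Skeletal stand-in showing only the copy-from-Link branch."""
        def __init__(self, source, title=u""):
            if isinstance(source, Link):
                self._site = source.site          # previously: link.site -> NameError
                self._section = source.section
                self._ns = source.namespace
                self._title = source.title
            else:
                self._site, self._title, self._ns = source, title, 0

    p = Page(Link("en.wikipedia.org", u"Sandbox", namespace=4))
    assert p._ns == 4 and p._title == u"Sandbox"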
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/site.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -103,15 +103,8 @@ self._pagemutex = threading.Lock() self._locked_pages = []
- pt_min = min(config.minthrottle, config.put_throttle) - self.put_throttle = Throttle(self, pt_min, config.maxthrottle, - verbosedelay=True) - self.put_throttle.setDelay(config.put_throttle) + self.throttle = Throttle(self, multiplydelay=True, verbosedelay=True)
- gt_min = min(config.minthrottle, config.get_throttle) - self.get_throttle = Throttle(self, gt_min, config.maxthrottle) - self.get_throttle.setDelay(config.get_throttle) - def family(self): """Return the associated Family object.""" return self._family @@ -409,7 +402,7 @@ - blockinfo: present if user is blocked (dict)
""" - if not hasattr(self, "_userinfo"): + if not hasattr(self, "_userinfo") or "rights" not in self._userinfo: uirequest = api.Request( site=self, action="query",
Modified: branches/rewrite/pywikibot/throttle.py =================================================================== --- branches/rewrite/pywikibot/throttle.py 2008-04-17 16:43:25 UTC (rev 5229) +++ branches/rewrite/pywikibot/throttle.py 2008-04-17 17:54:39 UTC (rev 5230) @@ -27,22 +27,24 @@ Calling this object blocks the calling thread until at least 'delay' seconds have passed since the previous call.
- Each Site initiates two Throttle objects: get_throttle to control - the rate of read access, and put_throttle to control the rate of write - access. These are available as the Site.get_throttle and Site.put_throttle - objects. + Each Site initiates one Throttle object (site.throttle) to control the + rate of access.
""" def __init__(self, site, mindelay=config.minthrottle, maxdelay=config.maxthrottle, + writedelay=config.put_throttle, multiplydelay=True, verbosedelay=False): self.lock = threading.RLock() self.mysite = str(site) + self.logfn = config.datafilepath('throttle.log') self.mindelay = mindelay self.maxdelay = maxdelay - self.now = 0 + self.writedelay = writedelay + self.last_read = 0 + self.last_write = 0 self.next_multiplicity = 1.0 - self.checkdelay = 240 # Check logfile again after this many seconds + self.checkdelay = 120 # Check logfile again after this many seconds self.dropdelay = 360 # Ignore processes that have not made # a check in this many seconds self.releasepid = 1800 # Free the process id after this many seconds @@ -51,11 +53,8 @@ self.verbosedelay = verbosedelay if multiplydelay: self.checkMultiplicity() - self.setDelay(mindelay) + self.setDelays()
- def logfn(self): - return config.datafilepath('throttle.log') - def checkMultiplicity(self): global pid self.lock.acquire() @@ -65,7 +64,7 @@ my_pid = 1 count = 1 try: - f = open(self.logfn(), 'r') + f = open(self.logfn, 'r') except IOError: if not pid: pass @@ -101,7 +100,7 @@ processes.append({'pid': my_pid, 'time': self.checktime, 'site': self.mysite}) - f = open(self.logfn(), 'w') + f = open(self.logfn, 'w') processes.sort(key=lambda p:(p['pid'], p['site'])) for p in processes: f.write("%(pid)s %(time)s %(site)s\n" % p) @@ -114,20 +113,24 @@ finally: self.lock.release()
- def setDelay(self, delay=config.minthrottle, absolute=False): - """Set the nominal delay in seconds.""" + def setDelays(self, delay=None, absolute=False): + """Set the nominal delays in seconds. Defaults to config values.""" self.lock.acquire() try: + if delay is None: + delay = self.mindelay if absolute: self.maxdelay = delay self.mindelay = delay self.delay = delay + self.writedelay = min(max(self.mindelay, self.writedelay), + self.maxdelay) # Start the delay count now, not at the next check - self.now = time.time() + self.last_read = self.last_write = time.time() finally: self.lock.release()
- def getDelay(self): + def getDelay(self, write=False): """Return the actual delay, accounting for multiple processes.
This value is the maximum wait between reads/writes, not taking @@ -135,7 +138,10 @@
""" global pid - thisdelay = self.delay + if write: + thisdelay = self.writedelay + else: + thisdelay = self.delay if pid: # If set, we're checking for multiple processes if time.time() > self.checktime + self.checkdelay: self.checkMultiplicity() @@ -146,13 +152,16 @@ thisdelay *= self.process_multiplicity return thisdelay
- def waittime(self): + def waittime(self, write=False): """Return waiting time in seconds if a query would be made right now""" # Take the previous requestsize in account calculating the desired # delay this time - thisdelay = self.getDelay() + thisdelay = self.getDelay(write=write) now = time.time() - ago = now - self.now + if write: + ago = now - self.last_write + else: + ago = now - self.last_read if ago < thisdelay: delta = thisdelay - ago return delta @@ -164,7 +173,7 @@ self.checktime = 0 processes = [] try: - f = open(self.logfn(), 'r') + f = open(self.logfn, 'r') except IOError: return else: @@ -183,13 +192,13 @@ processes.append({'pid': this_pid, 'time': ptime, 'site': this_site}) - f = open(self.logfn(), 'w') + f = open(self.logfn, 'w') processes.sort(key=lambda p:p['pid']) for p in processes: f.write("%(pid)s %(time)s %(site)s\n" % p) f.close()
- def __call__(self, requestsize=1): + def __call__(self, requestsize=1, write=False): """ Block the calling program if the throttle time has not expired.
@@ -198,7 +207,7 @@ """ self.lock.acquire() try: - waittime = self.waittime() + waittime = self.waittime(write=write) # Calculate the multiplicity of the next delay based on how # big the request is that is being posted now. # We want to add "one delay" for each factor of two in the @@ -213,7 +222,10 @@ time.localtime())) ) time.sleep(waittime) - self.now = time.time() + if write: + self.last_write = time.time() + else: + self.last_read = time.time() finally: self.lock.release()
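Taken together, the throttle now keeps separate last_read and last_write timestamps, getDelay() and waittime() accept a write flag, and __call__() scales the next delay by the size of the request just made. A usage sketch, assuming a working pywikibot configuration so that importing the module (and resolving the throttle.log path) succeeds; multiplydelay=False keeps the example from registering itself in the shared process log (Python 2):

    from pywikibot.throttle import Throttle

    t = Throttle("wikipedia:en", multiplydelay=False)   # the site argument only needs str()
    t.setDelays(delay=2)            # nominal read delay; writedelay is clamped to [min, max]

    print t.waittime()              # seconds a read issued right now would wait
    print t.waittime(write=True)    # seconds an edit issued right now would wait

    t()                             # block if needed, then record the read timestamp
    t(requestsize=10)               # large request: the *next* delay gets multiplied
    t(write=True)                   # block on the write delay, record the write timestamp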