Revision: 5127 Author: russblau Date: 2008-03-13 21:12:29 +0000 (Thu, 13 Mar 2008)
Log Message: ----------- Add throttling, add Page tests
Modified Paths: -------------- branches/rewrite/pywikibot/__init__.py branches/rewrite/pywikibot/config.py branches/rewrite/pywikibot/data/api.py branches/rewrite/pywikibot/site.py branches/rewrite/pywikibot/tests/page_tests.py
Added Paths: ----------- branches/rewrite/pywikibot/throttle.py
Modified: branches/rewrite/pywikibot/__init__.py =================================================================== --- branches/rewrite/pywikibot/__init__.py 2008-03-13 12:48:27 UTC (rev 5126) +++ branches/rewrite/pywikibot/__init__.py 2008-03-13 21:12:29 UTC (rev 5127) @@ -51,6 +51,8 @@ key = '%s:%s:%s' % (fam, code, user) if not _sites.has_key(key): _sites[key] = __Site(code=code, fam=fam, user=user) + _sites[key].getsiteinfo() + _sites[key].login(False) return _sites[key]
getSite = Site # alias for backwards-compatibility @@ -68,3 +70,17 @@
import logging
logging.getLogger().setLevel(logging.DEBUG)


def stopme():
    """Drop this process from the throttle log.

    Can be called manually if desired, but if not, will be called automatically
    at Python exit.

    Only sites that already exist in the module-level ``_sites`` cache are
    consulted.  Calling ``Site()`` here instead would, when no matching site
    is cached yet, construct a new one (including its siteinfo query and
    login attempt) merely to remove a throttle entry.  If no site was ever
    created, there is nothing to do.

    """
    # only need one drop() call because all throttles use the same global pid
    for site in _sites.values():
        site.get_throttle.drop()
        break


import atexit
atexit.register(stopme)
Modified: branches/rewrite/pywikibot/config.py =================================================================== --- branches/rewrite/pywikibot/config.py 2008-03-13 12:48:27 UTC (rev 5126) +++ branches/rewrite/pywikibot/config.py 2008-03-13 21:12:29 UTC (rev 5127) @@ -288,14 +288,18 @@ # Slow down the robot such that it never makes a second change within # 'put_throttle' seconds. put_throttle = 10 +# By default, the get_throttle is turned off, and 'maxlag' is used to +# control the rate of server access. Set this to non-zero to use a throttle +# on read access. +get_throttle = 0 # Sometimes you want to know when a delay is inserted. If a delay is larger # than 'noisysleep' seconds, it is logged on the screen. noisysleep = 3.0
# Defer bot edits during periods of database server lag. For details, see # http://www.mediawiki.org/wiki/Maxlag_parameter -# You can set this variable to a number of seconds, or to None to disable -# this behavior. +# You can set this variable to a number of seconds, or to None (or 0) to +# disable this behavior. # It is recommended that you do not change this parameter unless you know # what you are doing and have a good reason for it! maxlag = 5
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-03-13 12:48:27 UTC (rev 5126) +++ branches/rewrite/pywikibot/data/api.py 2008-03-13 21:12:29 UTC (rev 5127) @@ -19,10 +19,10 @@ import time import urllib
+import config import pywikibot from pywikibot import login
- lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
@@ -94,7 +94,7 @@ if "format" not in kwargs: self.params["format"] = "json" if "maxlag" not in kwargs: - self.params["maxlag"] = "5" # replace with configurable constant? + self.params["maxlag"] = str(config.maxlag) self.update(**kwargs)
# implement dict interface @@ -229,6 +229,7 @@ # following "if" is used for testing with plugged-in data; it wouldn't # be needed for actual usage if not hasattr(self, "data"): + site.get_throttle() self.data = self.request.submit() if not self.data or not isinstance(self.data, dict): raise StopIteration
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-03-13 12:48:27 UTC (rev 5126) +++ branches/rewrite/pywikibot/site.py 2008-03-13 21:12:29 UTC (rev 5127) @@ -11,7 +11,9 @@ __version__ = '$Id: $'
import pywikibot +from pywikibot.throttle import Throttle from pywikibot.data import api +import config
import os import threading @@ -99,6 +101,14 @@ self._mutex = threading.Lock() self._locked_pages = []
+ pt_min = min(config.minthrottle, config.put_throttle) + self.put_throttle = Throttle(pt_min, config.maxthrottle) + self.put_throttle.setDelay(config.put_throttle) + + gt_min = min(config.minthrottle, config.get_throttle) + self.get_throttle = Throttle(gt_min, config.maxthrottle) + self.get_throttle.setDelay(config.get_throttle) + def family(self): """Return the associated Family object.""" return self._family @@ -188,7 +198,7 @@ finally: self._mutex.release()
- + class APISite(BaseSite): """API interface to MediaWiki site.
@@ -500,7 +510,7 @@ self.getsiteinfo() return self._namespaces
- def namespace(self, num, all = False): + def namespace(self, num, all=False): """Return string containing local name of namespace 'num'.
If optional argument 'all' is true, return a list of all recognized @@ -528,9 +538,15 @@ in this list.
""" + if 'bot' in self.getuserinfo()['groups']: + limit = 5000 + else: + limit = 500 + if followRedirects: + limit = limit / 2 bltitle = page.title(withSection=False) blgen = api.PageGenerator("backlinks", gbltitle=bltitle, - gbllimit="5000") + gbllimit=str(limit)) if namespaces is not None: blgen.request["gblnamespace"] = u"|".join(unicode(ns) for ns in namespaces)
Modified: branches/rewrite/pywikibot/tests/page_tests.py =================================================================== --- branches/rewrite/pywikibot/tests/page_tests.py 2008-03-13 12:48:27 UTC (rev 5126) +++ branches/rewrite/pywikibot/tests/page_tests.py 2008-03-13 21:12:29 UTC (rev 5127) @@ -48,7 +48,7 @@ u"Hispanic (U.S. Census)" : u"Hispanic (U.S. Census)", u"Stołpce" : u"Stołpce", u"Nowy_Sącz" : u"Nowy Sącz", - u"battle of Węgierska Górka" : u"Battle of Węgierska Górka", + u"battle of Węgierska Górka" : u"Battle of Węgierska Górka", } # random bunch of possible section titles sections = [u"", @@ -69,7 +69,17 @@ site) self.assertEqual(m.namespace, num)
def testTitles(self):
    """Check that Link() normalizes page titles.

    Every raw title in self.titles is prefixed with the first entry of
    self.namespaces[0] and of self.namespaces[1]; the resulting Link must
    report the expected normalized title, with or without a leading colon.
    """
    for raw_title in self.titles:
        expected = self.titles[raw_title]
        for ns_index in (0, 1):
            prefixed = self.namespaces[ns_index][0] + raw_title
            plain_link = pywikibot.page.Link(prefixed)
            self.assertEqual(plain_link.title, expected)
            # prefixing name with ":" shouldn't change result
            colon_link = pywikibot.page.Link(":" + prefixed)
            self.assertEqual(colon_link.title, expected)
+ class TestPageObject(unittest.TestCase): def testSite(self): """Test site() method"""
# -*- coding: utf-8 -*-
# Added: branches/rewrite/pywikibot/throttle.py
# --- branches/rewrite/pywikibot/throttle.py (rev 0)
# +++ branches/rewrite/pywikibot/throttle.py 2008-03-13 21:12:29 UTC (rev 5127)
"""
Mechanics to slow down wiki read and/or write rate.
"""
#
# (C) Pywikipedia bot team, 2008
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: $'

import config
import pywikibot

import logging
import math
import threading
import time

pid = False     # global process identifier
                # Don't check for other processes unless this is set


class Throttle(object):
    """Control rate of access to wiki server

    Calling this object blocks the calling thread until at least 'delay'
    seconds have passed since the previous call.

    Each Site initiates two Throttle objects: get_throttle to control
    the rate of read access, and put_throttle to control the rate of write
    access. These are available as the Site.get_throttle and Site.put_throttle
    objects.

    """
    def __init__(self, mindelay=config.minthrottle,
                 maxdelay=config.maxthrottle,
                 multiplydelay=True):
        """Create a throttle.

        mindelay and maxdelay bound the delay (in seconds) returned by
        getDelay().  If multiplydelay is true, the shared throttle log is
        consulted so the delay can be scaled by the number of bot processes
        found there.

        """
        self.lock = threading.RLock()
        self.mindelay = mindelay
        self.maxdelay = maxdelay
        self.now = 0
        self.next_multiplicity = 1.0
        self.checkdelay = 240   # Check logfile again after this many seconds
        self.dropdelay = 360    # Ignore processes that have not made
                                # a check in this many seconds
        self.releasepid = 1800  # Free the process id after this many seconds
        self.lastwait = 0.0
        self.delay = 0
        # Give these a value up front: getDelay() reads them, and they would
        # otherwise exist only after checkMultiplicity() has run, which is
        # never the case when multiplydelay is false.
        self.multiplydelay = multiplydelay
        self.checktime = 0
        self.process_multiplicity = 1
        if multiplydelay:
            self.checkMultiplicity()
        self.setDelay(mindelay)

    def logfn(self):
        """Return the path of the throttle log file."""
        return config.datafilepath('throttle.log')

    def _read_log(self, now):
        """Return (pid, timestamp) pairs read from the throttle log.

        Entries older than 'releasepid' seconds relative to 'now' and lines
        that cannot be parsed are left out.  Raises IOError if the log file
        cannot be read.

        """
        f = open(self.logfn(), 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()
        entries = []
        for line in lines:
            try:
                fields = line.split(' ')
                this_pid = int(fields[0])
                ptime = int(fields[1].split('.')[0])
            except (IndexError, ValueError):
                continue    # Sometimes the file gets corrupted
                            # ignore that line
            if now - ptime <= self.releasepid:
                entries.append((this_pid, ptime))
        return entries

    def _write_log(self, processes):
        """Overwrite the throttle log with the {pid: timestamp} mapping."""
        f = open(self.logfn(), 'w')
        try:
            for p in processes:
                f.write(str(p) + ' ' + str(processes[p]) + '\n')
        finally:
            f.close()

    def checkMultiplicity(self):
        """Count the bot processes in the throttle log and register this one.

        Assigns the global pid on first use (one more than the highest pid
        still listed), rewrites the log with a fresh timestamp for this
        process, and stores the number of recently active processes in
        self.process_multiplicity.

        """
        global pid
        self.lock.acquire()
        try:
            logging.debug("Checking multiplicity: pid = %s" % pid)
            now = time.time()
            try:
                entries = self._read_log(now)
            except IOError:
                # A missing log is expected before any process has
                # registered; once we hold a pid it is a real error.
                if pid:
                    raise
                entries = []
            processes = {}
            my_pid = 1
            count = 1
            for this_pid, ptime in entries:
                if now - ptime <= self.dropdelay and this_pid != pid:
                    count += 1
                processes[this_pid] = ptime
                if this_pid >= my_pid:
                    my_pid = this_pid + 1

            if not pid:
                pid = my_pid
            self.checktime = time.time()
            processes[pid] = self.checktime
            self._write_log(processes)
            self.process_multiplicity = count
            pywikibot.output(
                u"Found %s processes running, including the current process."
                % count)
        finally:
            self.lock.release()

    def setDelay(self, delay=config.minthrottle, absolute=False):
        """Set the nominal delay in seconds.

        If absolute is true, mindelay and maxdelay are both set to 'delay'
        as well.

        """
        self.lock.acquire()
        try:
            if absolute:
                self.maxdelay = delay
                self.mindelay = delay
            self.delay = delay
            # Start the delay count now, not at the next check
            self.now = time.time()
        finally:
            self.lock.release()

    def getDelay(self):
        """Return the actual delay, accounting for multiple processes.

        This value is the maximum wait between reads/writes, not taking
        account of how much time has elapsed since the last access.

        """
        global pid
        thisdelay = self.delay
        if pid:  # If set, we're checking for multiple processes
            # Only throttles created with multiplydelay re-read the log.
            if self.multiplydelay \
                    and time.time() > self.checktime + self.checkdelay:
                self.checkMultiplicity()
            if thisdelay < (self.mindelay * self.next_multiplicity):
                thisdelay = self.mindelay * self.next_multiplicity
            elif thisdelay > self.maxdelay:
                thisdelay = self.maxdelay
            thisdelay *= self.process_multiplicity
        return thisdelay

    def waittime(self):
        """Return waiting time in seconds if a query would be made right now"""
        # Take the previous requestsize in account calculating the desired
        # delay this time
        thisdelay = self.getDelay()
        now = time.time()
        ago = now - self.now
        if ago < thisdelay:
            delta = thisdelay - ago
            return delta
        else:
            return 0.0

    def drop(self):
        """Remove me from the list of running bot processes.

        Rewrites the throttle log without the entry for the global pid (and
        without entries older than 'releasepid' seconds).  Does nothing if
        the log cannot be read.

        """
        # Make the next getDelay() consult the log again.
        self.checktime = 0
        try:
            entries = self._read_log(time.time())
        except IOError:
            return
        processes = dict((this_pid, ptime) for this_pid, ptime in entries
                         if this_pid != pid)
        self._write_log(processes)

    def __call__(self, requestsize=1):
        """
        Block the calling program if the throttle time has not expired.

        Parameter requestsize is the number of Pages to be read/written;
        multiply delay time by an appropriate factor.
        """
        self.lock.acquire()
        try:
            waittime = self.waittime()
            # Calculate the multiplicity of the next delay based on how
            # big the request is that is being posted now.
            # We want to add "one delay" for each factor of two in the
            # size of the request. Getting 64 pages at once allows 6 times
            # the delay time for the server.
            self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
            # Announce the delay if it exceeds a preset limit
            if waittime > config.noisysleep:
                pywikibot.output(u"Sleeping for %.1f seconds, %s"
                                 % (waittime,
                                    time.strftime("%Y-%m-%d %H:%M:%S",
                                                  time.localtime()))
                                 )
            time.sleep(waittime)
            self.now = time.time()
        finally:
            self.lock.release()