Revision: 5127
Author: russblau
Date: 2008-03-13 21:12:29 +0000 (Thu, 13 Mar 2008)
Log Message:
-----------
Add throttling, add Page tests
Modified Paths:
--------------
branches/rewrite/pywikibot/__init__.py
branches/rewrite/pywikibot/config.py
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/site.py
branches/rewrite/pywikibot/tests/page_tests.py
Added Paths:
-----------
branches/rewrite/pywikibot/throttle.py
Modified: branches/rewrite/pywikibot/__init__.py
===================================================================
--- branches/rewrite/pywikibot/__init__.py 2008-03-13 12:48:27 UTC (rev 5126)
+++ branches/rewrite/pywikibot/__init__.py 2008-03-13 21:12:29 UTC (rev 5127)
@@ -51,6 +51,8 @@
key = '%s:%s:%s' % (fam, code, user)
if not _sites.has_key(key):
_sites[key] = __Site(code=code, fam=fam, user=user)
+ _sites[key].getsiteinfo()
+ _sites[key].login(False)
return _sites[key]
getSite = Site # alias for backwards-compatibility
@@ -68,3 +70,17 @@
import logging
logging.getLogger().setLevel(logging.DEBUG)
+
+def stopme():
+ """Drop this process from the throttle log.
+
+ Can be called manually if desired, but if not, will be called automatically
+ at Python exit.
+
+ """
+ # only need one drop() call because all throttles use the same global pid
+ Site().get_throttle.drop()
+
+import atexit
+atexit.register(stopme)
+
Modified: branches/rewrite/pywikibot/config.py
===================================================================
--- branches/rewrite/pywikibot/config.py 2008-03-13 12:48:27 UTC (rev 5126)
+++ branches/rewrite/pywikibot/config.py 2008-03-13 21:12:29 UTC (rev 5127)
@@ -288,14 +288,18 @@
# Slow down the robot such that it never makes a second change within
# 'put_throttle' seconds.
put_throttle = 10
+# By default, the get_throttle is turned off, and 'maxlag' is used to
+# control the rate of server access. Set this to non-zero to use a throttle
+# on read access.
+get_throttle = 0
# Sometimes you want to know when a delay is inserted. If a delay is larger
# than 'noisysleep' seconds, it is logged on the screen.
noisysleep = 3.0
# Defer bot edits during periods of database server lag. For details, see
# http://www.mediawiki.org/wiki/Maxlag_parameter
-# You can set this variable to a number of seconds, or to None to disable
-# this behavior.
+# You can set this variable to a number of seconds, or to None (or 0) to
+# disable this behavior.
# It is recommended that you do not change this parameter unless you know
# what you are doing and have a good reason for it!
maxlag = 5
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-03-13 12:48:27 UTC (rev 5126)
+++ branches/rewrite/pywikibot/data/api.py 2008-03-13 21:12:29 UTC (rev 5127)
@@ -19,10 +19,10 @@
import time
import urllib
+import config
import pywikibot
from pywikibot import login
-
lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
@@ -94,7 +94,7 @@
if "format" not in kwargs:
self.params["format"] = "json"
if "maxlag" not in kwargs:
- self.params["maxlag"] = "5" # replace with configurable constant?
+ self.params["maxlag"] = str(config.maxlag)
self.update(**kwargs)
# implement dict interface
@@ -229,6 +229,7 @@
# following "if" is used for testing with plugged-in data; it wouldn't
# be needed for actual usage
if not hasattr(self, "data"):
+ site.get_throttle()
self.data = self.request.submit()
if not self.data or not isinstance(self.data, dict):
raise StopIteration
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-03-13 12:48:27 UTC (rev 5126)
+++ branches/rewrite/pywikibot/site.py 2008-03-13 21:12:29 UTC (rev 5127)
@@ -11,7 +11,9 @@
__version__ = '$Id: $'
import pywikibot
+from pywikibot.throttle import Throttle
from pywikibot.data import api
+import config
import os
import threading
@@ -99,6 +101,14 @@
self._mutex = threading.Lock()
self._locked_pages = []
+ pt_min = min(config.minthrottle, config.put_throttle)
+ self.put_throttle = Throttle(pt_min, config.maxthrottle)
+ self.put_throttle.setDelay(config.put_throttle)
+
+ gt_min = min(config.minthrottle, config.get_throttle)
+ self.get_throttle = Throttle(gt_min, config.maxthrottle)
+ self.get_throttle.setDelay(config.get_throttle)
+
def family(self):
"""Return the associated Family object."""
return self._family
@@ -188,7 +198,7 @@
finally:
self._mutex.release()
-
+
class APISite(BaseSite):
"""API interface to MediaWiki site.
@@ -500,7 +510,7 @@
self.getsiteinfo()
return self._namespaces
- def namespace(self, num, all = False):
+ def namespace(self, num, all=False):
"""Return string containing local name of namespace 'num'.
If optional argument 'all' is true, return a list of all recognized
@@ -528,9 +538,15 @@
in this list.
"""
+ if 'bot' in self.getuserinfo()['groups']:
+ limit = 5000
+ else:
+ limit = 500
+ if followRedirects:
+ limit = limit / 2
bltitle = page.title(withSection=False)
blgen = api.PageGenerator("backlinks", gbltitle=bltitle,
- gbllimit="5000")
+ gbllimit=str(limit))
if namespaces is not None:
blgen.request["gblnamespace"] = u"|".join(unicode(ns)
for ns in namespaces)
Modified: branches/rewrite/pywikibot/tests/page_tests.py
===================================================================
--- branches/rewrite/pywikibot/tests/page_tests.py 2008-03-13 12:48:27 UTC (rev 5126)
+++ branches/rewrite/pywikibot/tests/page_tests.py 2008-03-13 21:12:29 UTC (rev 5127)
@@ -48,7 +48,7 @@
u"Hispanic (U.S. Census)" : u"Hispanic (U.S. Census)",
u"Stołpce" : u"Stołpce",
u"Nowy_Sącz" : u"Nowy Sącz",
- u"battle of Węgierska Górka" : u"Battle of Węgierska Górka",
+ u"battle of Węgierska Górka" : u"Battle of Węgierska Górka",
}
# random bunch of possible section titles
sections = [u"",
@@ -69,7 +69,17 @@
site)
self.assertEqual(m.namespace, num)
+ def testTitles(self):
+ """Test that Link() normalizes titles"""
+ for title in self.titles:
+ for num in (0, 1):
+ l = pywikibot.page.Link(self.namespaces[num][0]+title)
+ self.assertEqual(l.title, self.titles[title])
+ # prefixing name with ":" shouldn't change result
+ m = pywikibot.page.Link(":"+self.namespaces[num][0]+title)
+ self.assertEqual(m.title, self.titles[title])
+
class TestPageObject(unittest.TestCase):
def testSite(self):
"""Test site() method"""
Added: branches/rewrite/pywikibot/throttle.py
===================================================================
--- branches/rewrite/pywikibot/throttle.py (rev 0)
+++ branches/rewrite/pywikibot/throttle.py 2008-03-13 21:12:29 UTC (rev 5127)
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+"""
+Mechanics to slow down wiki read and/or write rate.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+import config
+import pywikibot
+
+import logging
+import threading
+import time
+
+pid = False # global process identifier
+ # Don't check for other processes unless this is set
+
+
+class Throttle(object):
+ """Control rate of access to wiki server
+
+ Calling this object blocks the calling thread until at least 'delay'
+ seconds have passed since the previous call.
+
+ Each Site initiates two Throttle objects: get_throttle to control
+ the rate of read access, and put_throttle to control the rate of write
+ access. These are available as the Site.get_throttle and Site.put_throttle
+ objects.
+
+ """
+ def __init__(self, mindelay=config.minthrottle,
+ maxdelay=config.maxthrottle,
+ multiplydelay=True):
+ self.lock = threading.RLock()
+ self.mindelay = mindelay
+ self.maxdelay = maxdelay
+ self.now = 0
+ self.next_multiplicity = 1.0
+ self.checkdelay = 240 # Check logfile again after this many seconds
+ self.dropdelay = 360 # Ignore processes that have not made
+ # a check in this many seconds
+ self.releasepid = 1800 # Free the process id after this many seconds
+ self.lastwait = 0.0
+ self.delay = 0
+ if multiplydelay:
+ self.checkMultiplicity()
+ self.setDelay(mindelay)
+
+ def logfn(self):
+ return config.datafilepath('throttle.log')
+
+ def checkMultiplicity(self):
+ global pid
+ self.lock.acquire()
+ logging.debug("Checking multiplicity: pid = %s" % pid)
+ try:
+ processes = {}
+ my_pid = 1
+ count = 1
+ try:
+ f = open(self.logfn(), 'r')
+ except IOError:
+ if not pid:
+ pass
+ else:
+ raise
+ else:
+ now = time.time()
+ for line in f.readlines():
+ try:
+ line = line.split(' ')
+ this_pid = int(line[0])
+ ptime = int(line[1].split('.')[0])
+ if now - ptime <= self.releasepid:
+ if now - ptime <= self.dropdelay \
+ and this_pid != pid:
+ count += 1
+ processes[this_pid] = ptime
+ if this_pid >= my_pid:
+ my_pid = this_pid+1
+ except (IndexError, ValueError):
+ pass # Sometimes the file gets corrupted
+ # ignore that line
+
+ if not pid:
+ pid = my_pid
+ self.checktime = time.time()
+ processes[pid] = self.checktime
+ f = open(self.logfn(), 'w')
+ for p in processes.keys():
+ f.write(str(p)+' '+str(processes[p])+'\n')
+ f.close()
+ self.process_multiplicity = count
+ pywikibot.output(
+ u"Found %s processes running, including the current process."
+ % count)
+ finally:
+ self.lock.release()
+
+ def setDelay(self, delay=config.minthrottle, absolute=False):
+ """Set the nominal delay in seconds."""
+ self.lock.acquire()
+ try:
+ if absolute:
+ self.maxdelay = delay
+ self.mindelay = delay
+ self.delay = delay
+ # Start the delay count now, not at the next check
+ self.now = time.time()
+ finally:
+ self.lock.release()
+
+ def getDelay(self):
+ """Return the actual delay, accounting for multiple processes.
+
+ This value is the maximum wait between reads/writes, not taking
+ account of how much time has elapsed since the last access.
+
+ """
+ global pid
+ thisdelay = self.delay
+ if pid: # If set, we're checking for multiple processes
+ if time.time() > self.checktime + self.checkdelay:
+ self.checkMultiplicity()
+ if thisdelay < (self.mindelay * self.next_multiplicity):
+ thisdelay = self.mindelay * self.next_multiplicity
+ elif thisdelay > self.maxdelay:
+ thisdelay = self.maxdelay
+ thisdelay *= self.process_multiplicity
+ return thisdelay
+
+ def waittime(self):
+ """Return waiting time in seconds if a query would be made right
now"""
+ # Take the previous requestsize in account calculating the desired
+ # delay this time
+ thisdelay = self.getDelay()
+ now = time.time()
+ ago = now - self.now
+ if ago < thisdelay:
+ delta = thisdelay - ago
+ return delta
+ else:
+ return 0.0
+
+ def drop(self):
+ """Remove me from the list of running bots
processes."""
+ self.checktime = 0
+ processes = {}
+ try:
+ f = open(self.logfn(), 'r')
+ except IOError:
+ return
+ else:
+ now = time.time()
+ for line in f.readlines():
+ try:
+ line = line.split(' ')
+ this_pid = int(line[0])
+ ptime = int(line[1].split('.')[0])
+ if now - ptime <= self.releasepid and this_pid != pid:
+ processes[this_pid] = ptime
+ except (IndexError,ValueError):
+ pass # Sometimes the file gets corrupted - ignore that line
+ f = open(self.logfn(), 'w')
+ for p in processes.keys():
+ f.write(str(p)+' '+str(processes[p])+'\n')
+ f.close()
+
+ def __call__(self, requestsize=1):
+ """
+ Block the calling program if the throttle time has not expired.
+
+ Parameter requestsize is the number of Pages to be read/written;
+ multiply delay time by an appropriate factor.
+ """
+ self.lock.acquire()
+ try:
+ waittime = self.waittime()
+ # Calculate the multiplicity of the next delay based on how
+ # big the request is that is being posted now.
+ # We want to add "one delay" for each factor of two in the
+ # size of the request. Getting 64 pages at once allows 6 times
+ # the delay time for the server.
+ self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
+ # Announce the delay if it exceeds a preset limit
+ if waittime > config.noisysleep:
+ pywikibot.output(u"Sleeping for %.1f seconds, %s"
+ % (waittime,
+ time.strftime("%Y-%m-%d %H:%M:%S",
+ time.localtime()))
+ )
+ time.sleep(waittime)
+ self.now = time.time()
+ finally:
+ self.lock.release()
+