Revision: 7998 Author: xqt Date: 2010-03-13 11:03:04 +0000 (Sat, 13 Mar 2010)
Log Message: ----------- Activate the Throttle class from the library. This increases the put_throttle wait time when dealing with the same default site.
Modified Paths: -------------- trunk/pywikipedia/config.py trunk/pywikipedia/pywikibot/__init__.py trunk/pywikipedia/pywikibot/throttle.py trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/config.py =================================================================== --- trunk/pywikipedia/config.py 2010-03-13 10:04:49 UTC (rev 7997) +++ trunk/pywikipedia/config.py 2010-03-13 11:03:04 UTC (rev 7998) @@ -47,7 +47,7 @@ account_global = False
# Solve captchas in the webbrowser. Setting this to False will result in the -# exception CaptchaError be thrown if a captcha is encountered. +# exception CaptchaError being thrown if a captcha is encountered. solve_captcha = True
# Some sites will require password identication to access the HTML pages at @@ -64,7 +64,9 @@ # 2. You must use the hostname of the site, not its family/language pair authenticate = {}
+# # Security Connection for Wikimedia Projects +# SSL_connection = False
# password_file = ".passwd" @@ -130,7 +132,7 @@ # Currently only works if interface 'terminal' is set. transliterate = True
-# Should the system bell be rung if the bot expects user input? +# Should the system bell ring if the bot expects user input? ring_bell = False
# Colorization can be used to markup important text parts of the output. @@ -155,7 +157,7 @@ # The command for the editor you want to use. If set to None, a simple Tkinter # editor will be used. # On Windows systems, this script tries to determine the default text editor. -if __sys.platform=='win32': +if __sys.platform == 'win32': try: import _winreg _key1 = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER, 'Software\Microsoft\Windows\CurrentVersion\Explorer\FileExts.txt\OpenWithProgids') @@ -163,11 +165,12 @@ _key2 = _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '%s\shell\open\command' % _progID) _cmd = _winreg.QueryValueEx(_key2, None)[0] editor = _cmd.replace('%1', '') - # Notepad is even worse than our Tkinter editor. Nobody has - # deserved to use it. + # Notepad is even worse than our Tkinter editor. + # Nobody has deserved to use it. if editor.lower().endswith('notepad.exe'): editor = None except: + # XXX what are we catching here? #raise editor = None else: @@ -267,19 +270,21 @@ # but never more than 'maxthrottle' seconds. However - if you are running # more than one bot in parallel the times are lengthened. minthrottle = 1 -maxthrottle = 10 +maxthrottle = 60
-# Slow down the robot such that it never makes a second change within +# Slow down the robot such that it never makes a second page edit within # 'put_throttle' seconds. put_throttle = 10 + # Sometimes you want to know when a delay is inserted. If a delay is larger # than 'noisysleep' seconds, it is logged on the screen. noisysleep = 3.0
# Defer bot edits during periods of database server lag. For details, see # http://www.mediawiki.org/wiki/Maxlag_parameter -# You can set this variable to a number of seconds, or to None to disable -# this behavior. +# You can set this variable to a number of seconds, or to None (or 0) to +# disable this behavior. Higher values are more aggressive in seeking +# access to the wiki. # It is recommended that you do not change this parameter unless you know # what you are doing and have a good reason for it! maxlag = 5 @@ -465,25 +470,25 @@ # ============================ # System-level and User-level changes. # Store current variables and their types. -_glv={} +_glv = {} _glv.update(globals()) -_gl=_glv.keys() -_tp={} +_gl = _glv.keys() +_tp = {} for _key in _gl: - if _key[0]!='_': - _tp[_key]=type(globals()[_key]) + if _key[0] != '_': + _tp[_key] = type(globals()[_key])
# Get the user files -_thislevel=0 -_fns=[os.path.join(_base_dir, "user-config.py")] +_thislevel = 0 +_fns = [os.path.join(_base_dir, "user-config.py")] for _filename in _fns: _thislevel += 1 if os.path.exists(_filename): - _filestatus=os.stat(_filename) - _filemode=_filestatus[0] - _fileuid=_filestatus[4] - if (__sys.platform=='win32' or _fileuid==os.getuid() or _fileuid==0): - if __sys.platform=='win32' or _filemode&002==0: + _filestatus = os.stat(_filename) + _filemode = _filestatus[0] + _fileuid = _filestatus[4] + if __sys.platform == 'win32' or _fileuid in [os.getuid(), 0]: + if __sys.platform == 'win32' or _filemode & 002 == 0: execfile(_filename) else: print "WARNING: Skipped '%s': writeable by others."%_filename @@ -507,13 +512,13 @@ print "WARNING: Type of '%s' changed"%_key print " Was: ",ot print " Now: ",nt - del nt,ot + del nt, ot else: print "WARNING: Configuration variable %r is defined but unknown. Misspelled?" %_key
# Fix up default console_encoding if console_encoding is None: - if __sys.platform=='win32': + if __sys.platform == 'win32': console_encoding = 'cp850' else: console_encoding = 'iso-8859-1' @@ -562,23 +567,22 @@ # # When called as main program, list all configuration variables # -if __name__=="__main__": +if __name__ == "__main__": import types - _all=1 + _all = 1 for _arg in __sys.argv[1:]: - if _arg=="modified": - _all=0 + if _arg == "modified": + _all = 0 else: print "Unknown arg %s ignored"%_arg - _k=globals().keys() + _k = globals().keys() _k.sort() for _name in _k: - if _name[0]!='_': + if _name[0] != '_': if not type(globals()[_name]) in [types.FunctionType, types.ModuleType]: - if _all or _glv[_name]!=globals()[_name]: - print _name,"=",repr(globals()[_name]) + if _all or _glv[_name] != globals()[_name]: + print _name, "=", repr(globals()[_name])
- # cleanup all locally-defined variables
for __var in globals().keys():
Modified: trunk/pywikipedia/pywikibot/__init__.py =================================================================== --- trunk/pywikipedia/pywikibot/__init__.py 2010-03-13 10:04:49 UTC (rev 7997) +++ trunk/pywikipedia/pywikibot/__init__.py 2010-03-13 11:03:04 UTC (rev 7998) @@ -13,6 +13,7 @@
from exceptions import * from textlib import * +from throttle import *
import wikipedia
Modified: trunk/pywikipedia/pywikibot/throttle.py =================================================================== --- trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 10:04:49 UTC (rev 7997) +++ trunk/pywikipedia/pywikibot/throttle.py 2010-03-13 11:03:04 UTC (rev 7998) @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Mechanics to slow down wiki page download rate. +Mechanics to slow down wiki read and/or write rate. """ # # (C) Pywikipedia bot team, 2008 @@ -34,7 +34,7 @@
""" def __init__(self, mindelay=None, maxdelay=None, writedelay=None, - multiplydelay=True, verbosedelay=False): + multiplydelay=True, verbosedelay=False, write=False): self.lock = threading.RLock() self.mysite = None self.ctrlfilename = config.datafilepath('pywikibot', 'throttle.ctrl') @@ -62,6 +62,7 @@ if self.multiplydelay: self.checkMultiplicity() self.setDelay() + self.write = write
def checkMultiplicity(self): """Count running processes for site and set process_multiplicity.""" @@ -225,10 +226,15 @@
Parameter requestsize is the number of Pages to be read/written; multiply delay time by an appropriate factor. + + Because this seizes the throttle lock, it will prevent any other + thread from writing to the same site the script started with + until the wait expires. + """ self.lock.acquire() try: - wait = self.waittime(write=write) + wait = self.waittime(write=write or self.write) # Calculate the multiplicity of the next delay based on how # big the request is that is being posted now. # We want to add "one delay" for each factor of two in the @@ -236,13 +242,15 @@ # the delay time for the server. self.next_multiplicity = math.log(1+requestsize)/math.log(2.0) # Announce the delay if it exceeds a preset limit - if wait > config.noisysleep or pywikibot.verbose: - pywikibot.output(u"Sleeping for %(wait).1f seconds, %(now)s" - % {'wait': wait, - 'now' : time.strftime("%Y-%m-%d %H:%M:%S", - time.localtime()) - } ) - time.sleep(wait) + if wait > 0: + if wait > config.noisysleep or pywikibot.verbose: + pywikibot.output( + u"Sleeping for %(wait).1f seconds, %(now)s" + % {'wait': wait, + 'now' : time.strftime("%Y-%m-%d %H:%M:%S", + time.localtime()) + } ) + time.sleep(wait) if write: self.last_write = time.time() else:
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2010-03-13 10:04:49 UTC (rev 7997) +++ trunk/pywikipedia/wikipedia.py 2010-03-13 11:03:04 UTC (rev 7998) @@ -4118,176 +4118,6 @@ # Default User-agent setUserAgent('PythonWikipediaBot/1.0')
-# Mechanics to slow down page download rate. -class Throttle(object): - """For internal use only - control rate of access to wiki server - - Calling this object blocks the calling thread until at least 'delay' - seconds have passed since the previous call. - - The framework initiates two Throttle objects: get_throttle to control - the rate of read access, and put_throttle to control the rate of write - access. - - """ - def __init__(self, mindelay=config.minthrottle, - maxdelay=config.maxthrottle, - multiplydelay=True): - self.lock = threading.RLock() - self.mindelay = mindelay - self.maxdelay = maxdelay - self.now = 0 - self.pid = False # If self.pid remains False, we're not checking for multiple processes - self.next_multiplicity = 1.0 - self.checkdelay = 240 # Check the file with processes again after this many seconds - self.dropdelay = 360 # Drop processes from the list that have not made a check in this many seconds - self.releasepid = 1200 # Free the process id - self.lastwait = 0.0 - self.delay = 0 - self.multiplydelay = multiplydelay - if self.multiplydelay: - self.checkMultiplicity() - self.setDelay(mindelay) - - def logfn(self): - return config.datafilepath('pywikibot', 'throttle.ctrl') - - def checkMultiplicity(self): - self.lock.acquire() - try: - processes = {} - my_pid = 1 - count = 1 - try: - f = open(self.logfn(), 'r') - except IOError: - if not self.pid: - pass - else: - raise - else: - now = time.time() - for line in f.readlines(): - try: - line = line.split(' ') - pid = int(line[0]) - ptime = int(line[1].split('.')[0]) - if now - ptime <= self.releasepid: - if now - ptime <= self.dropdelay and pid != self.pid: - count += 1 - processes[pid] = ptime - if pid >= my_pid: - my_pid = pid+1 - except (IndexError,ValueError): - pass # Sometimes the file gets corrupted - ignore that line - - if not self.pid: - self.pid = my_pid - self.checktime = time.time() - processes[self.pid] = self.checktime - try: - f = open(self.logfn(), 'w') - for p in processes: 
- f.write(str(p)+' '+str(processes[p])+'\n') - except IOError: - pass - f.close() - self.process_multiplicity = count - if verbose: - output(u"Checked for running processes. %s processes currently running, including the current process." % count) - finally: - self.lock.release() - - def setDelay(self, delay = config.minthrottle, absolute = False): - self.lock.acquire() - try: - if absolute: - self.maxdelay = delay - self.mindelay = delay - self.delay = delay - # Don't count the time we already waited as part of our waiting time :-0 - self.now = time.time() - finally: - self.lock.release() - - def getDelay(self): - thisdelay = self.delay - if self.multiplydelay: # If self.pid, we're checking for multiple processes - if time.time() > self.checktime + self.checkdelay: - self.checkMultiplicity() - if thisdelay < (self.mindelay * self.next_multiplicity): - thisdelay = self.mindelay * self.next_multiplicity - elif thisdelay > self.maxdelay: - thisdelay = self.maxdelay - thisdelay *= self.process_multiplicity - return thisdelay - - def waittime(self): - """Calculate the time in seconds we will have to wait if a query - would be made right now""" - # Take the previous requestsize in account calculating the desired - # delay this time - thisdelay = self.getDelay() - now = time.time() - ago = now - self.now - if ago < thisdelay: - delta = thisdelay - ago - return delta - else: - return 0.0 - - def drop(self): - """Remove me from the list of running bots processes.""" - self.checktime = 0 - processes = {} - try: - f = open(self.logfn(), 'r') - except IOError: - return - else: - now = time.time() - for line in f.readlines(): - try: - line = line.split(' ') - pid = int(line[0]) - ptime = int(line[1].split('.')[0]) - if now - ptime <= self.releasepid and pid != self.pid: - processes[pid] = ptime - except (IndexError,ValueError): - pass # Sometimes the file gets corrupted - ignore that line - try: - f = open(self.logfn(), 'w') - for p in processes: - f.write(str(p)+' 
'+str(processes[p])+'\n') - except IOError: - pass - f.close() - - def __call__(self, requestsize=1): - """ - Block the calling program if the throttle time has not expired. - - Parameter requestsize is the number of Pages to be read/written; - multiply delay time by an appropriate factor. - """ - self.lock.acquire() - try: - waittime = self.waittime() - # Calculate the multiplicity of the next delay based on how - # big the request is that is being posted now. - # We want to add "one delay" for each factor of two in the - # size of the request. Getting 64 pages at once allows 6 times - # the delay time for the server. - self.next_multiplicity = math.log(1+requestsize)/math.log(2.0) - # Announce the delay if it exceeds a preset limit - if waittime > config.noisysleep: - output(u"Sleeping for %.1f seconds, %s" % (waittime, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))) - time.sleep(waittime) - self.now = time.time() - finally: - self.lock.release() - -# end of category specific code def url2link(percentname, insite, site): """Convert urlname of a wiki page into interwiki link format.
@@ -7378,9 +7208,11 @@ elif arg.startswith('-lang:'): default_code = arg[6:] elif arg.startswith('-putthrottle:'): - put_throttle.setDelay(int(arg[13:]), absolute = True) + config.put_throttle = int(arg[len("-putthrottle:") : ]) + put_throttle.setDelay() elif arg.startswith('-pt:'): - put_throttle.setDelay(int(arg[4:]), absolute = True) + config.put_throttle = int(arg[len("-pt:") : ]) + put_throttle.setDelay() elif arg == '-log': setLogfileStatus(True) elif arg.startswith('-log:'): @@ -7768,8 +7600,8 @@ f.close() output( u'ERROR: %s caused error %s. Dump %s created.' % (name,error,filename) )
-get_throttle = Throttle(config.minthrottle,config.maxthrottle) -put_throttle = Throttle(config.put_throttle,config.put_throttle,multiplydelay=False) +get_throttle = Throttle() +put_throttle = Throttle(write=True)
def decompress_gzip(data): # Use cStringIO if available
pywikipedia-svn@lists.wikimedia.org