Revision: 6138 Author: russblau Date: 2008-12-11 20:50:17 +0000 (Thu, 11 Dec 2008)
Log Message: ----------- Add touch.py as a demonstration of how to write a bot under the new framework, and clean up bugs found during testing.
Modified Paths: -------------- branches/rewrite/pywikibot/__init__.py branches/rewrite/pywikibot/bot.py branches/rewrite/pywikibot/pagegenerators.py branches/rewrite/pywikibot/throttle.py
Added Paths: ----------- branches/rewrite/pywikibot/scripts/touch.py
Modified: branches/rewrite/pywikibot/__init__.py =================================================================== --- branches/rewrite/pywikibot/__init__.py 2008-12-10 19:03:37 UTC (rev 6137) +++ branches/rewrite/pywikibot/__init__.py 2008-12-11 20:50:17 UTC (rev 6138) @@ -159,23 +159,22 @@
""" global stopped - if stopped: - return logger = logging.getLogger("wiki") - - logger.debug("stopme() called") - count = sum(1 for thd in threadpool if thd.isAlive()) - if count: - logger.info("Waiting for about %(count)s pages to be saved." - % locals()) - for thd in threadpool: - if thd.isAlive(): - thd.join() + + if not stopped: + logger.debug("stopme() called") + count = sum(1 for thd in threadpool if thd.isAlive()) + if count: + logger.info("Waiting for about %(count)s pages to be saved." + % locals()) + for thd in threadpool: + if thd.isAlive(): + thd.join() + stopped = True # only need one drop() call because all throttles use the same global pid try: _sites[_sites.keys()[0]].throttle.drop() logger.info("Dropped throttle(s).") - stopped = True except IndexError: pass
Modified: branches/rewrite/pywikibot/bot.py =================================================================== --- branches/rewrite/pywikibot/bot.py 2008-12-10 19:03:37 UTC (rev 6137) +++ branches/rewrite/pywikibot/bot.py 2008-12-11 20:50:17 UTC (rev 6138) @@ -14,6 +14,7 @@ # scripts, instead of writing each one from scratch.
+import logging import os.path import sys import pywikibot @@ -108,8 +109,9 @@ return nonGlobalArgs
-def showHelp(): - moduleName = calledModuleName() +def showHelp(name=""): + # argument, if given, is ignored + module = calledModuleName() globalHelp =u'''\ Global arguments available for all bots:
@@ -144,15 +146,14 @@ -v debugging. ''' try: - exec('import %s as module' % moduleName) + exec('import %s as module' % module) helpText = module.__doc__.decode('utf-8') if hasattr(module, 'docuReplacements'): for key, value in module.docuReplacements.iteritems(): helpText = helpText.replace(key, value.strip('\n\r')) pywikibot.output(helpText) except: - pywikibot.output(u'Sorry, no help available for %s' % moduleName) + if module: + pywikibot.output(u'Sorry, no help available for %s' % module) logging.exception('showHelp:') pywikibot.output(globalHelp) - -
Modified: branches/rewrite/pywikibot/pagegenerators.py =================================================================== --- branches/rewrite/pywikibot/pagegenerators.py 2008-12-10 19:03:37 UTC (rev 6137) +++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-11 20:50:17 UTC (rev 6138) @@ -654,7 +654,7 @@ yield page.toggleTalkPage()
-def PreloadingGenerator(self, generator, pageNumber=60, lookahead=10): +def PreloadingGenerator(generator, pageNumber=60, lookahead=10): """Yield preloaded pages taken from another generator."""
# pages may be on more than one site, for example if an interwiki @@ -663,8 +663,8 @@ # build a list of pages for each site found in the iterator for page in generator: sites.setdefault(page.site(), []).append(page) - return itertools.chain(site.preloadpages(sites[site], pageNumber) - for site in sites) + return itertools.chain(*(site.preloadpages(sites[site], pageNumber) + for site in sites))
#TODO below
Added: branches/rewrite/pywikibot/scripts/touch.py =================================================================== --- branches/rewrite/pywikibot/scripts/touch.py (rev 0) +++ branches/rewrite/pywikibot/scripts/touch.py 2008-12-11 20:50:17 UTC (rev 6138) @@ -0,0 +1,96 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +"""This bot goes over multiple pages of a wiki, and edits them without +changing. This is for example used to get category links in templates +working. + +This script understands various command-line arguments: + +&params; + +-redir specifies that the robot should touch redirect pages; + otherwise, they will be skipped. + +All other parameters will be regarded as a page title; in this case, the bot +will only touch a single page. +""" + +__version__='$Id: touch.py,v 1.13 2006/03/01 14:07:06 russblau Exp $' + +import pywikibot +from pywikibot import pagegenerators, catlib, config +import sys + +docuReplacements = {'&params;': pagegenerators.parameterHelp} + + +class TouchBot: + def __init__(self, generator, touch_redirects): + self.generator = generator + self.touch_redirects = touch_redirects + + def run(self): + for page in self.generator: + try: + # get the page, and save it using the unmodified text. + # whether or not getting a redirect throws an exception + # depends on the variable self.touch_redirects. + text = page.get(get_redirect = self.touch_redirects) + page.save("Pywikibot touch script") + except pywikibot.NoPage: + print "Page %s does not exist?!" % page.aslink() + except pywikibot.IsRedirectPage: + print "Page %s is a redirect; skipping." % page.aslink() + except pywikibot.LockedPage: + print "Page %s is locked?!" % page.aslink() + + +def main(*args): + global bot + # Disable cosmetic changes because we don't want to modify any page + # content, so that we don't flood the histories with minor changes. 
+ config.cosmetic_changes = False + #page generator + gen = None + genFactory = pagegenerators.GeneratorFactory() + redirs = False + namespaces = [] + # If the user chooses to work on a single page, this temporary array is + # used to read the words from the page title. The words will later be + # joined with spaces to retrieve the full title. + pageTitle = [] + for arg in pywikibot.handleArgs(*args): + if arg == '-redir': + redirs = True + elif arg.startswith('-namespace:'): + try: + namespaces.append(int(arg[11:])) + except ValueError: + namespaces.append(arg[11:]) + else: + generator = genFactory.handleArg(arg) + if generator: + gen = generator + else: + pageTitle.append(arg) + + if pageTitle: + # work on a single page + page = pywikibot.Page(pywikibot.Link(' '.join(pageTitle))) + gen = iter([page]) + if not gen: + pywikibot.showHelp() + else: + if namespaces: + gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) + preloadingGen = pagegenerators.PreloadingGenerator(gen) + bot = TouchBot(preloadingGen, redirs) + bot.run() + + +if __name__ == "__main__": + try: + main() + finally: + pywikibot.stopme()
Modified: branches/rewrite/pywikibot/throttle.py =================================================================== --- branches/rewrite/pywikibot/throttle.py 2008-12-10 19:03:37 UTC (rev 6137) +++ branches/rewrite/pywikibot/throttle.py 2008-12-11 20:50:17 UTC (rev 6138) @@ -17,10 +17,12 @@ import threading import time
-logger = logging.getLogger("wiki") +logger = logging.getLogger("wiki.throttle")
-pid = False # global process identifier - # Don't check for other processes unless this is set +pid = False # global process identifier + # when the first Throttle is instantiated, it will set this + # variable to a positive integer, which will apply to all + # throttle objects created by this process.
class Throttle(object): @@ -33,23 +35,25 @@ rate of access.
""" - def __init__(self, site, mindelay=config.minthrottle, - maxdelay=config.maxthrottle, - writedelay=config.put_throttle, - multiplydelay=True, verbosedelay=False): + def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None, + multiplydelay=True, verbosedelay=False): self.lock = threading.RLock() self.mysite = str(site) self.logfn = config.datafilepath('throttle.log') self.mindelay = mindelay + if self.mindelay is None: + self.mindelay = config.minthrottle self.maxdelay = maxdelay + if self.maxdelay is None: + self.maxdelay = config.maxthrottle self.writedelay = writedelay self.last_read = 0 self.last_write = 0 self.next_multiplicity = 1.0 self.checkdelay = 300 # Check logfile again after this many seconds - self.dropdelay = 750 # Ignore processes that have not made + self.dropdelay = 600 # Ignore processes that have not made # a check in this many seconds - self.releasepid = 1800 # Free the process id after this many seconds + self.releasepid = 1200 # Free the process id after this many seconds self.lastwait = 0.0 self.delay = 0 self.verbosedelay = verbosedelay @@ -58,13 +62,16 @@ self.setDelays()
def checkMultiplicity(self): + """Count running processes for site and set process_multiplicity.""" global pid self.lock.acquire() + mysite = self.mysite logger.debug("Checking multiplicity: pid = %(pid)s" % globals()) try: processes = [] - my_pid = 1 + my_pid = pid or 1 # start at 1 if global pid not yet set count = 1 + # open throttle.log try: f = open(self.logfn, 'r') except IOError: @@ -75,6 +82,7 @@ else: now = time.time() for line in f.readlines(): + # parse line; format is "pid timestamp site" try: line = line.split(' ') this_pid = int(line[0]) @@ -86,7 +94,7 @@ if now - ptime > self.releasepid: continue # process has expired, drop from file if now - ptime <= self.dropdelay \ - and this_site == self.mysite \ + and this_site == mysite \ and this_pid != pid: count += 1 if this_site != self.mysite or this_pid != pid: @@ -94,14 +102,14 @@ 'time': ptime, 'site': this_site}) if not pid and this_pid >= my_pid: - my_pid = this_pid+1 + my_pid = this_pid+1 # next unused process id
if not pid: pid = my_pid self.checktime = time.time() - processes.append({'pid': my_pid, + processes.append({'pid': pid, 'time': self.checktime, - 'site': self.mysite}) + 'site': mysite}) f = open(self.logfn, 'w') processes.sort(key=lambda p:(p['pid'], p['site'])) for p in processes: @@ -110,7 +118,7 @@ self.process_multiplicity = count if self.verbosedelay: logger.info( -u"Found %(count)s processes running, including the current process." +u"Found %(count)s %(mysite)s processes running, including this one." % locals()) finally: self.lock.release() @@ -119,10 +127,11 @@ """Set the nominal delays in seconds. Defaults to config values.""" self.lock.acquire() try: + maxdelay = self.maxdelay if delay is None: delay = self.mindelay if writedelay is None: - writedelay = self.writedelay + writedelay = config.put_throttle if absolute: self.maxdelay = delay self.mindelay = delay @@ -173,7 +182,8 @@ return 0.0
def drop(self): - """Remove me from the list of running bots processes.""" + """Remove me from the list of running bot processes.""" + # drop all throttles with this process's pid, regardless of site self.checktime = 0 processes = [] try:
pywikipedia-l@lists.wikimedia.org