Revision: 6138
Author: russblau
Date: 2008-12-11 20:50:17 +0000 (Thu, 11 Dec 2008)
Log Message:
-----------
Add touch.py as a demonstration of how to write a bot under the new framework, and clean
up bugs found during testing.
Modified Paths:
--------------
branches/rewrite/pywikibot/__init__.py
branches/rewrite/pywikibot/bot.py
branches/rewrite/pywikibot/pagegenerators.py
branches/rewrite/pywikibot/throttle.py
Added Paths:
-----------
branches/rewrite/pywikibot/scripts/touch.py
Modified: branches/rewrite/pywikibot/__init__.py
===================================================================
--- branches/rewrite/pywikibot/__init__.py 2008-12-10 19:03:37 UTC (rev 6137)
+++ branches/rewrite/pywikibot/__init__.py 2008-12-11 20:50:17 UTC (rev 6138)
@@ -159,23 +159,22 @@
"""
global stopped
- if stopped:
- return
logger = logging.getLogger("wiki")
-
- logger.debug("stopme() called")
- count = sum(1 for thd in threadpool if thd.isAlive())
- if count:
- logger.info("Waiting for about %(count)s pages to be saved."
- % locals())
- for thd in threadpool:
- if thd.isAlive():
- thd.join()
+
+ if not stopped:
+ logger.debug("stopme() called")
+ count = sum(1 for thd in threadpool if thd.isAlive())
+ if count:
+ logger.info("Waiting for about %(count)s pages to be saved."
+ % locals())
+ for thd in threadpool:
+ if thd.isAlive():
+ thd.join()
+ stopped = True
# only need one drop() call because all throttles use the same global pid
try:
_sites[_sites.keys()[0]].throttle.drop()
logger.info("Dropped throttle(s).")
- stopped = True
except IndexError:
pass
Modified: branches/rewrite/pywikibot/bot.py
===================================================================
--- branches/rewrite/pywikibot/bot.py 2008-12-10 19:03:37 UTC (rev 6137)
+++ branches/rewrite/pywikibot/bot.py 2008-12-11 20:50:17 UTC (rev 6138)
@@ -14,6 +14,7 @@
# scripts, instead of writing each one from scratch.
+import logging
import os.path
import sys
import pywikibot
@@ -108,8 +109,9 @@
return nonGlobalArgs
-def showHelp():
- moduleName = calledModuleName()
+def showHelp(name=""):
+ # argument, if given, is ignored
+ module = calledModuleName()
globalHelp =u'''\
Global arguments available for all bots:
@@ -144,15 +146,14 @@
-v debugging.
'''
try:
- exec('import %s as module' % moduleName)
+ exec('import %s as module' % module)
helpText = module.__doc__.decode('utf-8')
if hasattr(module, 'docuReplacements'):
for key, value in module.docuReplacements.iteritems():
helpText = helpText.replace(key, value.strip('\n\r'))
pywikibot.output(helpText)
except:
- pywikibot.output(u'Sorry, no help available for %s' % moduleName)
+ if module:
+ pywikibot.output(u'Sorry, no help available for %s' % module)
logging.exception('showHelp:')
pywikibot.output(globalHelp)
-
-
Modified: branches/rewrite/pywikibot/pagegenerators.py
===================================================================
--- branches/rewrite/pywikibot/pagegenerators.py 2008-12-10 19:03:37 UTC (rev 6137)
+++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-11 20:50:17 UTC (rev 6138)
@@ -654,7 +654,7 @@
yield page.toggleTalkPage()
-def PreloadingGenerator(self, generator, pageNumber=60, lookahead=10):
+def PreloadingGenerator(generator, pageNumber=60, lookahead=10):
"""Yield preloaded pages taken from another generator."""
# pages may be on more than one site, for example if an interwiki
@@ -663,8 +663,8 @@
# build a list of pages for each site found in the iterator
for page in generator:
sites.setdefault(page.site(), []).append(page)
- return itertools.chain(site.preloadpages(sites[site], pageNumber)
- for site in sites)
+ return itertools.chain(*(site.preloadpages(sites[site], pageNumber)
+ for site in sites))
#TODO below
Added: branches/rewrite/pywikibot/scripts/touch.py
===================================================================
--- branches/rewrite/pywikibot/scripts/touch.py (rev 0)
+++ branches/rewrite/pywikibot/scripts/touch.py 2008-12-11 20:50:17 UTC (rev 6138)
@@ -0,0 +1,96 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""This bot goes over multiple pages of a wiki, and edits them without
+changing. This is for example used to get category links in templates
+working.
+
+This script understands various command-line arguments:
+
+&params;
+
+-redir           specifies that the robot should touch redirect pages;
+                 otherwise, they will be skipped.
+
+All other parameters will be regarded as a page title; in this case, the bot
+will only touch a single page.
+"""
+
+__version__='$Id: touch.py,v 1.13 2006/03/01 14:07:06 russblau Exp $'
+
+import pywikibot
+from pywikibot import pagegenerators, catlib, config
+import sys
+
+docuReplacements = {'&params;': pagegenerators.parameterHelp}
+
+
+class TouchBot:
+ def __init__(self, generator, touch_redirects):
+ self.generator = generator
+ self.touch_redirects = touch_redirects
+
+ def run(self):
+ for page in self.generator:
+ try:
+ # get the page, and save it using the unmodified text.
+ # whether or not getting a redirect throws an exception
+ # depends on the variable self.touch_redirects.
+ text = page.get(get_redirect = self.touch_redirects)
+ page.save("Pywikibot touch script")
+ except pywikibot.NoPage:
+ print "Page %s does not exist?!" % page.aslink()
+ except pywikibot.IsRedirectPage:
+ print "Page %s is a redirect; skipping." % page.aslink()
+ except pywikibot.LockedPage:
+ print "Page %s is locked?!" % page.aslink()
+
+
+def main(*args):
+ global bot
+ # Disable cosmetic changes because we don't want to modify any page
+ # content, so that we don't flood the histories with minor changes.
+ config.cosmetic_changes = False
+ #page generator
+ gen = None
+ genFactory = pagegenerators.GeneratorFactory()
+ redirs = False
+ namespaces = []
+ # If the user chooses to work on a single page, this temporary array is
+ # used to read the words from the page title. The words will later be
+ # joined with spaces to retrieve the full title.
+ pageTitle = []
+ for arg in pywikibot.handleArgs(*args):
+ if arg == '-redir':
+ redirs = True
+ elif arg.startswith('-namespace:'):
+ try:
+ namespaces.append(int(arg[11:]))
+ except ValueError:
+ namespaces.append(arg[11:])
+ else:
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ else:
+ pageTitle.append(arg)
+
+ if pageTitle:
+ # work on a single page
+ page = pywikibot.Page(pywikibot.Link(' '.join(pageTitle)))
+ gen = iter([page])
+ if not gen:
+ pywikibot.showHelp()
+ else:
+ if namespaces:
+ gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
+ preloadingGen = pagegenerators.PreloadingGenerator(gen)
+ bot = TouchBot(preloadingGen, redirs)
+ bot.run()
+
+
+if __name__ == "__main__":
+ try:
+ main()
+ finally:
+ pywikibot.stopme()
Modified: branches/rewrite/pywikibot/throttle.py
===================================================================
--- branches/rewrite/pywikibot/throttle.py 2008-12-10 19:03:37 UTC (rev 6137)
+++ branches/rewrite/pywikibot/throttle.py 2008-12-11 20:50:17 UTC (rev 6138)
@@ -17,10 +17,12 @@
import threading
import time
-logger = logging.getLogger("wiki")
+logger = logging.getLogger("wiki.throttle")
-pid = False # global process identifier
- # Don't check for other processes unless this is set
+pid = False # global process identifier
+ # when the first Throttle is instantiated, it will set this
+ # variable to a positive integer, which will apply to all
+ # throttle objects created by this process.
class Throttle(object):
@@ -33,23 +35,25 @@
rate of access.
"""
- def __init__(self, site, mindelay=config.minthrottle,
- maxdelay=config.maxthrottle,
- writedelay=config.put_throttle,
- multiplydelay=True, verbosedelay=False):
+ def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None,
+ multiplydelay=True, verbosedelay=False):
self.lock = threading.RLock()
self.mysite = str(site)
self.logfn = config.datafilepath('throttle.log')
self.mindelay = mindelay
+ if self.mindelay is None:
+ self.mindelay = config.minthrottle
self.maxdelay = maxdelay
+ if self.maxdelay is None:
+ self.maxdelay = config.maxthrottle
self.writedelay = writedelay
self.last_read = 0
self.last_write = 0
self.next_multiplicity = 1.0
self.checkdelay = 300 # Check logfile again after this many seconds
- self.dropdelay = 750 # Ignore processes that have not made
+ self.dropdelay = 600 # Ignore processes that have not made
# a check in this many seconds
- self.releasepid = 1800 # Free the process id after this many seconds
+ self.releasepid = 1200 # Free the process id after this many seconds
self.lastwait = 0.0
self.delay = 0
self.verbosedelay = verbosedelay
@@ -58,13 +62,16 @@
self.setDelays()
def checkMultiplicity(self):
+ """Count running processes for site and set process_multiplicity."""
global pid
self.lock.acquire()
+ mysite = self.mysite
logger.debug("Checking multiplicity: pid = %(pid)s" % globals())
try:
processes = []
- my_pid = 1
+ my_pid = pid or 1 # start at 1 if global pid not yet set
count = 1
+ # open throttle.log
try:
f = open(self.logfn, 'r')
except IOError:
@@ -75,6 +82,7 @@
else:
now = time.time()
for line in f.readlines():
+ # parse line; format is "pid timestamp site"
try:
line = line.split(' ')
this_pid = int(line[0])
@@ -86,7 +94,7 @@
if now - ptime > self.releasepid:
continue # process has expired, drop from file
if now - ptime <= self.dropdelay \
- and this_site == self.mysite \
+ and this_site == mysite \
and this_pid != pid:
count += 1
if this_site != self.mysite or this_pid != pid:
@@ -94,14 +102,14 @@
'time': ptime,
'site': this_site})
if not pid and this_pid >= my_pid:
- my_pid = this_pid+1
+ my_pid = this_pid+1 # next unused process id
if not pid:
pid = my_pid
self.checktime = time.time()
- processes.append({'pid': my_pid,
+ processes.append({'pid': pid,
'time': self.checktime,
- 'site': self.mysite})
+ 'site': mysite})
f = open(self.logfn, 'w')
processes.sort(key=lambda p:(p['pid'], p['site']))
for p in processes:
@@ -110,7 +118,7 @@
self.process_multiplicity = count
if self.verbosedelay:
logger.info(
-u"Found %(count)s processes running, including the current process."
+u"Found %(count)s %(mysite)s processes running, including this one."
% locals())
finally:
self.lock.release()
@@ -119,10 +127,11 @@
"""Set the nominal delays in seconds. Defaults to config values."""
self.lock.acquire()
try:
+ maxdelay = self.maxdelay
if delay is None:
delay = self.mindelay
if writedelay is None:
- writedelay = self.writedelay
+ writedelay = config.put_throttle
if absolute:
self.maxdelay = delay
self.mindelay = delay
@@ -173,7 +182,8 @@
return 0.0
def drop(self):
- """Remove me from the list of running bots processes."""
+ """Remove me from the list of running bot processes."""
+ # drop all throttles with this process's pid, regardless of site
self.checktime = 0
processes = []
try: