Revision: 5852 Author: balasyum Date: 2008-08-27 14:44:46 +0000 (Wed, 27 Aug 2008)
Log Message: ----------- Add a feature to maintainer.py that checks recent changes for bad words (new module censure.py); other minor modifications.
Modified Paths: -------------- trunk/pywikipedia/maintainer.py trunk/pywikipedia/rciw.py
Added Paths: ----------- trunk/pywikipedia/censure.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Added: trunk/pywikipedia/censure.py
# --- trunk/pywikipedia/censure.py (rev 0)
# +++ trunk/pywikipedia/censure.py 2008-08-27 14:44:46 UTC (rev 5852)

"""Bad word checker bot
Should not be run manually/directly, but automatically by maintainer.py
Warning: experimental software, use at your own risk
"""

__version__ = '$Id$'

# Author: Balasyum
# http://hu.wikipedia.org/wiki/User:Balasyum

import wikipedia
import sys
import thread

# The page, where the bot logs to

logPages = {
    'hu': u'Wikipédia:Cenzúra',
    }

# To add a new language, create or find the bad word page
# similarly to the 'hu' one (one word per line, starting with <pre> and
# ending with </pre> lines), and add to the badWordList lines below.

badWordList = {
    'hu': u'User:Cenzúrabot/lista',
    }

# NOTE: everything below down to the first "def" runs at import time, because
# maintainer.py imports this module and calls checkPage() directly.
site = wikipedia.getSite()
if site.language() not in badWordList or site.language() not in logPages:
    wikipedia.output("Error: your language isn't supported, see the source code for further details")
    sys.exit(1)
ownWordPage = wikipedia.Page(site, badWordList[site.language()])
try:
    ownWordList = ownWordPage.get(get_redirect=True)
except wikipedia.NoPage:
    wikipedia.output("Error: the page containing the bad word list of your language doesn't exist")
    sys.exit(1)
# Drop the first and last lines (the <pre> / </pre> wrappers).  Slicing, unlike
# two "del" statements, cannot raise IndexError on a one-line page.  Blank
# lines are skipped: an empty word would turn the search pattern into two
# consecutive spaces and flag almost every page.
ownWordList = [word for word in ownWordList.split('\n')[1:-1] if word]


def seekbpos(str1, str2):
    """Return the index of the first position where str1 and str2 differ.

    If one string is a prefix of the other, the length of the shorter string
    is returned.
    """
    # Bound the scan by the shorter string so that a shorter str2 cannot
    # raise IndexError.
    limit = min(len(str1), len(str2))
    i = 0
    while i < limit and str1[i] == str2[i]:
        i += 1
    return i


def seekepos(str1, str2, bpos):
    """Return the end index (exclusive) in str2 of the text that differs.

    Both strings are walked backwards in lockstep from their last characters.
    The walk stops when it reaches index bpos of str1 (bpos being the value
    seekbpos() returned for the same pair) or at the first mismatch.

    Returns -1 when either string is exhausted before any stop condition is
    met (for instance when str1 is empty).
    """
    i1 = len(str1) - 1
    i2 = len(str2) - 1
    while i1 > -1 and i2 > -1:
        if i1 == bpos:
            return i2
        elif i1 < bpos or str1[i1] != str2[i2]:
            return i2 + 1
        i1 -= 1
        i2 -= 1
    return -1


def checkPage(title, onlyLastDiff=False):
    """Check one page against the bad word list and log any match.

    title        -- title of the page to check; the log page itself is
                    never checked.
    onlyLastDiff -- if True and the current text is longer than the previous
                    revision, only the part that differs from the previous
                    revision is searched; otherwise the whole text is.

    A word matches only when it is surrounded by single spaces.  On a match a
    line is prepended to the log page of the current language.
    """
    if title == logPages[site.language()]:
        return
    # The collapsed source showed a raw line break inside this literal; it is
    # restored here as a single space.
    wikipedia.output('Checking ' + title + ' for bad word list')
    page = wikipedia.Page(site, title)
    try:
        text = page.get()
        if onlyLastDiff:
            oldver = page.getOldVersion(page.previousRevision())
            if len(text) > len(oldver):
                bpos = seekbpos(oldver, text)
                epos = seekepos(oldver, text, bpos)
                if epos == -1:
                    # Only reachable here when oldver is empty: the whole
                    # text is new, so do not let -1 chop the last character.
                    epos = len(text)
                text = text[bpos:epos]
    except wikipedia.NoPage:
        wikipedia.output('Page ' + title + " doesn't exist, skipping")
        return
    except wikipedia.IsRedirectPage:
        wikipedia.output('Page ' + title + ' is a redirect, skipping')
        return

    wordsIn = [badWord for badWord in ownWordList
               if text.find(' ' + badWord + ' ') != -1]
    if not wordsIn:
        wikipedia.output(title + " doesn't match any of the bad word list")
        return

    logPage = wikipedia.Page(site, logPages[site.language()])
    try:
        log = logPage.get()
    except wikipedia.NoPage:
        # The log page does not exist yet: start an empty one.  The previous
        # bare "except: pass" left "log" unbound and caused a NameError below.
        log = u''
    wikipedia.output(title + ' matches the bad word list')
    log = '* [' + page.permalink() + ' ' + title + '] - ' + ' '.join(wordsIn) + '\n' + log
    logPage.put(log, title)


def main():
    """Command line entry point: check the page named by the arguments."""
    wikipedia.output('Warning: this script should not be run manually/directly, but automatically by maintainer.py')
    if len(sys.argv) == 1:
        wikipedia.output("Usage: censure.py <article title>")
        sys.exit(1)
    # Join all arguments so that a title with spaces needs no quoting; slice
    # instead of deleting sys.argv[0] to leave the global argv untouched.
    checkPage(' '.join(sys.argv[1:]).decode('utf-8'))


if __name__ == "__main__":
    main()
Modified: trunk/pywikipedia/maintainer.py =================================================================== --- trunk/pywikipedia/maintainer.py 2008-08-27 13:41:53 UTC (rev 5851) +++ trunk/pywikipedia/maintainer.py 2008-08-27 14:44:46 UTC (rev 5852) @@ -3,37 +3,81 @@ """ A wiki-maintainer script that shares tasks between workers, requires no intervention.
-Note: the script requires the Python IRC library http://python-irclib.sourceforge.net/ +This script requires the Python IRC library http://python-irclib.sourceforge.net/ + +Warning: experimental software, use at your own risk """ __version__ = '$Id$'
# Author: Balasyum # http://hu.wikipedia.org/wiki/User:Balasyum -# License : LGPL
from ircbot import SingleServerIRCBot from irclib import nm_to_n import random import wikipedia +import thread import threading import time import rciw +import censure
ver = 1
+site = wikipedia.getSite() +site.forceLogin() + +class rcFeeder(SingleServerIRCBot): + def __init__(self, channel, nickname, server, port=6667): + SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname) + self.channel = channel + self.rcbot = rciw.IWRCBot(site) + self.tasks = [] + + def on_nicknameinuse(self, c, e): + c.nick(c.get_nickname() + "_") + + def on_welcome(self, c, e): + c.join(self.channel) + + def on_privmsg(self, c, e): + pass + + def on_pubmsg(self, c, e): + try: + msg = unicode(e.arguments()[0],'utf-8') + except UnicodeDecodeError: + return + name = msg[8:msg.find(u'14',9)] + if 'rciw' in self.tasks: + self.rcbot.addQueue(name) + if 'censure' in self.tasks: + thread.start_new_thread(censure.checkPage, (name, True)) + + def on_dccmsg(self, c, e): + pass + + def on_dccchat(self, c, e): + pass + + def on_quit(self, e, cmd): + pass + class MaintcontBot(SingleServerIRCBot): def __init__(self, nickname, server, port=6667): SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname) - self.rct = threading.Thread(target=rciw.main) - self.rct.setDaemon(True) - self.rciwrunning = False + feederThread = threading.Thread(target=self.feederBot) + feederThread.setDaemon(True) + feederThread.start()
+ def feederBot(self): + self.feed = rcFeeder('#' + site.language() + '.' + site.family.name, site.loggedInAs(), "irc.wikimedia.org") + self.feed.start() + def on_nicknameinuse(self, c, e): c.nick("mainter" + str(random.randrange(100, 999)))
def on_welcome(self, c, e): - site = wikipedia.getSite() - site.forceLogin() self.connection.privmsg("maintcont", "workerjoin " + site.language() + '.' + site.family.name + ' ' + str(ver))
def on_privmsg(self, c, e): @@ -43,17 +87,9 @@ do = cmd.split() if do[0] == "accepted": print "Joined the network" - t = threading.Thread(target=self.activator) - t.setDaemon(True) - t.start() + thread.start_new_thread(self.activator,()) elif do[0] == "tasklist" and len(do) > 1: - tasks = do[1].split('|') - if 'rciw' in do[1]: - self.rct.start() - self.rciwrunning = True - if (not 'rciw' in do[1]) and self.rciwrunning: - self.rct.join(0) - self.rciwrunning = False + self.feed.tasks = do[1].split('|')
def on_dccmsg(self, c, e): pass @@ -66,9 +102,17 @@ self.connection.privmsg("maintcont", "active") time.sleep(10)
-def main(): - bot = MaintcontBot("mainter" + str(random.randrange(100, 999)), "irc.freenode.net") - bot.start() +class Maintainer: + def __init__(self): + controllThread = threading.Thread(target=self.controllBot) + controllThread.setDaemon(True) + controllThread.start() + while True: + raw_input()
+ def controllBot(self): + bot = MaintcontBot("mainter" + str(random.randrange(100, 999)), "irc.freenode.net") + bot.start() + if __name__ == "__main__": - main() \ No newline at end of file + Maintainer()
Modified: trunk/pywikipedia/rciw.py =================================================================== --- trunk/pywikipedia/rciw.py 2008-08-27 13:41:53 UTC (rev 5851) +++ trunk/pywikipedia/rciw.py 2008-08-27 14:44:46 UTC (rev 5852) @@ -4,6 +4,8 @@ A simple IRC script to check for Recent Changes through IRC, and to check for interwikis in those recently modified articles.
+Can not be run manually/directly, but automatically by maintainer.py + In use on hu:, not sure if this scales well on a large wiki such as en: (Depending on the edit rate, the number of IW threads could grow continuously without ever decreasing) @@ -12,6 +14,7 @@
-safe Does not handle the same page more than once in a session
+Warning: experimental software, use at your own risk """ __version__ = '$Id$'
@@ -19,8 +22,6 @@ # http://hu.wikipedia.org/wiki/User:Kisbes # License : GFDL
-from ircbot import SingleServerIRCBot -from irclib import nm_to_n, nm_to_h, irc_lower, ip_numstr_to_quad, ip_quad_to_numstr import interwiki import threading import re @@ -28,10 +29,8 @@ import time from Queue import Queue
-class IWRCBot(SingleServerIRCBot): - def __init__(self, site, channel, nickname, server, port, safe): - SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname) - self.channel = channel +class IWRCBot(): + def __init__(self, site, safe = True): self.other_ns = re.compile(u'14[[07(' + u'|'.join(site.namespaces()) + u')') interwiki.globalvar.autonomous = True self.site = site @@ -52,27 +51,9 @@ bot.queryStep() self.queue.task_done()
- def join(self): - self.queue.join() - - def on_nicknameinuse(self, c, e): - c.nick(c.get_nickname() + "_") - - def on_welcome(self, c, e): - c.join(self.channel) - - def on_privmsg(self, c, e): - pass - - def on_pubmsg(self, c, e): - try: - msg = unicode(e.arguments()[0],'utf-8') - except UnicodeDecodeError: + def addQueue(self, name): + if self.other_ns.match(name): return - if self.other_ns.match(msg): - return - - name = msg[8:msg.find(u'14',9)] if self.safe: if name in self.processed: return @@ -82,35 +63,8 @@ # it is a simple atomic append(), no need to acquire a semaphore self.queue.put_nowait(page)
- def on_dccmsg(self, c, e): - pass - - def on_dccchat(self, c, e): - pass - - def do_command(self, e, cmd): - pass - - def on_quit(self, e, cmd): - pass - def main(): - safe = False - for arg in wikipedia.handleArgs(): - if arg == 'safe': - safe = True - site = wikipedia.getSite() - site.forceLogin() - chan = '#' + site.language() + '.' + site.family.name - bot = IWRCBot(site, chan, site.loggedInAs(), "irc.wikimedia.org", 6667, safe) - try: - bot.start() - except: - # Quit IRC - bot.disconnect() - # Join the IW threads - bot.join() - raise + wikipedia.output('Warning: this script can not be run manually/directly, but automatically by maintainer.py')
if __name__ == "__main__": main()