Revision: 5852
Author: balasyum
Date: 2008-08-27 14:44:46 +0000 (Wed, 27 Aug 2008)
Log Message:
-----------
Added to maintainer.py the feature to censor recent changes for bad words, plus other
minor modifications.
Modified Paths:
--------------
trunk/pywikipedia/maintainer.py
trunk/pywikipedia/rciw.py
Added Paths:
-----------
trunk/pywikipedia/censure.py
Added: trunk/pywikipedia/censure.py
===================================================================
--- trunk/pywikipedia/censure.py (rev 0)
+++ trunk/pywikipedia/censure.py 2008-08-27 14:44:46 UTC (rev 5852)
@@ -0,0 +1,114 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""Bad word checker bot
+Should not be run manually/directly, but automatically by maintainer.py
+Warning: experimental software, use at your own risk
+"""
+
+__version__ = '$Id$'
+
+# Author: Balasyum
+# http://hu.wikipedia.org/wiki/User:Balasyum
+
+import wikipedia
+import sys
+import thread
+
+# The page, where the bot logs to
+
+logPages = {
+ 'hu': u'Wikipédia:Cenzúra',
+ }
+
+# To add a new language, create or find the bad word page
+# similarly to the 'hu' one (one word per line, starting with <pre> and ending with </pre> lines),
+# and add to the badWordList lines below.
+
+badWordList = {
+ 'hu': u'User:Cenzúrabot/lista',
+ }
+
+site = wikipedia.getSite()
+if not badWordList.has_key(site.language()) or not logPages.has_key(site.language()):
+ wikipedia.output('Error: your language isn\'t supported, see the source code
for further details')
+ sys.exit(1)
+ownWordPage = wikipedia.Page(site, badWordList[site.language()])
+try:
+ ownWordList = ownWordPage.get(get_redirect = True)
+except wikipedia.NoPage:
+ wikipedia.output('Error: the page containing the bad word list of your language
doesn\'t exist')
+ sys.exit(1)
+ownWordList = ownWordList.split('\n')
+del ownWordList[0]
+del ownWordList[len(ownWordList) - 1]
+
+def seekbpos(str1, str2):
+ i = 0
+ while i < len(str1):
+ if str1[i] != str2[i]:
+ return i
+ i += 1
+ return i
+
+def seekepos(str1, str2, bpos):
+ i1 = len(str1) - 1
+ i2 = len(str2) - 1
+ while i1 > -1 and i2 > -1:
+ if i1 == bpos:
+ return i2
+ elif i1 < bpos or str1[i1] != str2[i2]:
+ return i2 + 1
+ i1 -= 1
+ i2 -= 1
+ return -1
+
+def checkPage(title, onlyLastDiff = False):
+ if title == logPages[site.language()]:
+ return
+ wikipedia.output('Checking ' + title + ' for bad word list')
+ page = wikipedia.Page(site, title)
+ try:
+ text = page.get()
+ if onlyLastDiff:
+ oldver = page.getOldVersion(page.previousRevision())
+ if len(text) > len(oldver):
+ bpos = seekbpos(oldver, text)
+ epos = seekepos(oldver, text, bpos)
+ diff = text[bpos:epos]
+ text = diff
+ except wikipedia.NoPage:
+ wikipedia.output('Page ' + title + ' doesn\'t exist,
skipping')
+ return
+ except wikipedia.IsRedirectPage:
+ wikipedia.output('Page ' + title + ' is a redirect, skipping')
+ return
+
+ report = False
+ wordsIn = []
+ for badWord in ownWordList:
+ if text.find(' ' + badWord + ' ') != -1:
+ wordsIn.append(badWord)
+ report = True
+ if report:
+ logPage = wikipedia.Page(site, logPages[site.language()])
+ try:
+ log = logPage.get()
+ except:
+ pass
+ wikipedia.output(title + ' matches the bad word list')
+ log = '* [' + page.permalink()+ ' ' + title + '] - ' +
' '.join(wordsIn) + '\n' + log
+ logPage.put(log, title)
+ else:
+ wikipedia.output(title + ' doesn\'t match any of the bad word list')
+
+def main():
+ wikipedia.output('Warning: this script should not be run manually/directly, but
automatically by maintainer.py')
+ if len(sys.argv) == 1:
+ wikipedia.output("Usage: censure.py <article title>")
+ sys.exit(1)
+ del sys.argv[0]
+ checkPage(' '.join(sys.argv).decode('utf-8'))
+
+if __name__ == "__main__":
+ main()
Modified: trunk/pywikipedia/maintainer.py
===================================================================
--- trunk/pywikipedia/maintainer.py 2008-08-27 13:41:53 UTC (rev 5851)
+++ trunk/pywikipedia/maintainer.py 2008-08-27 14:44:46 UTC (rev 5852)
@@ -3,37 +3,81 @@
"""
A wiki-maintainer script that shares tasks between workers, requires no intervention.
-Note: the script requires the Python IRC library http://python-irclib.sourceforge.net/
+This script requires the Python IRC library http://python-irclib.sourceforge.net/
+
+Warning: experimental software, use at your own risk
"""
__version__ = '$Id$'
# Author: Balasyum
# http://hu.wikipedia.org/wiki/User:Balasyum
-# License : LGPL
from ircbot import SingleServerIRCBot
from irclib import nm_to_n
import random
import wikipedia
+import thread
import threading
import time
import rciw
+import censure
ver = 1
+site = wikipedia.getSite()
+site.forceLogin()
+
+class rcFeeder(SingleServerIRCBot):
+ def __init__(self, channel, nickname, server, port=6667):
+ SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
+ self.channel = channel
+ self.rcbot = rciw.IWRCBot(site)
+ self.tasks = []
+
+ def on_nicknameinuse(self, c, e):
+ c.nick(c.get_nickname() + "_")
+
+ def on_welcome(self, c, e):
+ c.join(self.channel)
+
+ def on_privmsg(self, c, e):
+ pass
+
+ def on_pubmsg(self, c, e):
+ try:
+ msg = unicode(e.arguments()[0],'utf-8')
+ except UnicodeDecodeError:
+ return
+ name = msg[8:msg.find(u'14',9)]
+ if 'rciw' in self.tasks:
+ self.rcbot.addQueue(name)
+ if 'censure' in self.tasks:
+ thread.start_new_thread(censure.checkPage, (name, True))
+
+ def on_dccmsg(self, c, e):
+ pass
+
+ def on_dccchat(self, c, e):
+ pass
+
+ def on_quit(self, e, cmd):
+ pass
+
class MaintcontBot(SingleServerIRCBot):
def __init__(self, nickname, server, port=6667):
SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
- self.rct = threading.Thread(target=rciw.main)
- self.rct.setDaemon(True)
- self.rciwrunning = False
+ feederThread = threading.Thread(target=self.feederBot)
+ feederThread.setDaemon(True)
+ feederThread.start()
+ def feederBot(self):
+ self.feed = rcFeeder('#' + site.language() + '.' +
site.family.name, site.loggedInAs(), "irc.wikimedia.org")
+ self.feed.start()
+
def on_nicknameinuse(self, c, e):
c.nick("mainter" + str(random.randrange(100, 999)))
def on_welcome(self, c, e):
- site = wikipedia.getSite()
- site.forceLogin()
self.connection.privmsg("maintcont", "workerjoin " +
site.language() + '.' + site.family.name + ' ' + str(ver))
def on_privmsg(self, c, e):
@@ -43,17 +87,9 @@
do = cmd.split()
if do[0] == "accepted":
print "Joined the network"
- t = threading.Thread(target=self.activator)
- t.setDaemon(True)
- t.start()
+ thread.start_new_thread(self.activator,())
elif do[0] == "tasklist" and len(do) > 1:
- tasks = do[1].split('|')
- if 'rciw' in do[1]:
- self.rct.start()
- self.rciwrunning = True
- if (not 'rciw' in do[1]) and self.rciwrunning:
- self.rct.join(0)
- self.rciwrunning = False
+ self.feed.tasks = do[1].split('|')
def on_dccmsg(self, c, e):
pass
@@ -66,9 +102,17 @@
self.connection.privmsg("maintcont", "active")
time.sleep(10)
-def main():
- bot = MaintcontBot("mainter" + str(random.randrange(100, 999)),
"irc.freenode.net")
- bot.start()
+class Maintainer:
+ def __init__(self):
+ controllThread = threading.Thread(target=self.controllBot)
+ controllThread.setDaemon(True)
+ controllThread.start()
+ while True:
+ raw_input()
+ def controllBot(self):
+ bot = MaintcontBot("mainter" + str(random.randrange(100, 999)),
"irc.freenode.net")
+ bot.start()
+
if __name__ == "__main__":
- main()
\ No newline at end of file
+ Maintainer()
Modified: trunk/pywikipedia/rciw.py
===================================================================
--- trunk/pywikipedia/rciw.py 2008-08-27 13:41:53 UTC (rev 5851)
+++ trunk/pywikipedia/rciw.py 2008-08-27 14:44:46 UTC (rev 5852)
@@ -4,6 +4,8 @@
A simple IRC script to check for Recent Changes through IRC,
and to check for interwikis in those recently modified articles.
+Can not be run manually/directly, but automatically by maintainer.py
+
In use on hu:, not sure if this scales well on a large wiki such
as en: (Depending on the edit rate, the number of IW threads
could grow continuously without ever decreasing)
@@ -12,6 +14,7 @@
-safe Does not handle the same page more than once in a session
+Warning: experimental software, use at your own risk
"""
__version__ = '$Id$'
@@ -19,8 +22,6 @@
# http://hu.wikipedia.org/wiki/User:Kisbes
# License : GFDL
-from ircbot import SingleServerIRCBot
-from irclib import nm_to_n, nm_to_h, irc_lower, ip_numstr_to_quad, ip_quad_to_numstr
import interwiki
import threading
import re
@@ -28,10 +29,8 @@
import time
from Queue import Queue
-class IWRCBot(SingleServerIRCBot):
- def __init__(self, site, channel, nickname, server, port, safe):
- SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
- self.channel = channel
+class IWRCBot():
+ def __init__(self, site, safe = True):
self.other_ns = re.compile(u'14\[\[07(' +
u'|'.join(site.namespaces()) + u')')
interwiki.globalvar.autonomous = True
self.site = site
@@ -52,27 +51,9 @@
bot.queryStep()
self.queue.task_done()
- def join(self):
- self.queue.join()
-
- def on_nicknameinuse(self, c, e):
- c.nick(c.get_nickname() + "_")
-
- def on_welcome(self, c, e):
- c.join(self.channel)
-
- def on_privmsg(self, c, e):
- pass
-
- def on_pubmsg(self, c, e):
- try:
- msg = unicode(e.arguments()[0],'utf-8')
- except UnicodeDecodeError:
+ def addQueue(self, name):
+ if self.other_ns.match(name):
return
- if self.other_ns.match(msg):
- return
-
- name = msg[8:msg.find(u'14',9)]
if self.safe:
if name in self.processed:
return
@@ -82,35 +63,8 @@
# it is a simple atomic append(), no need to acquire a semaphore
self.queue.put_nowait(page)
- def on_dccmsg(self, c, e):
- pass
-
- def on_dccchat(self, c, e):
- pass
-
- def do_command(self, e, cmd):
- pass
-
- def on_quit(self, e, cmd):
- pass
-
def main():
- safe = False
- for arg in wikipedia.handleArgs():
- if arg == 'safe':
- safe = True
- site = wikipedia.getSite()
- site.forceLogin()
- chan = '#' + site.language() + '.' + site.family.name
- bot = IWRCBot(site, chan, site.loggedInAs(), "irc.wikimedia.org", 6667,
safe)
- try:
- bot.start()
- except:
- # Quit IRC
- bot.disconnect()
- # Join the IW threads
- bot.join()
- raise
+ wikipedia.output('Warning: this script can not be run manually/directly, but
automatically by maintainer.py')
if __name__ == "__main__":
main()