Revision: 4023
Author: btongminh
Date: 2007-08-09 20:51:47 +0000 (Thu, 09 Aug 2007)

Log Message:
-----------
Make the replacer site independent and rename it.
Modified Paths:
--------------
    trunk/pywikipedia/delinker.txt

Added Paths:
-----------
    trunk/pywikipedia/image_replacer.py

Removed Paths:
-------------
    trunk/pywikipedia/replacer.py
Modified: trunk/pywikipedia/delinker.txt =================================================================== --- trunk/pywikipedia/delinker.txt 2007-08-09 19:13:07 UTC (rev 4022) +++ trunk/pywikipedia/delinker.txt 2007-08-09 20:51:47 UTC (rev 4023) @@ -104,6 +104,8 @@ give the bot commands. * ''clean_list = False'': Auto clean the list. Only use this if the bot actually has edit permissions to the list. +* ''disallowed_replacements = [(r'.png$', r'.svg$')]'': List of regular expressions + of refused replacements.
==== SQL settings ==== * ''sql_engine = "mysql"'': Database engine to use. Currently supported: @@ -208,6 +210,7 @@ # Auto clean the list. Only use this if the bot actually # has edit permissions to the list. CommonsDelinker['clean_list'] = False +CommonsDelinker['disallowed_replacements'] = [(r'.png$', r'.svg$')]
## SQL connection information. # Database engine to use. Currently supported: MySQL, sqlite3.
Copied: trunk/pywikipedia/image_replacer.py (from rev 4022, trunk/pywikipedia/replacer.py) =================================================================== --- trunk/pywikipedia/image_replacer.py (rev 0) +++ trunk/pywikipedia/image_replacer.py 2007-08-09 20:51:47 UTC (rev 4023) @@ -0,0 +1,133 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +Please refer to delinker.txt for full documentation. +""" +# +# +# (C) Bryan Tong Minh, 2007 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' +import config, wikipedia +import re, time + +from delinker import wait_callback, output, connect_database + +def mw_timestamp(ts): + return '%s%s%s%s-%s%s-%s%sT%s%s:%s%s:%s%sZ' % tuple(ts) +DB_TS = re.compile('[^0-9]') +def db_timestamp(ts): + return DB_TS.sub('', ts) +IMG_NS = re.compile(r'(?i)^\s*Image:') +def strip_image(img): + img = IMG_NS.sub('', img) + img = img.replace(' ', '_') + img = img[0].upper() + img[1:] + return img.strip() + +class Replacer(object): + def __init__(self): + self.config = config.CommonsDelinker + self.config.update(getattr(config, 'Replacer', ())) + self.template = re.compile(r'{{%s|([^|]*?)|([^|]*?)(?:(?:|reason=(.*?))?)}}' % \ + self.config['replace_template']) + self.disallowed_replacements = [(re.compile(i[0]), re.compile(i[1])) + for i in self.config.get('disallowed_replacements', ())] + + self.site = wikipedia.getSite() + + self.database = connect_database() + self.cursor = self.database.cursor() + + def read_replace_log(self): + # FIXME: Make sqlite3 compatible + insert = """INSERT INTO %s (timestamp, old_image, new_image, + status, user, comment) VALUES (%%s, %%s, %%s, + 'pending', %%s, %%s)""" % self.config['replacer_table'] + + page = wikipedia.Page(self.site, self.config['command_page']) + + # Get last revision date + if self.cursor.execute("""SELECT timestamp FROM %s + ORDER BY timestamp DESC LIMIT 1""" % \ + self.config['replacer_table']): + since = mw_timestamp(self.cursor.fetchone()[0]) + else: + since = 
None + + try: + revisions = page.fullVersionHistory(max = 500, since = since) + # Fetch the page any way, to prevent editconflicts + old_text = text = page.get() + except StandardError, e: + # Network error, not critical + output(u'Warning! Unable to read replacement log.', False) + output('%s: %s' % (e.__class__.__name__, str(e)), False) + return time.sleep(self.config['timeout']) + + revisions.sort(key = lambda rev: rev[0]) + replacements = self.template.finditer(text) + + if self.config.get('clean_list', False): + username = config.sysopnames[self.site.family.name][self.site.lang] + else: + username = None + + for replacement in replacements: + res = self.examine_revision_history( + revisions, replacement, username) + if res and self.allowed_replacement(replacement): + self.cursor.execute(insert, res) + text = text.replace(replacement.group(0), '') + output('Replacing %s by %s: %s' % replacement.groups()) + self.database.commit() + + if text != old_text and self.config.get('clean_list', False): + page.put(text.strip(), comment = 'Removing images being processed') + + def examine_revision_history(self, revisions, replacement, username): + if replacement.group(0) in revisions[0][2]: + return (db_timestamp(revisions[0][0]), + strip_image(replacement.group(1)), + strip_image(replacement.group(2)), + '<Unknown>', replacement.group(3)) + + for timestamp, user, text in revisions[1:]: + if replacement.group(0) in text and user != username: + return (db_timestamp(timestamp), + strip_image(replacement.group(1)), + strip_image(replacement.group(2)), + user, replacement.group(3)) + + output('Warning! 
Could not find out who did %s' % \ + repr(replacement.group(0)), False) + return + + def start(self): + while True: + self.read_replace_log() + # Replacer should not loop as often as delinker + time.sleep(self.config['timeout'] * 2) + + def allowed_replacement(self, replacement): + for source, target in self.disallowed_replacements: + if source.search(replacement.group(1)) and \ + target.search(replacement.group(2)): + return False + return True + +if __name__ == '__main__': + import sys, cgitb + try: + # FIXME: Add support for single-process replacer. + r = Replacer() + r.start() + except StandardError, e: + if type(e) not in (SystemExit, KeyboardInterrupt): + output('A critical error has occured! Aborting!') + print >>sys.stderr, cgitb.text(sys.exc_info()) + except: + pass + wikipedia.stopme() \ No newline at end of file
Deleted: trunk/pywikipedia/replacer.py =================================================================== --- trunk/pywikipedia/replacer.py 2007-08-09 19:13:07 UTC (rev 4022) +++ trunk/pywikipedia/replacer.py 2007-08-09 20:51:47 UTC (rev 4023) @@ -1,119 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -""" -Please refer to delinker.txt for full documentation. -""" -# -# -# (C) Bryan Tong Minh, 2007 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id$' -import config, wikipedia -import re, time - -from delinker import wait_callback, output, connect_database - -def mw_timestamp(ts): - return '%s%s%s%s-%s%s-%s%sT%s%s:%s%s:%s%sZ' % tuple(ts) -DB_TS = re.compile('[^0-9]') -def db_timestamp(ts): - return DB_TS.sub('', ts) -IMG_NS = re.compile(r'(?i)^\s*Image:') -def strip_image(img): - img = IMG_NS.sub('', img) - img = img.replace(' ', '_') - img = img[0].upper() + img[1:] - return img.strip() - -class Replacer(object): - def __init__(self): - self.config = config.CommonsDelinker - self.config.update(getattr(config, 'Replacer', ())) - self.template = re.compile(r'{{%s|([^|]*?)|([^|]*?)(?:(?:|reason=(.*?))?)}}' % \ - self.config['template']) - self.site = wikipedia.getSite() - - self.database = connect_database() - self.cursor = self.database.cursor() - - def read_replace_log(self): - # FIXME: Make sqlite3 compatible - insert = """INSERT INTO %s (timestamp, old_image, new_image, - status, user, comment) VALUES (%%s, %%s, %%s, - 'pending', %%s, %%s)""" % self.config['replacer_table'] - - page = wikipedia.Page(self.site, self.config['command_page']) - - # Get last revision date - if self.cursor.execute("""SELECT timestamp FROM %s - ORDER BY timestamp DESC LIMIT 1""" % \ - self.config['replacer_table']): - since = mw_timestamp(self.cursor.fetchone()[0]) - else: - since = None - - try: - revisions = page.fullVersionHistory(max = 500, since = since) - # Fetch the page any way, to prevent editconflicts - old_text = text = page.get() - except 
StandardError, e: - # Network error, not critical - output(u'Warning! Unable to read replacement log.', False) - output('%s: %s' % (e.__class__.__name__, str(e)), False) - return time.sleep(self.config['timeout']) - - revisions.sort(key = lambda rev: rev[0]) - replacements = self.template.finditer(text) - - for replacement in replacements: - res = self.examine_revision_history( - revisions, replacement, - config.sysopnames['commons']['commons']) - if res: - self.cursor.execute(insert, res) - text = text.replace(replacement.group(0), '') - output('Replacing %s by %s: %s' % replacement.groups()) - self.database.commit() - - if text != old_text and self.config.get('clean_list', False): - page.put(text.strip(), comment = 'Removing images being processed') - - def examine_revision_history(self, revisions, replacement, username): - if replacement.group(0) in revisions[0][2]: - return (db_timestamp(revisions[0][0]), - strip_image(replacement.group(1)), - strip_image(replacement.group(2)), - '<Unknown>', replacement.group(3)) - - for timestamp, user, text in revisions[1:]: - if replacement.group(0) in text and user != username: - return (db_timestamp(timestamp), - strip_image(replacement.group(1)), - strip_image(replacement.group(2)), - user, replacement.group(3)) - - output('Warning! Could not find out who did %s' % \ - repr(replacement.group(0)), False) - return - - def start(self): - while True: - self.read_replace_log() - # Replacer should not loop as often as delinker - time.sleep(self.config['timeout'] * 2) - -if __name__ == '__main__': - import sys, cgitb - try: - # FIXME: Add support for single-process replacer. - r = Replacer() - r.start() - except StandardError, e: - if type(e) not in (SystemExit, KeyboardInterrupt): - output('A critical error has occured! Aborting!') - print >>sys.stderr, cgitb.text(sys.exc_info()) - except: - pass - wikipedia.stopme() \ No newline at end of file
pywikipedia-l@lists.wikimedia.org