Revision: 4387
Author: btongminh
Date: 2007-09-29 14:49:01 +0000 (Sat, 29 Sep 2007)
Log Message:
-----------
Moving CommonsDelinker to its own directory (2)
Added Paths:
-----------
trunk/pywikipedia/commonsdelinker/
trunk/pywikipedia/commonsdelinker/delinker.py
Removed Paths:
-------------
trunk/pywikipedia/delinker/
Copied: trunk/pywikipedia/commonsdelinker (from rev 4386, trunk/pywikipedia/delinker)
Copied: trunk/pywikipedia/commonsdelinker/delinker.py (from rev 4385, trunk/pywikipedia/delinker.py)
===================================================================
--- trunk/pywikipedia/commonsdelinker/delinker.py (rev 0)
+++ trunk/pywikipedia/commonsdelinker/delinker.py 2007-09-29 14:49:01 UTC (rev 4387)
@@ -0,0 +1,810 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This script keeps track of image deletions and delinks removed files
+from (any) wiki. Usages on protected pages or on pages containing
+blacklisted external links cannot be processed.
+
+This script is run by [[commons:User:Siebrand]] on the toolserver. It should
+not be run by other users without prior contact.
+
+Although the classes are called CommonsDelinker and Delinker, it is in fact
+a general delinker/replacer, also suitable for local use.
+
+Please refer to delinker.txt for full documentation.
+"""
+#
+#
+# (C) Kyle/Orgullomoore, 2006-2007
+# (C) Siebrand Mazeland, 2006-2007
+# (C) Bryan Tong Minh, 2007
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+# This script requires MySQLdb and simplejson. Tested with:
+# * Python 2.4.4, MySQLdb 1.2.1_p, simplejson 1.3
+# * Python 2.5, MySQLdb 1.2.2, simplejson 1.5 (recommended)
+# TODO:
+# * Don't replace within <nowiki /> tags
+# * Make as many config settings as possible site dependent
+# BUGS:
+# * There is a problem with images in the es.wikisource project namespace.
+# The exact problem is described somewhere in Bryan's IRC logs, but
+# exactly where is unknown.
+
+import sys, os, threading, time
+import traceback
+import re, cgitb
+
+import threadpool
+import checkusage
+
+import wikipedia
+import config
+
+def wait_callback(object):
+ output(u'Connection has been lost in %s. Attempting reconnection.' % repr(object), False)
+ if hasattr(object, 'error'):
+ output(u'Error was %s: %s' % tuple(object.error))
+def universal_unicode(s):
+ if type(s) is str:
+ return s.decode('utf-8', 'ignore')
+ return unicode(s)
+def connect_database():
+ engine = config.CommonsDelinker['sql_engine']
+ kwargs = config.CommonsDelinker['sql_config'].copy()
+ if engine == 'mysql':
+ import mysql_autoconnection
+ # This setting is required for MySQL
+ kwargs['charset'] = 'utf8'
+ # Work around for a bug in MySQLdb 1.2.1_p. This version will
+ # set use_unicode for all connections to the value of the first
+ # initialized connection. This bug is not relevant for MySQLdb
+ # versions 1.2.2 and upwards. The connection character set must
+ # be set to utf8 however, to prevent INSERTs to be converted to
+ # the standard MySQL character set.
+ kwargs['use_unicode'] = False
+ kwargs['callback'] = wait_callback
+
+ return mysql_autoconnection.connect(**kwargs)
+ # TODO: Add support for sqlite3
+ raise RuntimeError('Unsupported database engine %s' % engine)
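+# A minimal sketch of the configuration connect_database() expects. The
+# keys inside sql_config are passed straight through to the driver; the
+# host/user/passwd/db values below are illustrative assumptions, not part
+# of this script:
+#
+#   config.CommonsDelinker = {
+#       'sql_engine': 'mysql',
+#       'sql_config': {'host': 'localhost', 'user': 'delinker',
+#                      'passwd': 'secret', 'db': 'commonsdelinker'},
+#   }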
+
+class Delinker(threadpool.Thread):
+ # TODO: Method names could use some clean up
+ def __init__(self, pool, CommonsDelinker):
+ threadpool.Thread.__init__(self, pool)
+ self.CommonsDelinker = CommonsDelinker
+ self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
+
+ def delink_image(self, image, usage, timestamp, admin, reason, replacement = None):
+ """ Performs the delink for image on usage. """
+ output(u'%s Usage of %s: %s' % (self, image, usage))
+
+ skipped_images = {}
+ for (lang, family), pages in usage.iteritems():
+ site = self.CommonsDelinker.get_site(lang, family)
+
+ try:
+ summary = self.get_summary(site, image, admin, reason, replacement)
+
+ for page_namespace, page_title, title in pages:
+ if (site.lang, site.family.name) == (self.CommonsDelinker.site.lang,
+ self.CommonsDelinker.site.family.name) and \
+ (page_namespace, page_title) == (6, image):
+ continue
+
+ if self.CommonsDelinker.set_edit(str(site), title):
+ # The page is currently being edited. Postpone.
+ if (lang, family) not in skipped_images:
+ skipped_images[(lang, family)] = []
+ skipped_images[(lang, family)].append(
+ (page_namespace, page_title, title))
+ else:
+ # Delink the image
+ output(u'%s Delinking %s from %s' % (self, image, site))
+
+ try:
+ result = self.replace_image(image, site, title, summary, replacement)
+ finally:
+ self.CommonsDelinker.unset_edit(str(site), title)
+
+ # Add to logging queue
+ if self.sql_layout == 'new':
+ self.CommonsDelinker.Loggers.append((timestamp, image,
+ site.lang, site.family.name, page_namespace, page_title,
+ result, replacement))
+ else:
+ self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
+ page_namespace, page_title, result, replacement))
+ finally:
+ self.CommonsDelinker.unlock_site(site)
+
+ if skipped_images:
+ time.sleep(self.CommonsDelinker.config['timeout'])
+ return self.delink_image(image, skipped_images, timestamp, admin, reason, replacement)
+ elif replacement:
+ # Let them know that we are done replacing.
+ self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
+
+ def replace_image(self, image, site, page_title, summary, replacement = None):
+ """ The actual replacement. Giving None as argument for replacement
+ will delink instead of replace."""
+
+ page = wikipedia.Page(site, page_title)
+
+ # TODO: Per site config.
+ if page.namespace() in self.CommonsDelinker.config['delink_namespaces']:
+ try:
+ text = page.get(nofollow_redirects = True)
+ except wikipedia.NoPage:
+ return 'failed'
+ new_text = text
+
+ def create_regex(s):
+ s = re.escape(s)
+ return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:])
+ def create_regex_i(s):
+ return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
+
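+ # For example (a sketch, not part of the original code):
+ # create_regex(u'Foo bar.jpg') yields ur'(?:[Ff]oo\ bar\.jpg)', matching
+ # the first letter case-insensitively, while create_regex_i(u'gallery')
+ # yields ur'(?:[Gg][Aa][Ll][Ll][Ee][Rr][Yy])'.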
+ namespaces = ('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True)
+ r_namespace = ur'\s*(?:%s)\s*\:\s*' % u'|'.join(map(create_regex_i, namespaces))
+ # Note that this regex creates a group!
+ r_image = u'(%s)' % create_regex(image).replace(r'\_', '[ _]')
+
+ def simple_replacer(match):
+ if replacement == None:
+ return u''
+ else:
+ groups = list(match.groups())
+ groups[1] = replacement
+ return u''.join(groups)
+
+ # Previously, links inside image descriptions caused unexpected
+ # behaviour: [[Image:image.jpg|thumb|[[link]] in description]]
+ # would truncate at the first occurrence of ]]. This cannot be
+ # fixed with a single regular expression.
+ # Instead, all ]] after the start of the image must be located;
+ # the first one without an associated [[ closes the image link.
+
+ r_simple_s = u'(\[\[%s)%s' % (r_namespace, r_image)
+ r_s = '\[\['
+ r_e = '\]\]'
+ # First determine where wikilinks start and end
+ image_starts = [match.start() for match in re.finditer(r_simple_s, text)]
+ link_starts = [match.start() for match in re.finditer(r_s, text)]
+ link_ends = [match.end() for match in re.finditer(r_e, text)]
+
+ r_simple = u'(\[\[%s)%s(.*)' % (r_namespace, r_image)
+ replacements = []
+ for image_start in image_starts:
+ current_link_starts = [link_start for link_start in link_starts
+ if link_start > image_start]
+ current_link_ends = [link_end for link_end in link_ends
+ if link_end > image_start]
+ end = image_start
+ if current_link_ends: end = current_link_ends[0]
+
+ while current_link_starts and current_link_ends:
+ start = current_link_starts.pop(0)
+ end = current_link_ends.pop(0)
+ if end <= start and end > image_start:
+ # Found the end of the image
+ break
+
+ # Add the replacement to the todo list. Doing the
+ # replacement right now would alter the indices.
+ replacements.append((new_text[image_start:end],
+ re.sub(r_simple, simple_replacer,
+ new_text[image_start:end])))
+
+ # Perform the replacements
+ for old, new in replacements:
+ if old: new_text = new_text.replace(old, new)
+
+ # Remove the image from galleries
+ r_galleries = ur'(?s)(\<%s\>)(.*?)(\<\/%s\>)' % (create_regex_i('gallery'),
+ create_regex_i('gallery'))
+ r_gallery = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*)$' % (r_namespace, r_image)
+ def gallery_replacer(match):
+ return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
+ simple_replacer, match.group(2)), match.group(3))
+ new_text = re.sub(r_galleries, gallery_replacer, new_text)
+
+ if text == new_text:
+ # None of the previous steps worked, so the image is
+ # probably embedded in a complicated template.
+ r_templates = ur'(?s)(\{\{.*?\}\})'
+ r_complicated = u'(?s)((?:%s)?)%s' % (r_namespace, r_image)
+
+ def template_replacer(match):
+ return re.sub(r_complicated, simple_replacer, match.group(1))
+ new_text = re.sub(r_templates, template_replacer, text)
+
+ if text != new_text:
+ # Save to the wiki
+ # Code for checking user page existence has been moved
+ # to the summary code, to avoid checking the user page
+ # for each removal.
+ try:
+ if config.CommonsDelinker.get('save_diff', False):
+ # Save a diff
+ import difflib
+ diff = difflib.context_diff(
+ text.encode('utf-8').splitlines(True),
+ new_text.encode('utf-8').splitlines(True))
+ f = open((u'diff/%s-%s-%s.txt' % (page_title.replace('/', '-'),
+ site.dbName(), page.editTime())).encode('utf-8', 'ignore'), 'w')
+ f.writelines(diff)
+ f.close()
+
+ if self.CommonsDelinker.config.get('edit', True) and not \
+ ((self.CommonsDelinker.site.lang == 'commons') ^ \
+ (config.usernames.get('commons', {}).get(
+ 'commons') == 'CommonsDelinker')):
+ page.put(new_text, summary)
+ return 'ok'
+ except wikipedia.EditConflict:
+ # Try again
+ output(u'Got EditConflict trying to remove %s from %s:%s.' % \
+ (image, site, page_title))
+ return self.replace_image(image, site, page_title, summary, replacement = replacement)
+ except (wikipedia.LockedPage, wikipedia.PageNotSaved):
+ return 'failed'
+ else:
+ return 'skipped'
+ return 'skipped'
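+ # replace_image() reports one of three results: 'ok' (the page was
+ # saved), 'failed' (no page, protected, or not saved) and 'skipped'
+ # (nothing changed or the namespace is not delinked); delink_image
+ # forwards these to the Loggers queue.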
+
+
+
+ def do(self, args):
+ try:
+ self.delink_image(*args)
+ except:
+ output(u'An exception occurred in %s' % self, False)
+ traceback.print_exc(file = sys.stderr)
+
+ def get_summary(self, site, image, admin, reason, replacement):
+ """ Get the summary template and substitute the
+ correct values."""
+ # FIXME: Don't insert commons: on local delink
+ # FIXME: Hardcode is EVIL
+ if replacement:
+ tlp = self.CommonsDelinker.SummaryCache.get(site, 'replace-I18n')
+ else:
+ tlp = self.CommonsDelinker.SummaryCache.get(site, 'summary-I18n')
+
+ tlp = tlp.replace('$1', image)
+ if replacement:
+ tlp = tlp.replace('$2', replacement)
+ tlp = tlp.replace('$3', unicode(admin))
+ tlp = tlp.replace('$4', unicode(reason).replace('[[', '[[w:commons:'))
+ else:
+ tlp = tlp.replace('$2', unicode(admin))
+ tlp = tlp.replace('$3', reason.replace('[[', '[[w:commons:'))
+
+ return tlp
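+ # For example (a sketch; the template text is an assumption): with a
+ # summary-I18n template of u'Removing $1, deleted by $2 because: $3',
+ # a plain delink of Foo.jpg by admin Example with reason 'copyvio'
+ # yields u'Removing Foo.jpg, deleted by Example because: copyvio'.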
+
+class SummaryCache(object):
+ """ Object to thread-safe cache summary templates. """
+ def __init__(self, CommonsDelinker):
+ self.summaries = {}
+ self.lock = threading.Lock()
+ self.CommonsDelinker = CommonsDelinker
+
+ def get(self, site, type, key = None):
+ # This can probably also provide something for
+ # localised settings, but then it first needs to
+ # check whether the page is sysop only.
+ if not key:
+ key = str(site)
+
+ self.lock.acquire()
+ try:
+ if type not in self.summaries:
+ self.summaries[type] = {}
+ if key in self.summaries[type]:
+ if (time.time() - self.summaries[type][key][1]) < \
+ self.CommonsDelinker.config['summary_cache']:
+ # Return cached result
+ return self.summaries[type][key][0]
+
+ output(u'%s Fetching new summary for %s' % (self, site))
+
+ # FIXME: evil
+ if self.CommonsDelinker.config['global']:
+ self.check_user_page(site)
+ page = wikipedia.Page(site, '%s%s' % \
+ (self.CommonsDelinker.config['local_settings'], type))
+ try:
+ # Fetch the summary template, follow redirects
+ i18n = page.get(get_redirect = True)
+ self.summaries[type][key] = (i18n, time.time())
+ return i18n
+ except wikipedia.NoPage:
+ pass
+ finally:
+ self.lock.release()
+
+ # No i18n available, but it may be available on the wikipedia
+ # of that language. Only do so for wiktionary, wikibooks,
+ # wikiquote, wikisource, wikinews and wikiversity.
+ # This allows the bot to function even on special wikis
+ # like mediawiki.org, meta and species.
+ output(u'%s Using default summary for %s' % (self, site))
+
+ if site.family.name != 'wikipedia' and self.CommonsDelinker.config['global']:
+ if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
+ 'wikisource', 'wikinews', 'wikiversity'):
+ newsite = self.CommonsDelinker.get_site(site.lang,
+ wikipedia.Family('wikipedia'))
+ return self.get(newsite, type, key = key)
+ return self.CommonsDelinker.config['default_settings'].get(type, '')
+
+ def check_user_page(self, site):
+ "Check whether a userpage exists. Only used for CommonsDelinker."
+ try:
+ # Make sure the userpage is not empty
+ # Note: if wikis delete the userpage, it's their own fault
+ filename = 'canedit.cdl'
+ try:
+ f = open(filename, 'r')
+ except IOError:
+ # Don't care
+ return
+ ftxt = f.read()
+ f.close()
+ if not '#' + str(site) in ftxt:
+ username = config.usernames[site.family.name][site.lang]
+
+ userpage = wikipedia.Page(site, 'User:' + username)
+ # Removed check for page existence. If it is not in our
+ # database we can safely assume that we have no user page
+ # there. In case there is, we will just overwrite it once.
+ # It causes no real problems, but it is one call to the
+ # servers less.
+ # TODO: Config setting?
+ userpage.put('#REDIRECT [[m:User:CommonsDelinker]]', '')
+
+ f = open(filename, 'a')
+ f.write('#' + str(site))
+ f.close()
+ except wikipedia.LockedPage:
+ # User page is protected, continue anyway
+ pass
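+ # canedit.cdl is a simple flat cache: one '#<site>' marker is appended
+ # per wiki that already has the redirecting user page, e.g.
+ # '#wikipedia:nl' (assuming str(site) renders as family:lang).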
+
+class CheckUsage(threadpool.Thread):
+ def __init__(self, pool, CommonsDelinker):
+ threadpool.Thread.__init__(self, pool)
+ self.CommonsDelinker = CommonsDelinker
+ # Not really thread safe, but we should only do read operations...
+ self.site = CommonsDelinker.site
+
+ def run(self):
+ self.connect()
+ threadpool.Thread.run(self)
+
+ def connect(self):
+ config = self.CommonsDelinker.config
+ if config['global']:
+ # Note: global use requires MySQL
+ self.CheckUsage = checkusage.CheckUsage(limit = sys.maxint,
+ mysql_kwargs = config['sql_config'],
+ use_autoconn = True,
+ http_callback = wait_callback,
+ mysql_callback = wait_callback)
+ else:
+ self.CheckUsage = checkusage.CheckUsage(sys.maxint,
+ http_callback = wait_callback, no_db = True)
+
+
+ def check_usage(self, image, timestamp, admin, reason, replacement):
+ """ Check whether this image needs to be delinked. """
+
+ # Check whether the image still is deleted on Commons.
+ # BUG: This also returns true for images that have a page, but
+ # no image itself. Can be fixed by querying query.php
+ # instead of api.php. Also, should this be made an exists()
+ # method of checkusage.CheckUsage?
+ if self.site.shared_image_repository() != (None, None):
+ shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
+ try:
+ if self.CheckUsage.exists(shared_image_repository, image) \
+ and not bool(replacement):
+ output(u'%s %s exists on the shared image repository!' % (self, image))
+ return
+ finally:
+ self.CommonsDelinker.unlock_site(shared_image_repository)
+ if self.CheckUsage.exists(self.site, image) and \
+ not bool(replacement):
+ output(u'%s %s exists again!' % (self, image))
+ return
+
+
+ if self.CommonsDelinker.config['global']:
+ usage = self.CheckUsage.get_usage(image)
+ usage_domains = {}
+
+ count = 0
+ # Sort usage per domain
+ for (lang, family), (page_namespace, page_title, title) in usage:
+ if (lang, family) not in usage_domains:
+ usage_domains[(lang, family)] = []
+ usage_domains[(lang, family)].append((page_namespace, page_title, title))
+ count += 1
+ else:
+ #FIX!
+ usage_domains = {(self.site.lang, self.site.family.name):
+ list(self.CheckUsage.get_usage_live(self.site,
+ image))}
+ count = len(usage_domains[(self.site.lang, self.site.family.name)])
+
+ output(u'%s %s used on %s pages' % (self, image, count))
+
+ if count:
+ # Pass the usage to the Delinker pool along with other arguments
+ self.CommonsDelinker.Delinkers.append((image, usage_domains,
+ timestamp, admin, reason, replacement))
+ elif replacement:
+ # Record replacement done
+ self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
+
+ def do(self, args):
+ try:
+ self.check_usage(*args)
+ except:
+ # Something unexpected happened. Report and die.
+ output('An exception occurred in %s' % self, False)
+ traceback.print_exc(file = sys.stderr)
+ self.exit()
+ self.CommonsDelinker.thread_died()
+
+class Logger(threadpool.Thread):
+ def __init__(self, pool, CommonsDelinker):
+ threadpool.Thread.__init__(self, pool)
+ self.CommonsDelinker = CommonsDelinker
+ self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
+ self.enabled = self.CommonsDelinker.config.get('enable_logging', True)
+
+ def run(self):
+ self.connect()
+ threadpool.Thread.run(self)
+
+ def connect(self):
+ self.database = connect_database()
+ self.cursor = self.database.cursor()
+
+
+ def log_result_legacy(self, timestamp, image, domain, namespace, page, status = "ok", newimage = None):
+ # TODO: Make sqlite3 ready
+
+ # The original delinker code cached log results,
+ # in order to limit the number of connections.
+ # However, since we are now using persistent
+ # connections, we can safely insert the result
+ # on the fly.
+ output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page))
+
+ # There is no need to escape each parameter if
+ # a parametrized call is made.
+ self.cursor.execute("""INSERT INTO %s (timestamp, img, wiki,
page_title,
+ namespace, status, newimg) VALUES
+ (%%s, %%s, %%s, %%s, %%s, %%s, %%s)""" %
self.CommonsDelinker.config['log_table'],
+ (timestamp, image, domain, page, namespace, status, newimage))
+ self.database.commit()
+
+ def log_result_new(self, timestamp, image, site_lang, site_family,
+ page_namespace, page_title, status = 'ok', new_image = None):
+
+ output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page_title))
+
+ self.cursor.execute("""INSERT INTO %s (timestamp, image, site_lang,
site_family,
+ page_namespace, page_title, status, new_image) VALUES
+ (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" %
self.CommonsDelinker.config['log_table'],
+ (timestamp, image, site_lang, site_family, page_namespace, page_title,
+ status, new_image))
+ self.database.commit()
+
+ def log_replacement(self, timestamp, old_image, new_image):
+ # TODO: Same as above
+
+ output(u'Replacing %s by %s done' % (old_image, new_image))
+ self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
+ timestamp = %%s AND old_image = %%s AND
+ new_image = %%s""" %
self.CommonsDelinker.config['replacer_table'],
+ (timestamp, old_image, new_image))
+ self.database.commit()
+
+ def do(self, args):
+ if not self.enabled: return
+ try:
+ if len(args) == 3:
+ self.log_replacement(*args)
+ else:
+ if self.sql_layout == 'new':
+ self.log_result_new(*args)
+ else:
+ self.log_result_legacy(*args)
+ except:
+ # Something unexpected happened. Report and die.
+ output('An exception occurred in %s' % self, False)
+ traceback.print_exc(file = sys.stderr)
+ self.exit()
+ self.CommonsDelinker.thread_died()
+
+class CommonsDelinker(object):
+ def __init__(self):
+ self.config = config.CommonsDelinker
+ self.site = wikipedia.getSite()
+ self.site.forceLogin()
+
+ # Initialize workers
+ self.CheckUsages = threadpool.ThreadPool(CheckUsage)
+ [self.CheckUsages.add_thread(self) for i in xrange(self.config['checkusage_instances'])]
+
+ self.Delinkers = threadpool.ThreadPool(Delinker)
+ [self.Delinkers.add_thread(self) for i in xrange(self.config['delinker_instances'])]
+
+ self.Loggers = threadpool.ThreadPool(Logger)
+ if self.config.get('enable_logging', True):
+ [self.Loggers.add_thread(self) for i in xrange(self.config['logger_instances'])]
+ else:
+ self.Loggers.add_thread(self)
+
+ self.http = checkusage.HTTP(self.site.hostname())
+
+ self.edit_list = []
+ self.editLock = threading.Lock()
+
+ self.sites = {}
+ self.siteLock = threading.Lock()
+
+ self.SummaryCache = SummaryCache(self)
+
+ if self.config.get('enable_replacer', False):
+ self.connect_mysql()
+
+ if self.config.get('no_sysop', False):
+ # Don't edit as sysop
+ if hasattr(config, 'sysopnames'):
+ config.sysopnames = dict([(fam, {}) for fam in config.sysopnames.keys()])
+
+ self.last_check = time.time()
+
+ #if 'bot' in self.site.userGroups:
+ # self.log_limit = '5000'
+ #else:
+ # self.log_limit = '500'
+ self.log_limit = '500'
+
+ def connect_mysql(self):
+ self.database = connect_database()
+ self.cursor = self.database.cursor()
+
+ def set_edit(self, domain, page):
+ """ Make sure the bot does not create edit
+ conflicts with itself."""
+ self.editLock.acquire()
+ being_edited = (domain, page) in self.edit_list
+ if not being_edited:
+ self.edit_list.append((domain, page))
+ self.editLock.release()
+ return being_edited
+ def unset_edit(self, domain, page):
+ """ Done editting. """
+ self.editLock.acquire()
+ self.edit_list.remove((domain, page))
+ self.editLock.release()
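+ # Typical usage (a sketch of the protocol used by Delinker.delink_image):
+ #
+ #   if not CD.set_edit(str(site), title):
+ #       try:
+ #           ... edit the page ...
+ #       finally:
+ #           CD.unset_edit(str(site), title)
+ #
+ # set_edit() returns True if the page is already being edited, in which
+ # case the caller must postpone and must not call unset_edit().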
+
+ def get_site(self, code, fam):
+ # Threadsafe replacement of wikipedia.getSite
+ key = '%s:%s' % (code, fam)
+ self.siteLock.acquire()
+ try:
+ if key not in self.sites:
+ self.sites[key] = []
+ for site, used in self.sites[key]:
+ if not used:
+ self.sites[key][self.sites[key].index((site, False))] = (site, True)
+ return site
+ site = wikipedia.Site(code, fam)
+ self.sites[key].append((site, True))
+ return site
+ finally:
+ self.siteLock.release()
+ def unlock_site(self, site):
+ key = '%s:%s' % (site.lang, site.family.name)
+ self.siteLock.acquire()
+ try:
+ self.sites[key][self.sites[key].index((site, True))] = (site, False)
+ finally:
+ self.siteLock.release()
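+ # get_site() and unlock_site() form a small checkout pool: each
+ # 'code:family' key maps to a list of (site, in_use) pairs so that no
+ # two threads share a Site object. A sketch of the intended pattern:
+ #
+ #   site = CD.get_site('nl', wikipedia.Family('wikipedia'))
+ #   try:
+ #       ... use site ...
+ #   finally:
+ #       CD.unlock_site(site)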
+
+
+ def read_deletion_log(self):
+ ts_format = '%Y-%m-%dT%H:%M:%SZ'
+ wait = self.config['delink_wait']
+ exclusion = self.config['exclude_string']
+
+ ts_from = self.last_check
+ # Truncate -> int()
+ ts_end = int(time.time())
+ self.last_check = ts_end
+
+ # Format as a MediaWiki timestamp and subtract a
+ # certain wait period.
+ ts_from_s = time.strftime(ts_format, time.gmtime(ts_from - wait + 1))
+ ts_end_s = time.strftime(ts_format, time.gmtime(ts_end - wait))
+
+ try:
+ # Assume fewer than 500 deletions have been made between
+ # this and the previous check of the log. If that is not
+ # the case, the timeout should be set lower.
+ result = self.http.query_api(self.site.hostname(), self.site.apipath(),
+ action = 'query', list = 'logevents', letype = 'delete',
+ lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
+ ledir = 'newer')
+ logevents = result['query']['logevents']
+ except Exception, e:
+ if type(e) in (SystemError, KeyboardInterrupt): raise
+ # Something happened, but since it is a network error,
+ # it will not be critical. In order to prevent data loss
+ # the last_check timestamp has to be set correctly.
+ self.last_check = ts_from
+ output('Warning! Unable to read deletion logs', False)
+ output('%s: %s' % (e.__class__.__name__, str(e)), False)
+ return time.sleep(self.config['timeout'])
+
+ for logevent in logevents:
+ if logevent['ns'] == 6 and logevent['action'] == 'delete':
+ if exclusion not in logevent.get('comment', ''):
+ timestamp = logevent['timestamp']
+ timestamp = timestamp.replace('-', '')
+ timestamp = timestamp.replace(':', '')
+ timestamp = timestamp.replace('T', '')
+ timestamp = timestamp.replace('Z', '')
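+ # e.g. u'2007-09-29T14:49:01Z' becomes u'20070929144901'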
+
+ output(u'Deleted image: %s' % logevent['title'])
+ self.CheckUsages.append((checkusage.strip_ns(logevent['title']),
+ timestamp, logevent['user'], logevent.get('comment', ''),
+ None))
+ else:
+ output(u'Skipping deleted image: %s' % logevent['title'])
+
+ def read_replacement_log(self):
+ # TODO: Make sqlite3 ready
+ # TODO: Single process replacer
+ update = """UPDATE %s SET status = %%s WHERE id = %%s"""
% \
+ self.config['replacer_table']
+ self.cursor.execute("""SELECT id, timestamp, old_image, new_image, user,
comment
+ FROM %s WHERE status = 'pending'""" %
self.config['replacer_table'])
+ result = ([universal_unicode(s) for s in i] for i in self.cursor.fetchall())
+
+
+ for id, timestamp, old_image, new_image, user, comment in result:
+ # TODO: remove code; should now be part of the replacer
+ if (not old_image.lower().endswith('.svg')) and \
+ new_image.lower().endswith('.svg'):
+ output(u'Refused to replace %s by %s' % (old_image, new_image))
+ self.cursor.execute(update, ('refused', id))
+ else:
+ self.CheckUsages.append((old_image, timestamp, user, comment, new_image))
+ output(u'Replacing %s by %s' % (old_image, new_image))
+ self.cursor.execute(update, ('ok', id))
+
+ self.database.commit()
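+ # The replacer_table rows consumed here are assumed to hold (id,
+ # timestamp, old_image, new_image, user, comment, status): status moves
+ # from 'pending' to 'ok' or 'refused' above, and to 'done' in
+ # Logger.log_replacement once all usage has been replaced.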
+
+ def start(self):
+ # Gracefully exit all threads on SIG_INT or SIG_TERM
+ threadpool.catch_signals()
+
+ # Start threads
+ self.Loggers.start()
+ self.Delinkers.start()
+ self.CheckUsages.start()
+
+ # Give threads some time to initialize
+ time.sleep(self.config['timeout'])
+ output(u'All workers started')
+
+ if self.config.get('monitor'):
+ # For debugging a special monitor may be used.
+ # This monitor is optional, and will be used if
+ # a configuration variable monitor is set and True.
+ import monitor
+ monitor.Monitor(self).start()
+
+ # Main loop
+ while True:
+ if self.config.get('enable_delinker', True):
+ if 'deletion_log_table' in self.config:
+ if not self.read_deletion_log_db():
+ self.read_deletion_log()
+ else:
+ self.read_deletion_log()
+ if self.config.get('enable_replacer', False):
+ self.read_replacement_log()
+
+ time.sleep(self.config['timeout'])
+
+ def thread_died(self):
+ # A thread died; we may no longer be able to function.
+ # Currently only CheckUsages and Loggers are tracked;
+ # Delinkers should not be able to die.
+ cu = 0
+ self.CheckUsages.jobLock.acquire()
+ for thread in self.CheckUsages.threads:
+ if thread.isAlive() and not thread.quit:
+ cu += 1
+ self.CheckUsages.jobLock.release()
+ lg = 0
+ self.Loggers.jobLock.acquire()
+ for thread in self.Loggers.threads:
+ if thread.isAlive() and not thread.quit:
+ lg += 1
+ unlogged = self.Loggers.jobQueue[:]
+ self.Loggers.jobLock.release()
+
+ # We can no longer function if we have only one
+ # CheckUsage or zero Loggers available.
+ # TODO: config settings?
+ if cu <= 1:
+ output(u'ERROR!!! Too few CheckUsages left to function', False)
+ threadpool.terminate()
+ if lg <= 0:
+ output(u'ERROR!!! Too few Loggers left to function', False)
+ print >>sys.stderr, 'Currently unlogged:', unlogged
+ threadpool.terminate()
+
+def output(message, toStdout = True):
+ message = time.strftime('[%Y-%m-%d %H:%M:%S] ') + message
+ wikipedia.output(message, toStdout = toStdout)
+ if toStdout:
+ sys.stdout.flush()
+ else:
+ sys.stderr.flush()
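+# output() merely prefixes a timestamp before handing the message to
+# wikipedia.output, e.g.:
+#
+#   output(u'All workers started')
+#   # -> [2007-09-29 14:49:01] All workers started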
+
+if __name__ == '__main__':
+ output(u'Running ' + __version__)
+ CD = CommonsDelinker()
+ output(u'This bot runs from: ' + str(CD.site))
+
+ re._MAXCACHE = 4
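+ # Keep re's compiled-pattern cache tiny; the bot builds a fresh pattern
+ # for every image, which would otherwise bloat the cache (assumed
+ # rationale).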
+
+ args = wikipedia.handleArgs()
+ if '-since' in args:
+ # NOTE: Untested
+ ts_format = '%Y-%m-%d %H:%M:%S'
+ try:
+ since = time.strptime(
+ args[args.index('-since') + 1],
+ ts_format)
+ except ValueError:
+ if args[args.index('-since') + 1][0] == '[' and \
+ len(args) != args.index('-since') + 2:
+ since = time.strptime('%s %s' % \
+ (args[args.index('-since') + 1],
+ args[args.index('-since') + 2]),
+ '[%s]' % ts_format)
+ else:
+ raise ValueError('Incorrect time format!')
+ output(u'Reading deletion log since [%s]' %\
+ time.strftime(ts_format, since))
+ CD.last_check = time.mktime(since)
+
+ try:
+ try:
+ CD.start()
+ except Exception, e:
+ if type(e) not in (SystemExit, KeyboardInterrupt):
+ output('An exception occurred in the main thread!', False)
+ traceback.print_exc(file = sys.stderr)
+ threadpool.terminate()
+ finally:
+ output(u'Stopping CommonsDelinker')
+ wikipedia.stopme()
+ # Flush the standard streams
+ sys.stdout.flush()
+ sys.stderr.flush()