Revision: 4387
Author: btongminh
Date: 2007-09-29 14:49:01 +0000 (Sat, 29 Sep 2007)
Log Message:
-----------
Moving CommonsDelinker to its own directory (2)
Added Paths:
-----------
trunk/pywikipedia/commonsdelinker/
trunk/pywikipedia/commonsdelinker/delinker.py
Removed Paths:
-------------
trunk/pywikipedia/delinker/
Copied: trunk/pywikipedia/commonsdelinker (from rev 4386, trunk/pywikipedia/delinker)
Copied: trunk/pywikipedia/commonsdelinker/delinker.py (from rev 4385, trunk/pywikipedia/delinker.py)
===================================================================
--- trunk/pywikipedia/commonsdelinker/delinker.py (rev 0)
+++ trunk/pywikipedia/commonsdelinker/delinker.py 2007-09-29 14:49:01 UTC (rev 4387)
@@ -0,0 +1,810 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+This script keeps track of image deletions and delinks removed files
+from (any) wiki. Usages on protected pages or on pages containing
+blacklisted external links cannot be processed.
+
+This script is run by [[commons:User:Siebrand]] on the toolserver. It should
+not be run by other users without prior contact.
+
+Although the classes are called CommonsDelinker and Delinker, it is in fact
+a general delinker/replacer, also suitable for local use.
+
+Please refer to delinker.txt for full documentation.
+"""
+#
+#
+# (C) Kyle/Orgullomoore, 2006-2007
+# (C) Siebrand Mazeland, 2006-2007
+# (C) Bryan Tong Minh, 2007
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+# This script requires MySQLdb and simplejson. Tested with:
+# * Python 2.4.4, MySQLdb 1.2.1_p, simplejson 1.3
+# * Python 2.5, MySQLdb 1.2.2, simplejson 1.5 (recommended)
+# TODO:
+# * Don't replace within <nowiki /> tags
+# * Make as many config settings as possible site dependent
+# BUGS:
+# * There is a problem with images in the es.wikisource project namespace.
+# The exact problem is described somewhere in Bryan's IRC logs, but
+# exactly where is unknown.
+
+import sys, os, threading, time
+import traceback
+import re, cgitb
+
+import threadpool
+import checkusage
+
+import wikipedia
+import config
+
+def wait_callback(object):
+ output(u'Connection has been lost in %s. Attempting reconnection.' % repr(object), False)
+ if hasattr(object, 'error'):
+ output(u'Error was %s: %s' % tuple(object.error))
+def universal_unicode(s):
+ if type(s) is str:
+ return s.decode('utf-8', 'ignore')
+ return unicode(s)
+def connect_database():
+ engine = config.CommonsDelinker['sql_engine']
+ kwargs = config.CommonsDelinker['sql_config'].copy()
+ if engine == 'mysql':
+ import mysql_autoconnection
+ # This setting is required for MySQL
+ kwargs['charset'] = 'utf8'
+ # Work around for a bug in MySQLdb 1.2.1_p. This version will
+ # set use_unicode for all connections to the value of the first
+ # initialized connection. This bug is not relevant for MySQLdb
+ # versions 1.2.2 and upwards. The connection character set must
+ # be set to utf8 however, to prevent INSERTs to be converted to
+ # the standard MySQL character set.
+ kwargs['use_unicode'] = False
+ kwargs['callback'] = wait_callback
+
+ return mysql_autoconnection.connect(**kwargs)
+ # TODO: Add support for sqlite3
+ raise RuntimeError('Unsupported database engine %s' % engine)
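+# A minimal sketch of the configuration connect_database() expects. The
+# keys inside sql_config are passed straight through to the driver; the
+# host/user/passwd/db values below are illustrative assumptions, not part
+# of this script:
+#
+#   config.CommonsDelinker = {
+#       'sql_engine': 'mysql',
+#       'sql_config': {'host': 'localhost', 'user': 'delinker',
+#                      'passwd': 'secret', 'db': 'commonsdelinker'},
+#   }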
+
+class Delinker(threadpool.Thread):
+ # TODO: Method names could use some clean up
+ def __init__(self, pool, CommonsDelinker):
+ threadpool.Thread.__init__(self, pool)
+ self.CommonsDelinker = CommonsDelinker
+ self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
+
+ def delink_image(self, image, usage, timestamp, admin, reason, replacement = None):
+ """ Performs the delink for image on usage. """
+ output(u'%s Usage of %s: %s' % (self, image, usage))
+
+ skipped_images = {}
+ for (lang, family), pages in usage.iteritems():
+ site = self.CommonsDelinker.get_site(lang, family)
+
+ try:
+ summary = self.get_summary(site, image, admin, reason, replacement)
+
+ for page_namespace, page_title, title in pages:
+ if (site.lang, site.family.name) == (self.CommonsDelinker.site.lang,
+ self.CommonsDelinker.site.family.name) and \
+ (page_namespace, page_title) == (6, image):
+ continue
+
+ if self.CommonsDelinker.set_edit(str(site), title):
+ # The page is currently being edited. Postpone.
+ if (lang, family) not in skipped_images:
+ skipped_images[(lang, family)] = []
+ skipped_images[(lang, family)].append(
+ (page_namespace, page_title, title))
+ else:
+ # Delink the image
+ output(u'%s Delinking %s from %s' % (self, image, site))
+
+ try:
+ result = self.replace_image(image, site, title, summary, replacement)
+ finally:
+ self.CommonsDelinker.unset_edit(str(site), title)
+
+ # Add to logging queue
+ if self.sql_layout == 'new':
+ self.CommonsDelinker.Loggers.append((timestamp, image,
+ site.lang, site.family.name, page_namespace, page_title,
+ result, replacement))
+ else:
+ self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
+ page_namespace, page_title, result, replacement))
+ finally:
+ self.CommonsDelinker.unlock_site(site)
+
+ if skipped_images:
+ time.sleep(self.CommonsDelinker.config['timeout'])
+ return self.delink_image(image, skipped_images, timestamp, admin, reason, replacement)
+ elif replacement:
+ # Let them know that we are done replacing.
+ self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
+
+ def replace_image(self, image, site, page_title, summary, replacement = None):
+ """ The actual replacement. Giving None as argument for replacement
+ will delink instead of replace."""
+
+ page = wikipedia.Page(site, page_title)
+
+ # TODO: Per site config.
+ if page.namespace() in self.CommonsDelinker.config['delink_namespaces']:
+ try:
+ text = page.get(nofollow_redirects = True)
+ except wikipedia.NoPage:
+ return 'failed'
+ new_text = text
+
+ def create_regex(s):
+ s = re.escape(s)
+ return ur'(?:[%s%s]%s)' % (s[0].upper(), s[0].lower(), s[1:])
+ def create_regex_i(s):
+ return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
+
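+ # For example (a sketch, not part of the original code):
+ # create_regex(u'Foo bar.jpg') yields ur'(?:[Ff]oo\ bar\.jpg)', matching
+ # the first letter case-insensitively, while create_regex_i(u'gallery')
+ # yields ur'(?:[Gg][Aa][Ll][Ll][Ee][Rr][Yy])'.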
+ namespaces = ('Image', 'Media') + site.namespace(6, all = True) + site.namespace(-2, all = True)
+ r_namespace = ur'\s*(?:%s)\s*\:\s*' % u'|'.join(map(create_regex_i, namespaces))
+ # Note that this regex creates a group!
+ r_image = u'(%s)' % create_regex(image).replace(r'\_', '[ _]')
+
+ def simple_replacer(match):
+ if replacement == None:
+ return u''
+ else:
+ groups = list(match.groups())
+ groups[1] = replacement
+ return u''.join(groups)
+
+ # Previously, links inside image descriptions caused unexpected
+ # behaviour: [[Image:image.jpg|thumb|[[link]] in description]]
+ # would truncate at the first occurrence of ]]. This cannot be
+ # fixed with a single regular expression.
+ # Instead, all ]] after the start of the image must be located;
+ # the first one without an associated [[ closes the image link.
+
+ r_simple_s = u'(\[\[%s)%s' % (r_namespace, r_image)
+ r_s = '\[\['
+ r_e = '\]\]'
+ # First determine where wikilinks start and end
+ image_starts = [match.start() for match in re.finditer(r_simple_s, text)]
+ link_starts = [match.start() for match in re.finditer(r_s, text)]
+ link_ends = [match.end() for match in re.finditer(r_e, text)]
+
+ r_simple = u'(\[\[%s)%s(.*)' % (r_namespace, r_image)
+ replacements = []
+ for image_start in image_starts:
+ current_link_starts = [link_start for link_start in link_starts
+ if link_start > image_start]
+ current_link_ends = [link_end for link_end in link_ends
+ if link_end > image_start]
+ end = image_start
+ if current_link_ends: end = current_link_ends[0]
+
+ while current_link_starts and current_link_ends:
+ start = current_link_starts.pop(0)
+ end = current_link_ends.pop(0)
+ if end <= start and end > image_start:
+ # Found the end of the image
+ break
+
+ # Add the replacement to the todo list. Doing the
+ # replacement right now would alter the indices.
+ replacements.append((new_text[image_start:end],
+ re.sub(r_simple, simple_replacer,
+ new_text[image_start:end])))
+
+ # Perform the replacements
+ for old, new in replacements:
+ if old: new_text = new_text.replace(old, new)
+
+ # Remove the image from galleries
+ r_galleries = ur'(?s)(\<%s\>)(.*?)(\<\/%s\>)' % (create_regex_i('gallery'),
+ create_regex_i('gallery'))
+ r_gallery = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*)$' % (r_namespace, r_image)
+ def gallery_replacer(match):
+ return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
+ simple_replacer, match.group(2)), match.group(3))
+ new_text = re.sub(r_galleries, gallery_replacer, new_text)
+
+ if text == new_text:
+ # None of the previous steps worked, so the image is
+ # probably embedded in a complicated template.
+ r_templates = ur'(?s)(\{\{.*?\}\})'
+ r_complicated = u'(?s)((?:%s)?)%s' % (r_namespace, r_image)
+
+ def template_replacer(match):
+ return re.sub(r_complicated, simple_replacer, match.group(1))
+ new_text = re.sub(r_templates, template_replacer, text)
+
+ if text != new_text:
+ # Save to the wiki
+ # Code for checking user page existence has been moved
+ # to the summary code, to avoid checking the user page
+ # for each removal.
+ try:
+ if config.CommonsDelinker.get('save_diff', False):
+ # Save a diff
+ import difflib
+ diff = difflib.context_diff(
+ text.encode('utf-8').splitlines(True),
+ new_text.encode('utf-8').splitlines(True))
+ f = open((u'diff/%s-%s-%s.txt' % (page_title.replace('/', '-'),
+ site.dbName(), page.editTime())).encode('utf-8', 'ignore'), 'w')
+ f.writelines(diff)
+ f.close()
+
+ if self.CommonsDelinker.config.get('edit', True) and not \
+ ((self.CommonsDelinker.site.lang == 'commons') ^ \
+ (config.usernames.get('commons', {}).get(
+ 'commons') == 'CommonsDelinker')):
+ page.put(new_text, summary)
+ return 'ok'
+ except wikipedia.EditConflict:
+ # Try again
+ output(u'Got EditConflict trying to remove %s from %s:%s.' % \
+ (image, site, page_title))
+ return self.replace_image(image, site, page_title, summary, replacement = replacement)
+ except (wikipedia.LockedPage, wikipedia.PageNotSaved):
+ return 'failed'
+ else:
+ return 'skipped'
+ return 'skipped'
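+ # replace_image() reports one of three results: 'ok' (the page was
+ # saved), 'failed' (no page, protected, or not saved) and 'skipped'
+ # (nothing changed or the namespace is not delinked); delink_image
+ # forwards these to the Loggers queue.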
+
+
+
+ def do(self, args):
+ try:
+ self.delink_image(*args)
+ except:
+ output(u'An exception occurred in %s' % self, False)
+ traceback.print_exc(file = sys.stderr)
+
+ def get_summary(self, site, image, admin, reason, replacement):
+ """ Get the summary template and substitute the
+ correct values."""
+ # FIXME: Don't insert commons: on local delink
+ # FIXME: Hardcode is EVIL
+ if replacement:
+ tlp = self.CommonsDelinker.SummaryCache.get(site, 'replace-I18n')
+ else:
+ tlp = self.CommonsDelinker.SummaryCache.get(site, 'summary-I18n')
+
+ tlp = tlp.replace('$1', image)
+ if replacement:
+ tlp = tlp.replace('$2', replacement)
+ tlp = tlp.replace('$3', unicode(admin))
+ tlp = tlp.replace('$4', unicode(reason).replace('[[', '[[w:commons:'))
+ else:
+ tlp = tlp.replace('$2', unicode(admin))
+ tlp = tlp.replace('$3', reason.replace('[[', '[[w:commons:'))
+
+ return tlp
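+ # For example (a sketch; the template text is an assumption): with a
+ # summary-I18n template of u'Removing $1, deleted by $2 because: $3',
+ # a plain delink of Foo.jpg by admin Example with reason 'copyvio'
+ # yields u'Removing Foo.jpg, deleted by Example because: copyvio'.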
+
+class SummaryCache(object):
+ """ Object to thread-safe cache summary templates. """
+ def __init__(self, CommonsDelinker):
+ self.summaries = {}
+ self.lock = threading.Lock()
+ self.CommonsDelinker = CommonsDelinker
+
+ def get(self, site, type, key = None):
+ # This can probably also provide something for
+ # localised settings, but then it first needs to
+ # check whether the page is sysop only.
+ if not key:
+ key = str(site)
+
+ self.lock.acquire()
+ try:
+ if type not in self.summaries:
+ self.summaries[type] = {}
+ if key in self.summaries[type]:
+ if (time.time() - self.summaries[type][key][1]) < \
+ self.CommonsDelinker.config['summary_cache']:
+ # Return cached result
+ return self.summaries[type][key][0]
+
+ output(u'%s Fetching new summary for %s' % (self, site))
+
+ # FIXME: evil
+ if self.CommonsDelinker.config['global']:
+ self.check_user_page(site)
+ page = wikipedia.Page(site, '%s%s' % \
+ (self.CommonsDelinker.config['local_settings'], type))
+ try:
+ # Fetch the summary template, follow redirects
+ i18n = page.get(get_redirect = True)
+ self.summaries[type][key] = (i18n, time.time())
+ return i18n
+ except wikipedia.NoPage:
+ pass
+ finally:
+ self.lock.release()
+
+ # No i18n available, but it may be available on the wikipedia
+ # of that language. Only do so for wiktionary, wikibooks,
+ # wikiquote, wikisource, wikinews and wikiversity.
+ # This allows the bot to function even on special wikis
+ # like mediawiki.org, meta and species.
+ output(u'%s Using default summary for %s' % (self, site))
+
+ if site.family.name != 'wikipedia' and self.CommonsDelinker.config['global']:
+ if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
+ 'wikisource', 'wikinews', 'wikiversity'):
+ newsite = self.CommonsDelinker.get_site(site.lang,
+ wikipedia.Family('wikipedia'))
+ return self.get(newsite, type, key = key)
+ return self.CommonsDelinker.config['default_settings'].get(type, '')
+
+ def check_user_page(self, site):
+ "Check whether a userpage exists. Only used for CommonsDelinker."
+ try:
+ # Make sure the userpage is not empty
+ # Note: if wikis delete the userpage, it's their own fault
+ filename = 'canedit.cdl'
+ try:
+ f = open(filename, 'r')
+ except IOError:
+ # Don't care
+ return
+ ftxt = f.read()
+ f.close()
+ if not '#' + str(site) in ftxt:
+ username = config.usernames[site.family.name][site.lang]
+
+ userpage = wikipedia.Page(site, 'User:' + username)
+ # Removed check for page existence. If it is not in our
+ # database we can safely assume that we have no user page
+ # there. In case there is, we will just overwrite it once.
+ # It causes no real problems, but it is one call to the
+ # servers less.
+ # TODO: Config setting?
+ userpage.put('#REDIRECT [[m:User:CommonsDelinker]]', '')
+
+ f = open(filename, 'a')
+ f.write('#' + str(site))
+ f.close()
+ except wikipedia.LockedPage:
+ # User page is protected, continue anyway
+ pass
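+ # canedit.cdl is a simple flat cache: one '#<site>' marker is appended
+ # per wiki that already has the redirecting user page, e.g.
+ # '#wikipedia:nl' (assuming str(site) renders as family:lang).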
+
+class CheckUsage(threadpool.Thread):
+ def __init__(self, pool, CommonsDelinker):
+ threadpool.Thread.__init__(self, pool)
+ self.CommonsDelinker = CommonsDelinker
+ # Not really thread safe, but we should only do read operations...
+ self.site = CommonsDelinker.site
+
+ def run(self):
+ self.connect()
+ threadpool.Thread.run(self)
+
+ def connect(self):
+ config = self.CommonsDelinker.config
+ if config['global']:
+ # Note: global use requires MySQL
+ self.CheckUsage = checkusage.CheckUsage(limit = sys.maxint,
+ mysql_kwargs = config['sql_config'],
+ use_autoconn = True,
+ http_callback = wait_callback,
+ mysql_callback = wait_callback)
+ else:
+ self.CheckUsage = checkusage.CheckUsage(sys.maxint,
+ http_callback = wait_callback, no_db = True)
+
+
+ def check_usage(self, image, timestamp, admin, reason, replacement):
+ """ Check whether this image needs to be delinked. """
+
+ # Check whether the image still is deleted on Commons.
+ # BUG: This also returns true for images that have a page, but
+ # no image itself. Can be fixed by querying query.php
+ # instead of api.php. Also, should this be made an exists()
+ # method of checkusage.CheckUsage?
+ if self.site.shared_image_repository() != (None, None):
+ shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
+ try:
+ if self.CheckUsage.exists(shared_image_repository, image) \
+ and not bool(replacement):
+ output(u'%s %s exists on the shared image repository!' % (self, image))
+ return
+ finally:
+ self.CommonsDelinker.unlock_site(shared_image_repository)
+ if self.CheckUsage.exists(self.site, image) and \
+ not bool(replacement):
+ output(u'%s %s exists again!' % (self, image))
+ return
+
+
+ if self.CommonsDelinker.config['global']:
+ usage = self.CheckUsage.get_usage(image)
+ usage_domains = {}
+
+ count = 0
+ # Sort usage per domain
+ for (lang, family), (page_namespace, page_title, title) in usage:
+ if (lang, family) not in usage_domains:
+ usage_domains[(lang, family)] = []
+ usage_domains[(lang, family)].append((page_namespace, page_title, title))
+ count += 1
+ else:
+ #FIX!
+ usage_domains = {(self.site.lang, self.site.family.name):
+ list(self.CheckUsage.get_usage_live(self.site,
+ image))}
+ count = len(usage_domains[(self.site.lang, self.site.family.name)])
+
+ output(u'%s %s used on %s pages' % (self, image, count))
+
+ if count:
+ # Pass the usage to the Delinker pool along with other arguments
+ self.CommonsDelinker.Delinkers.append((image, usage_domains,
+ timestamp, admin, reason, replacement))
+ elif replacement:
+ # Record replacement done
+ self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
+
+ def do(self, args):
+ try:
+ self.check_usage(*args)
+ except:
+ # Something unexpected happened. Report and die.
+ output('An exception occurred in %s' % self, False)
+ traceback.print_exc(file = sys.stderr)
+ self.exit()
+ self.CommonsDelinker.thread_died()
+
+class Logger(threadpool.Thread):
+ def __init__(self, pool, CommonsDelinker):
+ threadpool.Thread.__init__(self, pool)
+ self.CommonsDelinker = CommonsDelinker
+ self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
+ self.enabled = self.CommonsDelinker.config.get('enable_logging', True)
+
+ def run(self):
+ self.connect()
+ threadpool.Thread.run(self)
+
+ def connect(self):
+ self.database = connect_database()
+ self.cursor = self.database.cursor()
+
+
+ def log_result_legacy(self, timestamp, image, domain, namespace, page, status = "ok", newimage = None):
+ # TODO: Make sqlite3 ready
+
+ # The original delinker code cached log results,
+ # in order to limit the number of connections.
+ # However, since we are now using persistent
+ # connections, we can safely insert the result
+ # on the fly.
+ output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page))
+
+ # There is no need to escape each parameter if
+ # a parametrized call is made.
+ self.cursor.execute("""INSERT INTO %s (timestamp, img, wiki,
page_title,
+ namespace, status, newimg) VALUES
+ (%%s, %%s, %%s, %%s, %%s, %%s, %%s)""" %
self.CommonsDelinker.config['log_table'],
+ (timestamp, image, domain, page, namespace, status, newimage))
+ self.database.commit()
+
+ def log_result_new(self, timestamp, image, site_lang, site_family,
+ page_namespace, page_title, status = 'ok', new_image = None):
+
+ output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page_title))
+
+ self.cursor.execute("""INSERT INTO %s (timestamp, image, site_lang,
site_family,
+ page_namespace, page_title, status, new_image) VALUES
+ (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" %
self.CommonsDelinker.config['log_table'],
+ (timestamp, image, site_lang, site_family, page_namespace, page_title,
+ status, new_image))
+ self.database.commit()
+
+ def log_replacement(self, timestamp, old_image, new_image):
+ # TODO: Same as above
+
+ output(u'Replacing %s by %s done' % (old_image, new_image))
+ self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
+ timestamp = %%s AND old_image = %%s AND
+ new_image = %%s""" %
self.CommonsDelinker.config['replacer_table'],
+ (timestamp, old_image, new_image))
+ self.database.commit()
+
+ def do(self, args):
+ if not self.enabled: return
+ try:
+ if len(args) == 3:
+ self.log_replacement(*args)
+ else:
+ if self.sql_layout == 'new':
+ self.log_result_new(*args)
+ else:
+ self.log_result_legacy(*args)
+ except:
+ # Something unexpected happened. Report and die.
+ output('An exception occurred in %s' % self, False)
+ traceback.print_exc(file = sys.stderr)
+ self.exit()
+ self.CommonsDelinker.thread_died()
+
+class CommonsDelinker(object):
+ def __init__(self):
+ self.config = config.CommonsDelinker
+ self.site = wikipedia.getSite()
+ self.site.forceLogin()
+
+ # Initialize workers
+ self.CheckUsages = threadpool.ThreadPool(CheckUsage)
+ [self.CheckUsages.add_thread(self) for i in xrange(self.config['checkusage_instances'])]
+
+ self.Delinkers = threadpool.ThreadPool(Delinker)
+ [self.Delinkers.add_thread(self) for i in xrange(self.config['delinker_instances'])]
+
+ self.Loggers = threadpool.ThreadPool(Logger)
+ if self.config.get('enable_logging', True):
+ [self.Loggers.add_thread(self) for i in xrange(self.config['logger_instances'])]
+ else:
+ self.Loggers.add_thread(self)
+
+ self.http = checkusage.HTTP(self.site.hostname())
+
+ self.edit_list = []
+ self.editLock = threading.Lock()
+
+ self.sites = {}
+ self.siteLock = threading.Lock()
+
+ self.SummaryCache = SummaryCache(self)
+
+ if self.config.get('enable_replacer', False):
+ self.connect_mysql()
+
+ if self.config.get('no_sysop', False):
+ # Don't edit as sysop
+ if hasattr(config, 'sysopnames'):
+ config.sysopnames = dict([(fam, {}) for fam in config.sysopnames.keys()])
+
+ self.last_check = time.time()
+
+ #if 'bot' in self.site.userGroups:
+ # self.log_limit = '5000'
+ #else:
+ # self.log_limit = '500'
+ self.log_limit = '500'
+
+ def connect_mysql(self):
+ self.database = connect_database()
+ self.cursor = self.database.cursor()
+
+ def set_edit(self, domain, page):
+ """ Make sure the bot does not create edit
+ conflicts with itself."""
+ self.editLock.acquire()
+ being_edited = (domain, page) in self.edit_list
+ if not being_edited:
+ self.edit_list.append((domain, page))
+ self.editLock.release()
+ return being_edited
+ def unset_edit(self, domain, page):
+ """ Done editting. """
+ self.editLock.acquire()
+ self.edit_list.remove((domain, page))
+ self.editLock.release()
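+ # Typical usage (a sketch of the protocol used by Delinker.delink_image):
+ #
+ #   if not CD.set_edit(str(site), title):
+ #       try:
+ #           ... edit the page ...
+ #       finally:
+ #           CD.unset_edit(str(site), title)
+ #
+ # set_edit() returns True if the page is already being edited, in which
+ # case the caller must postpone and must not call unset_edit().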
+
+ def get_site(self, code, fam):
+ # Threadsafe replacement of wikipedia.getSite
+ key = '%s:%s' % (code, fam)
+ self.siteLock.acquire()
+ try:
+ if key not in self.sites:
+ self.sites[key] = []
+ for site, used in self.sites[key]:
+ if not used:
+ self.sites[key][self.sites[key].index((site, False))] = (site, True)
+ return site
+ site = wikipedia.Site(code, fam)
+ self.sites[key].append((site, True))
+ return site
+ finally:
+ self.siteLock.release()
+ def unlock_site(self, site):
+ key = '%s:%s' % (site.lang, site.family.name)
+ self.siteLock.acquire()
+ try:
+ self.sites[key][self.sites[key].index((site, True))] = (site, False)
+ finally:
+ self.siteLock.release()
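+ # get_site() and unlock_site() form a small checkout pool: each
+ # 'code:family' key maps to a list of (site, in_use) pairs so that no
+ # two threads share a Site object. A sketch of the intended pattern:
+ #
+ #   site = CD.get_site('nl', wikipedia.Family('wikipedia'))
+ #   try:
+ #       ... use site ...
+ #   finally:
+ #       CD.unlock_site(site)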
+
+
+ def read_deletion_log(self):
+ ts_format = '%Y-%m-%dT%H:%M:%SZ'
+ wait = self.config['delink_wait']
+ exclusion = self.config['exclude_string']
+
+ ts_from = self.last_check
+ # Truncate -> int()
+ ts_end = int(time.time())
+ self.last_check = ts_end
+
+ # Format as a MediaWiki timestamp and subtract a
+ # certain wait period.
+ ts_from_s = time.strftime(ts_format, time.gmtime(ts_from - wait + 1))
+ ts_end_s = time.strftime(ts_format, time.gmtime(ts_end - wait))
+
+ try:
+ # Assume fewer than 500 deletions have been made between
+ # this and the previous check of the log. If that is not
+ # the case, the timeout should be set lower.
+ result = self.http.query_api(self.site.hostname(), self.site.apipath(),
+ action = 'query', list = 'logevents', letype = 'delete',
+ lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
+ ledir = 'newer')
+ logevents = result['query']['logevents']
+ except Exception, e:
+ if type(e) in (SystemError, KeyboardInterrupt): raise
+ # Something happened, but since it is a network error,
+ # it will not be critical. In order to prevent data loss
+ # the last_check timestamp has to be set correctly.
+ self.last_check = ts_from
+ output('Warning! Unable to read deletion logs', False)
+ output('%s: %s' % (e.__class__.__name__, str(e)), False)
+ return time.sleep(self.config['timeout'])
+
+ for logevent in logevents:
+ if logevent['ns'] == 6 and logevent['action'] == 'delete':
+ if exclusion not in logevent.get('comment', ''):
+ timestamp = logevent['timestamp']
+ timestamp = timestamp.replace('-', '')
+ timestamp = timestamp.replace(':', '')
+ timestamp = timestamp.replace('T', '')
+ timestamp = timestamp.replace('Z', '')
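+ # e.g. u'2007-09-29T14:49:01Z' becomes u'20070929144901'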
+
+ output(u'Deleted image: %s' % logevent['title'])
+ self.CheckUsages.append((checkusage.strip_ns(logevent['title']),
+ timestamp, logevent['user'], logevent.get('comment', ''),
+ None))
+ else:
+ output(u'Skipping deleted image: %s' % logevent['title'])
+
+ def read_replacement_log(self):
+ # TODO: Make sqlite3 ready
+ # TODO: Single process replacer
+ update = """UPDATE %s SET status = %%s WHERE id = %%s"""
% \
+ self.config['replacer_table']
+ self.cursor.execute("""SELECT id, timestamp, old_image, new_image, user,
comment
+ FROM %s WHERE status = 'pending'""" %
self.config['replacer_table'])
+ result = ([universal_unicode(s) for s in i] for i in self.cursor.fetchall())
+
+
+ for id, timestamp, old_image, new_image, user, comment in result:
+ # TODO: remove code; should now be part of the replacer
+ if (not old_image.lower().endswith('.svg')) and \
+ new_image.lower().endswith('.svg'):
+ output(u'Refused to replace %s by %s' % (old_image, new_image))
+ self.cursor.execute(update, ('refused', id))
+ else:
+ self.CheckUsages.append((old_image, timestamp, user, comment, new_image))
+ output(u'Replacing %s by %s' % (old_image, new_image))
+ self.cursor.execute(update, ('ok', id))
+
+ self.database.commit()
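+ # The replacer_table rows consumed here are assumed to hold (id,
+ # timestamp, old_image, new_image, user, comment, status): status moves
+ # from 'pending' to 'ok' or 'refused' above, and to 'done' in
+ # Logger.log_replacement once all usage has been replaced.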
+
+ def start(self):
+ # Gracefully exit all threads on SIG_INT or SIG_TERM
+ threadpool.catch_signals()
+
+ # Start threads
+ self.Loggers.start()
+ self.Delinkers.start()
+ self.CheckUsages.start()
+
+ # Give threads some time to initialize
+ time.sleep(self.config['timeout'])
+ output(u'All workers started')
+
+ if self.config.get('monitor'):
+ # For debugging a special monitor may be used.
+ # This monitor is optional, and will be used if
+ # a configuration variable monitor is set and True.
+ import monitor
+ monitor.Monitor(self).start()
+
+ # Main loop
+ while True:
+ if self.config.get('enable_delinker', True):
+ if 'deletion_log_table' in self.config:
+ if not self.read_deletion_log_db():
+ self.read_deletion_log()
+ else:
+ self.read_deletion_log()
+ if self.config.get('enable_replacer', False):
+ self.read_replacement_log()
+
+ time.sleep(self.config['timeout'])
+
+ def thread_died(self):
+ # A thread died; we may no longer be able to function.
+ # Currently only CheckUsages and Loggers are tracked;
+ # Delinkers should not be able to die.
+ cu = 0
+ self.CheckUsages.jobLock.acquire()
+ for thread in self.CheckUsages.threads:
+ if thread.isAlive() and not thread.quit:
+ cu += 1
+ self.CheckUsages.jobLock.release()
+ lg = 0
+ self.Loggers.jobLock.acquire()
+ for thread in self.Loggers.threads:
+ if thread.isAlive() and not thread.quit:
+ lg += 1
+ unlogged = self.Loggers.jobQueue[:]
+ self.Loggers.jobLock.release()
+
+ # We can no longer function if we have only one
+ # CheckUsage or zero Loggers available.
+ # TODO: config settings?
+ if cu <= 1:
+ output(u'ERROR!!! Too few CheckUsages left to function', False)
+ threadpool.terminate()
+ if lg <= 0:
+ output(u'ERROR!!! Too few Loggers left to function', False)
+ print >>sys.stderr, 'Currently unlogged:', unlogged
+ threadpool.terminate()
+
+def output(message, toStdout = True):
+ message = time.strftime('[%Y-%m-%d %H:%M:%S] ') + message
+ wikipedia.output(message, toStdout = toStdout)
+ if toStdout:
+ sys.stdout.flush()
+ else:
+ sys.stderr.flush()
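+# output() merely prefixes a timestamp before handing the message to
+# wikipedia.output, e.g.:
+#
+#   output(u'All workers started')
+#   # -> [2007-09-29 14:49:01] All workers started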
+
+if __name__ == '__main__':
+ output(u'Running ' + __version__)
+ CD = CommonsDelinker()
+ output(u'This bot runs from: ' + str(CD.site))
+
+ re._MAXCACHE = 4
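+ # Keep re's compiled-pattern cache tiny; the bot builds a fresh pattern
+ # for every image, which would otherwise bloat the cache (assumed
+ # rationale).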
+
+ args = wikipedia.handleArgs()
+ if '-since' in args:
+ # NOTE: Untested
+ ts_format = '%Y-%m-%d %H:%M:%S'
+ try:
+ since = time.strptime(
+ args[args.index('-since') + 1],
+ ts_format)
+ except ValueError:
+ if args[args.index('-since') + 1][0] == '[' and \
+ len(args) != args.index('-since') + 2:
+ since = time.strptime('%s %s' % \
+ (args[args.index('-since') + 1],
+ args[args.index('-since') + 2]),
+ '[%s]' % ts_format)
+ else:
+ raise ValueError('Incorrect time format!')
+ output(u'Reading deletion log since [%s]' %\
+ time.strftime(ts_format, since))
+ CD.last_check = time.mktime(since)
+
+ try:
+ try:
+ CD.start()
+ except Exception, e:
+ if type(e) not in (SystemExit, KeyboardInterrupt):
+ output('An exception occurred in the main thread!', False)
+ traceback.print_exc(file = sys.stderr)
+ threadpool.terminate()
+ finally:
+ output(u'Stopping CommonsDelinker')
+ wikipedia.stopme()
+ # Flush the standard streams
+ sys.stdout.flush()
+ sys.stderr.flush()