http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9053
Revision: 9053
Author:   xqt
Date:     2011-03-13 12:24:49 +0000 (Sun, 13 Mar 2011)

Log Message:
-----------
stripped trailing whitespace
Modified Paths:
--------------
    trunk/pywikipedia/commonsdelinker/checkusage.py
    trunk/pywikipedia/commonsdelinker/delinker.py
    trunk/pywikipedia/commonsdelinker/image_replacer.py
    trunk/pywikipedia/commonsdelinker/threadpool.py
Modified: trunk/pywikipedia/commonsdelinker/checkusage.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/checkusage.py	2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/checkusage.py	2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,32 +1,32 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """
-This module provides a way for users of the Wikimedia toolserver to check the 
+This module provides a way for users of the Wikimedia toolserver to check the
 use of images from Commons on other Wikimedia wikis. It supports both running
-checkusage against the database and against the live wikis. It is very 
-efficient as it only creates one HTTP connection and one MySQL connection 
+checkusage against the database and against the live wikis. It is very
+efficient as it only creates one HTTP connection and one MySQL connection
 during its life time. It is not suitable for multithreading!
- 
+
 The CheckUsage class' constructor accept as parameters the maximum number of
 wikis that should be checked, an option to use it only live and the parameters
-to connect to the MySQL database. The top wikis in size will be checked. The 
+to connect to the MySQL database. The top wikis in size will be checked. The
 class provides multiple methods:
- 
+
 get_usage(image)
-This method will return a generator object that generates the usage of the 
+This method will return a generator object that generates the usage of the
 image, returned as the following tuple: (page_namespace, page_title,
 full_title). page_namespace is the numeric namespace, page_title the page
 title without namespace, full_title the page title including localized
 namespace.
- 
+
 get_usage_db(dbname, image), get_usage_live(domain, image)
 Those methods allow querying a specific wiki, respectively against the database
 and against the live wiki. They accept respectively the database name and the
-domain name. The return a generator which generates the same results as 
+domain name. The return a generator which generates the same results as
 get_usage().
- 
+
 get_usage_multi(images)
 Calls get_usage for each image and returns a dictionary with usages.
- 
+
 get_replag(dbname)
 Returns the time in seconds since the latest known edit of dbname.
 """
@@ -37,13 +37,13 @@
 #
 __version__ = '$Id$'
 #
- 
+
 import httplib, urlparse, socket, time
 from urllib import urlencode
 import simplejson
 import wikipedia, family
- 
+
 try:
     import MySQLdb
 except ImportError:
@@ -53,7 +53,7 @@
 except ImportError:
     pass
 __ver__ = '0.4c'
- 
+
 def strip_ns(title):
     title = title.replace(' ', '_')
     if title.find(':') != -1:
@@ -63,14 +63,14 @@
     if title.startswith('Image:'):
         return strip_ns(title)
     return title
- 
+
 def family(domain):
     if domain is None:
         raise RuntimeError('None is not a valid family')
- 
+
     wiki = domain.split('.')
     # Standard family
-    if wiki[1] in ('wikipedia', 'wiktionary', 'wikibooks', 
+    if wiki[1] in ('wikipedia', 'wiktionary', 'wikibooks',
             'wikiquote', 'wikisource', 'wikinews', 'wikiversity'):
         return wiki[0], wiki[1]
     # Family on own domain
@@ -92,7 +92,7 @@
         #self._conn.set_debuglevel(100)
         self._conn.connect()
 
-    def request(self, method, path, headers, data): 
+    def request(self, method, path, headers, data):
         if not headers: headers = {}
         if not data: data = ''
         headers['Connection'] = 'Keep-Alive'
@@ -143,28 +143,28 @@
             data = simplejson.load(res)
         finally:
             res.close()
- 
+
         if 'error' in data:
             if data['error']['code'] == u'internal_api_error_DBConnectionError':
                 return self.query_api(host, path, **kwargs)
-            raise wikipedia.Error(data['error']['code'], 
+            raise wikipedia.Error(data['error']['code'],
                 data['error']['info'])
- 
+
         return data
     def close(self):
         self._conn.close()
 
 class HTTPPool(list):
-    def __init__(self, retry_timeout = 10, max_retries = -1, 
+    def __init__(self, retry_timeout = 10, max_retries = -1,
             callback = lambda *args: None):
- 
+
         self.retry_timeout = retry_timeout
         self.max_retries = -1
         self.callback = callback
         self.current_retry = 0
- 
+
         list.__init__(self, ())
- 
+
     def query_api(self, host, path, **kwargs):
         conn = self.find_conn(host)
         while True:
@@ -180,7 +180,7 @@
             self.wait()
             conn = self.find_conn(host)
- 
+
     def find_conn(self, host):
         for conn in self:
             if host in conn.hosts:
@@ -199,37 +199,37 @@
         conn.hosts = []
         self.append(conn)
         return self
- 
+
     def wait(self):
         if self.current_retry > self.max_retries and self.max_retries != -1:
             raise RuntimeError('Maximum retries exceeded')
         if self.current_retry:
             self.callback(self)
-            time.sleep(self.current_retry * self.retry_timeout) 
+            time.sleep(self.current_retry * self.retry_timeout)
         self.current_retry += 1
- 
+
     def close(self):
         for conn in self:
            conn.close()
        del self[:]
- 
+
 class CheckUsage(object):
-    def __init__(self, limit = 100, 
+    def __init__(self, limit = 100,
        mysql_default_server = 3, mysql_host_prefix = 'sql-s', mysql_host_suffix = '',
-        mysql_kwargs = {}, no_db = False, use_autoconn = False, 
- 
-        http_retry_timeout = 30, http_max_retries = -1, 
+        mysql_kwargs = {}, no_db = False, use_autoconn = False,
+
+        http_retry_timeout = 30, http_max_retries = -1,
        http_callback = lambda *args: None,
- 
+
        mysql_retry_timeout = 60, mysql_max_retries = -1,
        mysql_callback = lambda *args: None):
- 
-        self.http = None 
+
+        self.http = None
        self.http_retry_timeout = http_retry_timeout
        self.http_max_retries = http_max_retries
        self.http_callback = http_callback
- 
+
        if no_db: return
        self.mysql_host_prefix = mysql_host_prefix
@@ -239,18 +239,18 @@
        self.mysql_retry_timeout = mysql_retry_timeout
        self.mysql_max_retries = mysql_max_retries
        self.mysql_callback = mysql_callback
- 
+
        self.connections = []
- 
+
        # Mapping database name -> mysql connection
        self.databases = {}
        # Mapping server id -> mysql connection
        self.servers = {}
        # Mapping database name -> (lang, family)
        self.sites = {}
- 
+
        self.domains = {}
- 
+
        self.unknown_families = []
        # Mapping family name -> family object
        self.known_families = {}
@@ -263,7 +263,7 @@
        for dbname, domain, server in cursor.fetchall():
            if server not in self.servers:
                self.servers[server] = self.connect_mysql(mysql_host_prefix + str(server) + mysql_host_suffix)
- 
+
            # FIXME: wikimediafoundation!
            # TODO: This is one big mess
            try:
@@ -275,7 +275,7 @@
            else:
                self.sites[dbname] = (lang, fam)
            self.databases[dbname] = self.servers[server]
- 
+
            self.domains[dbname] = domain
 
@@ -286,8 +286,8 @@
        if self.use_autoconn:
            database = mysql_autoconnection.connect(
                use_unicode = False, host = host,
-                retry_timeout = self.mysql_retry_timeout, 
-                max_retries = self.mysql_max_retries, 
+                retry_timeout = self.mysql_retry_timeout,
+                max_retries = self.mysql_max_retries,
                callback = self.mysql_callback,
                **self.mysql_kwargs)
        else:
@@ -298,7 +298,7 @@
        return database, cursor
    def connect_http(self):
        if not self.http:
-            self.http = HTTPPool(retry_timeout = self.http_retry_timeout, 
+            self.http = HTTPPool(retry_timeout = self.http_retry_timeout,
                max_retries = self.http_max_retries,
                callback = self.http_callback)
 
    def get_usage(self, image):
@@ -311,14 +311,14 @@
        #image = strip_image(image)
        lang, family_name = self.sites[dbname]
        family = self.known_families[family_name]
- 
+
        if family.shared_image_repository(lang) != (lang, family_name) and shared:
            left_join = 'LEFT JOIN %s.image ON (il_to = img_name) WHERE img_name IS NULL AND' % dbname
        else:
            left_join = 'WHERE';
        query = """SELECT page_namespace, page_title FROM %s.page, %s.imagelinks
            %s page_id = il_from AND il_to = %%s"""
-        self.databases[dbname][1].execute(query % (dbname, dbname, left_join), 
+        self.databases[dbname][1].execute(query % (dbname, dbname, left_join),
            (image.encode('utf-8', 'ignore'), ))
        for page_namespace, page_title in self.databases[dbname][1]:
            stripped_title = page_title.decode('utf-8', 'ignore')
@@ -330,32 +330,32 @@
    def get_usage_live(self, site, image, shared = False):
        self.connect_http()
- 
+
        if type(site) is str:
            hostname = site
            apipath = '/w/api.php'
        else:
            hostname = site.hostname()
            apipath = site.apipath()
- 
+
        # FIXME: Use continue
        kwargs = {'action': 'query', 'iutitle': u'Image:' + image,
            'titles': u'Image:' + image, 'prop': 'info'}
        kwargs['list'] = 'imageusage'
        kwargs['iulimit'] = '500'
- 
+
        res = self.http.query_api(hostname, apipath, **kwargs)
        if '-1' not in res['query']['pages'] and shared:
            return
- 
+
        usages = res['query'].get('imageusage')
        if not usages: return
- 
+
        # Apparently this someday changed from dict to list?
        if type(usages) is dict:
            usages = usages.values()
- 
+
        for usage in usages:
            title = usage['title'].replace(' ', '_')
            namespace = usage['ns']
@@ -365,7 +365,7 @@
                stripped_title = title
            yield namespace, stripped_title, title
- 
+
    def exists(self, site, image):
        self.connect_http()
        # Check whether the image still is deleted on Commons.
@@ -375,8 +375,8 @@
        # BUG: This is ugly.
        return '-1' not in self.http.query_api(site.hostname(), site.apipath(),
            action = 'query', titles = 'Image:' + image)['query']['pages']
- 
- 
+
+
    def close(self):
        if getattr(self, 'http'):
            self.http.close()
@@ -384,7 +384,6 @@
        for connection, cursor in self.databases.itervalues():
            try:
                connection.close()
-            except: 
+            except:
                pass
- 
\ No newline at end of file
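For orientation, the checkusage API documented in the module docstring above can be driven roughly as follows. This is an illustrative sketch, not part of the commit; the hostname and image title are made-up examples, and a toolserver run would additionally pass real MySQL credentials via mysql_kwargs instead of no_db:

    import checkusage

    # no_db = True queries the live wikis over HTTP and skips MySQL entirely.
    cu = checkusage.CheckUsage(limit = 100, no_db = True)
    # get_usage_live() accepts a plain hostname string and yields
    # (namespace, stripped_title, full_title) tuples, per the code above.
    for namespace, stripped_title, full_title in cu.get_usage_live(
            'commons.wikimedia.org', u'Example.jpg'):
        print namespace, full_title
    cu.close()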
Modified: trunk/pywikipedia/commonsdelinker/delinker.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/delinker.py	2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/delinker.py	2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """
-This script keeps track of image deletions and delinks removed files 
+This script keeps track of image deletions and delinks removed files
 from (any) wiki. Usage on protected pages or pages containing blacklisted
 external links cannot be processed.
 
@@ -15,7 +15,7 @@
 Please refer to delinker.txt for full documentation.
 """
 #
-# 
+#
 # (C) Kyle/Orgullomoore, 2006-2007
 # (C) Siebrand Mazeland, 2006-2007
 # (C) Bryan Tong Minh, 2007-2008
@@ -55,7 +55,7 @@
        output(u'%s Connection has been lost in %s. Attempting reconnection.' % (threading.currentThread(), repr(object)), False)
    if hasattr(object, 'error'):
        output(u'Error was %s: %s' % tuple(object.error))
- 
+
 def universal_unicode(s):
    if type(s) is str:
        return s.decode('utf-8', 'ignore')
@@ -75,11 +75,11 @@
        # the standard MySQL character set.
        kwargs['use_unicode'] = False
        kwargs['callback'] = wait_callback
- 
+
        return mysql_autoconnection.connect(**kwargs)
    # TODO: Add support for sqlite3
    raise RuntimeError('Unsupported database engine %s' % engine)
- 
+
 class ImmutableByReference(object):
    def __init__(self, data):
        self.data = data
@@ -100,30 +100,30 @@
        threadpool.Thread.__init__(self, pool)
        self.CommonsDelinker = CommonsDelinker
        self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
- 
+
    def delink_image(self, image, usage, timestamp, admin, reason, replacement = None):
        """ Performs the delink for image on usage. """
        output(u'%s Usage of %s: %s' % (self, image, usage))
        if self.CommonsDelinker.exec_hook('before_delink',
                (image, usage, timestamp, admin, reason, replacement)) is False:
            return
- 
+
        skipped_images = {}
        for (lang, family), pages in usage.iteritems():
            site = self.CommonsDelinker.get_site(lang, family)
            if not site:
                output(u'%s Warning! Unknown site %s:%s' % (self, family, lang))
                continue
- 
+
            try:
                summary = self.get_summary(site, image, admin, reason, replacement)
- 
+
                for page_namespace, page_title, title in pages:
                    if (site.lang, site.family.name) == (self.CommonsDelinker.site.lang,
                            self.CommonsDelinker.site.family.name) and \
                            (page_namespace, page_title) == (6, image):
                        continue
- 
+
                    if self.CommonsDelinker.set_edit(str(site), title):
                        # The page is currently being editted. Postpone.
                        if (lang, family) not in skipped_images:
@@ -133,7 +133,7 @@
                    else:
                        # Delink the image
                        output(u'%s Delinking %s from %s' % (self, image, site))
- 
+
                        try:
                            try:
                                result = self.replace_image(image, site, title, summary, replacement)
@@ -147,14 +147,14 @@
                                    (page_namespace, page_title, title))
                        finally:
                            self.CommonsDelinker.unset_edit(str(site), title)
- 
+
                        # Add to logging queue
                        if self.sql_layout == 'new':
-                            self.CommonsDelinker.Loggers.append((timestamp, image, 
+                            self.CommonsDelinker.Loggers.append((timestamp, image,
                                site.lang, site.family.name, page_namespace, page_title, result, replacement))
                        else:
-                            self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(), 
+                            self.CommonsDelinker.Loggers.append((timestamp, image, site.hostname(),
                                page_namespace, page_title, result, replacement))
            finally:
                self.CommonsDelinker.unlock_site(site)
@@ -168,14 +168,14 @@
        elif replacement:
            # Let them know that we are done replacing.
            self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
- 
+
    def replace_image(self, image, site, page_title, summary, replacement = None):
        """ The actual replacement. Giving None as argument for replacement
        will delink instead of replace."""
- 
+
        page = wikipedia.Page(site, page_title)
        hook = None
- 
+
        # TODO: Per site config.
        if page.namespace() in self.CommonsDelinker.config['delink_namespaces']:
            try:
@@ -183,25 +183,25 @@
            except wikipedia.NoPage:
                return 'failed'
            new_text = text
- 
+
            m_image = ImmutableByReference(image)
            m_replacement = ImmutableByReference(replacement)
-            self.CommonsDelinker.exec_hook('before_replace', 
+            self.CommonsDelinker.exec_hook('before_replace',
                (page, summary, m_image, m_replacement))
            image = m_image.get()
            replacement = m_replacement.get()
- 
+
            def create_regex(s):
                first, other = re.escape(s[0]), re.escape(s[1:])
                return ur'(?:[%s%s]%s)' % (first.upper(), first.lower(), other)
            def create_regex_i(s):
                return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(), c.lower()) for c in s])
- 
+
            namespaces = site.namespace(6, all = True) + site.namespace(-2, all = True)
            r_namespace = ur'\s*(?:%s)\s*:\s*' % u'|'.join(map(create_regex_i, namespaces))
            # Note that this regex creates a group!
            r_image = u'(%s)' % create_regex(image).replace(r'_', '[ _]')
- 
+
            def simple_replacer(match):
                m_replacement = ImmutableByReference(replacement)
                groups = list(match.groups())
@@ -209,21 +209,21 @@
                if False is self.CommonsDelinker.exec_hook('%s_replace' % hook,
                        (page, summary, image, m_replacement, match, groups)):
                    return u''.join(groups)
- 
+
                if m_replacement.get() is None:
                    return u''
                else:
                    groups[1] = m_replacement.get()
                    return u''.join(groups)
- 
-            # Previously links in image descriptions will cause 
+
+            # Previously links in image descriptions will cause
            # unexpected behaviour: [[Image:image.jpg|thumb|[[link]] in description]]
            # will truncate at the first occurence of ]]. This cannot be
            # fixed using one regular expression.
            # This means that all ]] after the start of the image
            # must be located. If it then does not have an associated
            # [[, this one is the closure of the image.
- 
+
            r_simple_s = u'(\[\[%s)%s' % (r_namespace, r_image)
            r_s = '\[\['
            r_e = '\]\]'
@@ -231,25 +231,25 @@
            image_starts = [match.start() for match in re.finditer(r_simple_s, text)]
            link_starts = [match.start() for match in re.finditer(r_s, text)]
            link_ends = [match.end() for match in re.finditer(r_e, text)]
- 
+
            r_simple = u'(\[\[%s)%s(.*)' % (r_namespace, r_image)
            hook = 'simple'
            replacements = []
            for image_start in image_starts:
-                current_link_starts = [link_start for link_start in link_starts 
+                current_link_starts = [link_start for link_start in link_starts
                    if link_start > image_start]
-                current_link_ends = [link_end for link_end in link_ends 
+                current_link_ends = [link_end for link_end in link_ends
                    if link_end > image_start]
                end = image_start
                if current_link_ends: end = current_link_ends[0]
- 
+
                while current_link_starts and current_link_ends:
                    start = current_link_starts.pop(0)
                    end = current_link_ends.pop(0)
                    if end <= start and end > image_start:
                        # Found the end of the image
                        break
- 
+
                # Check whether this image is the first one on the line
                if image_start == 0:
                    prev = ''
@@ -262,38 +262,38 @@
                        end += 1
                    else:
                        break
- 
+
                # Add the replacement to the todo list. Doing the
                # replacement right know would alter the indices.
                replacements.append((new_text[image_start:end],
-                    re.sub(r_simple, simple_replacer, 
+                    re.sub(r_simple, simple_replacer,
                    new_text[image_start:end])))
- 
+
            # Perform the replacements
            for old, new in replacements:
                if old: new_text = new_text.replace(old, new)
- 
+
            # Remove the image from galleries
            hook = 'gallery'
-            r_galleries = ur'(?s)(<%s>)(.*?)(</%s>)' % (create_regex_i('gallery'), 
+            r_galleries = ur'(?s)(<%s>)(.*?)(</%s>)' % (create_regex_i('gallery'),
                create_regex_i('gallery'))
            r_gallery = ur'(?m)^((?:%s)?)%s(\s*(?:\|.*?)?\s*$)' % (r_namespace, r_image)
            def gallery_replacer(match):
-                return ur'%s%s%s' % (match.group(1), re.sub(r_gallery, 
+                return ur'%s%s%s' % (match.group(1), re.sub(r_gallery,
                    simple_replacer, match.group(2)), match.group(3))
            new_text = re.sub(r_galleries, gallery_replacer, new_text)
- 
+
            if text == new_text or self.CommonsDelinker.config.get('force_complex', False):
                # All previous steps did not work, so the image is
                # likely embedded in a complicated template.
                hook = 'complex'
                r_templates = ur'(?s)({{.*?}})'
                r_complicated = u'(?s)(?<=[|{=])[\s\u200E\uFEFF\u200B\u200C]*((?:%s)?)%s[\u200E\uFEFF\u200B\u200C]*' % (r_namespace, r_image)
- 
+
                def template_replacer(match):
                    return re.sub(r_complicated, simple_replacer, match.group(1))
                new_text = re.sub(r_templates, template_replacer, text)
- 
+
            if text != new_text:
                # Save to the wiki
                # Code for checking user page existance has been moved
@@ -304,7 +304,7 @@
                if False is self.CommonsDelinker.exec_hook('before_save',
                        (page, text, new_text, m_summary)):
                    return 'skipped'
- 
+
                is_retry = False
                while True:
                    try:
@@ -330,18 +330,18 @@
                else:
                    return 'skipped'
        return 'skipped'
- 
- 
- 
+
+
+
    def do(self, args):
        try:
            self.delink_image(*args)
        except:
            output(u'An exception occured in %s' % self, False)
            traceback.print_exc(file = sys.stderr)
- 
+
    def get_summary(self, site, image, admin, reason, replacement):
-        """ Get the summary template and substitute the 
+        """ Get the summary template and substitute the
        correct values."""
        # FIXME: Hardcode is EVIL, but now only the global bot uses this
        if (site.lang != 'commons' and self.CommonsDelinker.config['global']):
@@ -350,7 +350,7 @@
            tlp = self.CommonsDelinker.SummaryCache.get(site, 'replace-I18n')
        else:
            tlp = self.CommonsDelinker.SummaryCache.get(site, 'summary-I18n')
- 
+
        tlp = tlp.replace('$1', image)
        if replacement:
            tlp = tlp.replace('$2', replacement)
@@ -359,23 +359,23 @@
        else:
            tlp = tlp.replace('$2', unicode(admin))
        tlp = tlp.replace('$3', unicode(reason))
- 
+
        return tlp
- 
+
 class SummaryCache(object):
    """ Object to thread-safe cache summary templates. """
    def __init__(self, CommonsDelinker):
        self.summaries = {}
        self.lock = threading.Lock()
        self.CommonsDelinker = CommonsDelinker
- 
+
    def get(self, site, type, key = None, default = None):
-        # This can probably also provide something for 
-        # localised settings, but then it first needs to 
+        # This can probably also provide something for
+        # localised settings, but then it first needs to
        # check whether the page is sysop only.
        if not key:
            key = str(site)
- 
+
        self.lock.acquire()
        try:
            if type not in self.summaries:
@@ -385,9 +385,9 @@
                    self.CommonsDelinker.config['summary_cache']:
                # Return cached result
                return self.summaries[type][key][0]
- 
+
            output(u'%s Fetching new summary for %s' % (self, site))
- 
+
            # FIXME: evil
            if self.CommonsDelinker.config['global']:
                self.check_user_page(site)
@@ -402,25 +402,25 @@
            pass
        finally:
            self.lock.release()
- 
+
        # No i18n available, but it may be available in the wikipedia
        # of that language. Only do so for wiktionary, wikibooks,
        # wikiquote, wikisource, wikinews, wikiversity
        # This will cause the bot to function even on special wikis
        # like mediawiki.org and meta and species.
        output(u'%s Using default summary for %s' % (self, site))
- 
+
        if default:
            return default
- 
+
        if site.family.name != 'wikipedia' and self.CommonsDelinker.config['global']:
-            if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote', 
+            if site.family.name in ('wiktionary', 'wikibooks', 'wikiquote',
                    'wikisource', 'wikinews', 'wikiversity'):
                if site.lang in config.usernames['wikipedia']:
-                    newsite = self.CommonsDelinker.get_site(site.lang, 
+                    newsite = self.CommonsDelinker.get_site(site.lang,
                        wikipedia.Family('wikipedia'))
                    return self.get(newsite, type, key = key)
        return self.CommonsDelinker.config['default_settings'].get(type, '')
- 
+
    def check_user_page(self, site):
        "Check whether a userpage exists. Only used for CommonsDelinker."
        try:
@@ -435,24 +435,24 @@
            ftxt = f.read()
            f.close()
            if not '#' + str(site) in ftxt:
-                username = config.usernames[site.family.name][site.lang] 
- 
+                username = config.usernames[site.family.name][site.lang]
+
                userpage = wikipedia.Page(site, 'User:' + username)
-                # Removed check for page existence. If it is not in our 
+                # Removed check for page existence. If it is not in our
                # database we can safely assume that we have no user page
                # there. In case there is, we will just overwrite it once.
-                # It causes no real problems, but it is one call to the 
+                # It causes no real problems, but it is one call to the
                # servers less.
                # TODO: Config setting?
                userpage.put('#REDIRECT [[m:User:CommonsDelinker]]', '')
- 
+
                f = open(filename, 'a')
                f.write('#' + str(site))
                f.close()
        except wikipedia.LockedPage:
            # User page is protected, continue anyway
-            pass 
- 
+            pass
+
 class CheckUsage(threadpool.Thread):
    timeout = 120
    def __init__(self, pool, CommonsDelinker):
@@ -460,14 +460,14 @@
        self.CommonsDelinker = CommonsDelinker
        # Not really thread safe, but we should only do read operations...
        self.site = CommonsDelinker.site
- 
+
    def run(self):
        try:
            self.connect()
        except:
            return self.exit()
        threadpool.Thread.run(self)
- 
+
    def connect(self):
        output(u'%s Connecting to databases' % self)
        config = self.CommonsDelinker.config
@@ -475,22 +475,22 @@
            # Note: global use requires MySQL
            self.CheckUsage = checkusage.CheckUsage(limit = sys.maxint,
                mysql_kwargs = config['sql_config'],
-                use_autoconn = True, 
+                use_autoconn = True,
                http_callback = wait_callback,
                mysql_callback = wait_callback,
                mysql_host_suffix = '-fast')
        else:
            self.CheckUsage = checkusage.CheckUsage(sys.maxint,
                http_callback = wait_callback, no_db = True)
- 
- 
+
+
    def check_usage(self, image, timestamp, admin, reason, replacement):
        """ Check whether this image needs to be delinked. """
- 
+
        # Check whether the image still is deleted on Commons.
        # BUG: This also returns true for images with a page, but
        # without the image itself. Can be fixed by querying query.php
-        # instead of api.php. Also should this be made as an exits() 
+        # instead of api.php. Also should this be made as an exits()
        # method of checkusage.CheckUsage?
        if self.site.shared_image_repository() != (None, None):
            shared_image_repository = self.CommonsDelinker.get_site(*self.site.shared_image_repository())
@@ -505,12 +505,12 @@
                not bool(replacement):
            output(u'%s %s exists again!' % (self, image))
            return
- 
- 
+
+
        if self.CommonsDelinker.config['global']:
            usage = self.CheckUsage.get_usage(image)
            usage_domains = {}
- 
+
            count = 0
            # Sort usage per domain
            for (lang, family), (page_namespace, page_title, title) in usage:
@@ -520,21 +520,21 @@
                count += 1
        else:
            #FIX!
-            usage_domains = {(self.site.lang, self.site.family.name): 
-                list(self.CheckUsage.get_usage_live(self.site, 
+            usage_domains = {(self.site.lang, self.site.family.name):
+                list(self.CheckUsage.get_usage_live(self.site,
                image))}
            count = len(usage_domains[(self.site.lang, self.site.family.name)])
- 
+
        output(u'%s %s used on %s pages' % (self, image, count))
- 
+
        if count:
            # Pass the usage to the Delinker pool along with other arguments
-            self.CommonsDelinker.Delinkers.append((image, usage_domains, 
+            self.CommonsDelinker.Delinkers.append((image, usage_domains,
                timestamp, admin, reason, replacement))
        elif replacement:
            # Record replacement done
            self.CommonsDelinker.Loggers.append((timestamp, image, replacement))
- 
+
    def do(self, args):
        try:
            self.check_usage(*args)
@@ -544,12 +544,12 @@
            traceback.print_exc(file = sys.stderr)
            self.exit()
            self.CommonsDelinker.thread_died()
- 
+
    def starve(self):
        self.pool.jobLock.acquire()
        try:
            if self.pool[id(self)].isSet(): return False
- 
+
            output(u'%s Starving' % self)
            self.CheckUsage.close()
            del self.pool[id(self)]
@@ -557,66 +557,66 @@
            return True
        finally:
            self.pool.jobLock.release()
- 
+
 class Logger(threadpool.Thread):
    timeout = 360
- 
+
    def __init__(self, pool, CommonsDelinker):
        threadpool.Thread.__init__(self, pool)
        self.CommonsDelinker = CommonsDelinker
        self.sql_layout = self.CommonsDelinker.config.get('sql_layout', 'new')
        self.enabled = self.CommonsDelinker.config.get('enable_logging', True)
- 
+
    def run(self):
        self.connect()
        threadpool.Thread.run(self)
- 
+
    def connect(self):
        output(u'%s Connecting to log database' % self)
        self.database = connect_database()
        self.cursor = self.database.cursor()
- 
- 
+
+
    def log_result_legacy(self, timestamp, image, domain, namespace, page,
            status = "ok", newimage = None):
        # TODO: Make sqlite3 ready
- 
+
        # The original delinker code cached log results,
        # in order to limit the number of connections.
        # However, since we are now using persistent
        # connections, we can safely insert the result
        # on the fly.
        output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page))
- 
+
        # There is no need to escape each parameter if
-        # a parametrized call is made. 
+        # a parametrized call is made.
        self.cursor.execute("""INSERT INTO %s (timestamp, img, wiki, page_title,
            namespace, status, newimg) VALUES
            (%%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % self.CommonsDelinker.config['log_table'],
            (timestamp, image, domain, page, namespace, status, newimage))
        self.database.commit()
- 
-    def log_result_new(self, timestamp, image, site_lang, site_family, 
+
+    def log_result_new(self, timestamp, image, site_lang, site_family,
            page_namespace, page_title, status = 'ok', new_image = None):
- 
+
        output(u'%s Logging %s for %s on %s' % (self, repr(status), image, page_title))
        self.cursor.execute("""INSERT INTO %s (timestamp, image, site_lang,
            site_family, page_namespace, page_title, status, new_image) VALUES
            (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % self.CommonsDelinker.config['log_table'],
-            (timestamp, image, site_lang, site_family, page_namespace, page_title, 
+            (timestamp, image, site_lang, site_family, page_namespace, page_title,
            status, new_image))
        self.database.commit()
- 
+
    def log_replacement(self, timestamp, old_image, new_image):
        # TODO: Same as above
- 
+
        output(u'Replacing %s by %s done' % (old_image, new_image))
-        self.cursor.execute("""UPDATE %s SET status = 'done' WHERE 
-            timestamp = %%s AND old_image = %%s AND 
+        self.cursor.execute("""UPDATE %s SET status = 'done' WHERE
+            timestamp = %%s AND old_image = %%s AND
            new_image = %%s""" % self.CommonsDelinker.config['replacer_table'],
            (timestamp, old_image, new_image))
        self.database.commit()
- 
+
    def do(self, args):
        if not self.enabled: return
        try:
@@ -633,12 +633,12 @@
            traceback.print_exc(file = sys.stderr)
            self.exit()
            self.CommonsDelinker.thread_died()
- 
+
    def starve(self):
        self.pool.jobLock.acquire()
        try:
            if self.pool[id(self)].isSet(): return False
- 
+
            output(u'%s Starving' % self)
            self.database.close()
            del self.pool[id(self)]
@@ -653,7 +653,7 @@
        self.config = config.CommonsDelinker
        self.site = wikipedia.getSite()
        self.site.forceLogin()
- 
+
        # Initialize workers
        self.CheckUsages = threadpool.ThreadPool(CheckUsage, self.config['checkusage_instances'], self)
        self.Delinkers = threadpool.ThreadPool(Delinker, self.config['delinker_instances'], self)
@@ -661,34 +661,34 @@
            self.Loggers = threadpool.ThreadPool(Logger, self.config['logger_instances'], self)
        else:
            self.Loggers = threadpool.ThreadPool(Logger, 1, self)
- 
+
        self.http = checkusage.HTTP(self.site.hostname())
- 
+
        self.edit_list = []
        self.editLock = threading.Lock()
- 
+
        self.sites = {}
        self.siteLock = threading.Lock()
- 
+
        self.SummaryCache = SummaryCache(self)
- 
+
        if self.config.get('enable_replacer', False):
            self.connect_mysql()
- 
+
        if self.config.get('no_sysop', False):
            # Don't edit as sysop
            if hasattr(config, 'sysopnames'):
                config.sysopnames = dict([(fam, {}) for fam in config.sysopnames.keys()])
- 
+
        self.last_check = time.time()
- 
+
        #if 'bot' in self.site.userGroups:
        #    self.log_limit = '5000'
        #else:
        #    self.log_limit = '500'
        self.log_limit = '500'
        self.init_plugins()
- 
+
    def init_plugins(self, do_reload = False):
        import plugins
        self.hooks = {}
@@ -705,7 +705,7 @@
                    self.hooks[plugin.hook].append(plugin)
                    output(u"%s Loaded plugin %s for hook '%s'" % \
                        (self, plugin, plugin.hook))
- 
+
    def exec_hook(self, name, args):
        # TODO: Threadsafety!
        if name in self.hooks:
@@ -729,16 +729,16 @@
                    self.hooks[name].remove(plugin)
        finally:
            self.siteLock.release()
- 
+
    def reload_plugins(signalnum, stack):
        pass
- 
+
    def connect_mysql(self):
        self.database = connect_database()
        self.cursor = self.database.cursor()
- 
+
    def set_edit(self, domain, page):
-        """ Make sure the bot does not create edit 
+        """ Make sure the bot does not create edit
        conflicts with itself."""
        self.editLock.acquire()
        being_editted = (domain, page) in self.edit_list
@@ -751,9 +751,9 @@
        self.editLock.acquire()
        self.edit_list.remove((domain, page))
        self.editLock.release()
- 
+
    def get_site(self, code, fam):
-        # Threadsafe replacement of wikipedia.getSite 
+        # Threadsafe replacement of wikipedia.getSite
        key = '%s:%s' % (code, fam)
        self.siteLock.acquire()
        try:
@@ -779,34 +779,34 @@
                self.sites[key][self.sites[key].index((site, True))] = (site, False)
        finally:
            self.siteLock.release()
- 
- 
+
+
    def read_deletion_log(self):
        ts_format = '%Y-%m-%dT%H:%M:%SZ'
        wait = self.config['delink_wait']
        exclusion = self.config['exclude_string']
- 
+
        ts_from = self.last_check
        # Truncate -> int()
        ts_end = int(time.time())
        self.last_check = ts_end
- 
+
        # Format as a Mediawiki timestamp and substract a
        # certain wait period.
        ts_from_s = time.strftime(ts_format, time.gmtime(ts_from - wait + 1))
        ts_end_s = time.strftime(ts_format, time.gmtime(ts_end - wait))
- 
+
        try:
            # Assume less than 500 deletion have been made between
-            # this and the previous check of the log. If this is not 
+            # this and the previous check of the log. If this is not
            # the case, timeout should be set lower.
            result = self.http.query_api(self.site.hostname(), self.site.apipath(),
-                action = 'query', list = 'logevents', letype = 'delete', 
-                lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s, 
+                action = 'query', list = 'logevents', letype = 'delete',
+                lelimit = self.log_limit, lestart = ts_from_s, leend = ts_end_s,
                ledir = 'newer')
            logevents = result['query']['logevents']
        except Exception, e:
-            if type(e) in (SystemError, KeyboardInterrupt): raise 
+            if type(e) in (SystemError, KeyboardInterrupt): raise
            # Something happened, but since it is a network error,
            # it will not be critical. In order to prevent data loss
            # the last_check timestamp has to be set correctly.
@@ -814,7 +814,7 @@
            output('Warning! Unable to read deletion logs', False)
            output('%s: %s' % (e.__class__.__name__, str(e)), False)
            return time.sleep(self.config['timeout'])
- 
+
        for logevent in logevents:
            if logevent['ns'] == 6 and logevent['action'] == 'delete':
                if exclusion not in logevent.get('comment', ''):
@@ -823,14 +823,14 @@
                    timestamp = timestamp.replace(':', '')
                    timestamp = timestamp.replace('T', '')
                    timestamp = timestamp.replace('Z', '')
- 
+
                    output(u'Deleted image: %s' % logevent['title'])
                    self.CheckUsages.append((checkusage.strip_ns(logevent['title']),
                        timestamp, logevent['user'],
                        logevent.get('comment', ''), None))
                else:
                    output(u'Skipping deleted image: %s' % logevent['title'])
- 
+
    def read_replacement_log(self):
        # TODO: Make sqlite3 ready
        # TODO: Single process replacer
@@ -845,22 +845,22 @@
                self.CheckUsages.append((old_image, timestamp, user, comment, new_image))
                output(u'Replacing %s by %s' % (old_image, new_image))
                self.cursor.execute(update, ('ok', id))
- 
+
        self.database.commit()
- 
+
    def start(self):
        # Gracefully exit all threads on SIG_INT or SIG_TERM
        threadpool.catch_signals()
- 
+
        # Start threads
        self.Loggers.start()
        self.Delinkers.start()
        self.CheckUsages.start()
- 
+
        # Give threads some time to initialize
        time.sleep(self.config['timeout'])
        output(u'All workers started')
- 
+
        # Main loop
        while True:
            if self.config.get('enable_delinker', True):
@@ -871,17 +871,17 @@
                self.read_deletion_log()
            if self.config.get('enable_replacer', False):
                self.read_replacement_log()
- 
+
            time.sleep(self.config['timeout'])
- 
+
    def thread_died(self):
        # Obsolete
        return
- 
+
    @staticmethod
    def output(*args):
        return output(*args)
- 
+
 def output(message, toStdout = True):
    message = time.strftime('[%Y-%m-%d %H:%M:%S] ') + message
    wikipedia.output(message, toStdout = toStdout)
@@ -895,16 +895,16 @@
    output(u'Running ' + __version__)
    CD = CommonsDelinker()
    output(u'This bot runs from: ' + str(CD.site))
- 
+
    re._MAXCACHE = 4
- 
+
    args = wikipedia.handleArgs()
    if '-since' in args:
        # NOTE: Untested
        ts_format = '%Y-%m-%d %H:%M:%S'
        try:
            since = time.strptime(
-                args[args.index('-since') + 1], 
+                args[args.index('-since') + 1],
                ts_format)
        except ValueError:
            if args[args.index('-since') + 1][0] == '[' and \
@@ -917,7 +917,7 @@
        output(u'Reading deletion log since [%s]' %\
            time.strftime(ts_format, since))
        CD.last_check = time.mktime(since)
- 
+
    try:
        try:
            CD.start()
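The heart of the delinker above is the regex machinery in replace_image(): the first letter of the image name matches case-insensitively, spaces and underscores are treated as interchangeable, and separate passes handle inline links, galleries and templates. A simplified standalone sketch of the inline-link case follows; the image title and sample text are made up, and the real code additionally resolves nested [[...]] links before substituting:

    # -*- coding: utf-8 -*-
    import re

    def create_regex(s):
        # First character matches either case, the rest is literal.
        first, other = re.escape(s[0]), re.escape(s[1:])
        return ur'(?:[%s%s]%s)' % (first.upper(), first.lower(), other)

    image = u'Example_image.jpg'
    # Spaces and underscores in titles are equivalent on MediaWiki.
    r_image = create_regex(image).replace('_', '[ _]')
    r_link = ur'\[\[\s*(?:[Ii]mage|[Ff]ile)\s*:\s*%s.*?\]\]' % r_image

    text = u'Caption: [[Image:example image.jpg|thumb|demo]] remains.'
    print re.sub(r_link, u'', text)  # prints: Caption:  remains.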
Modified: trunk/pywikipedia/commonsdelinker/image_replacer.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/image_replacer.py	2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/image_replacer.py	2011-03-13 12:24:49 UTC (rev 9053)
@@ -4,7 +4,7 @@
 Please refer to delinker.txt for full documentation.
 """
 #
-# 
+#
 # (C) Bryan Tong Minh, 2007
 #
 # Distributed under the terms of the MIT license.
@@ -43,44 +43,44 @@
        self.config.update(getattr(config, 'Replacer', ()))
        self.template = re.compile(r'{{%s|([^|]*?)|([^|]*?)(?:(?:|reason=(.*?))?)}}' % \
            self.config['replace_template'])
-        self.disallowed_replacements = [(re.compile(i[0], re.I), re.compile(i[1], re.I)) 
+        self.disallowed_replacements = [(re.compile(i[0], re.I), re.compile(i[1], re.I))
            for i in self.config.get('disallowed_replacements', ())]
- 
+
        self.site = wikipedia.getSite(persistent_http = True)
        self.site.forceLogin()
- 
+
        self.database = connect_database()
        self.cursor = self.database.cursor()
- 
+
        self.first_revision = 0
        if self.config.get('replacer_report_replacements', False):
            self.reporters = threadpool.ThreadPool(Reporter, 1, self.site, self.config)
            self.reporters.start()
- 
- 
+
+
    def read_replace_log(self):
        """ The actual worker method """
- 
+
        # FIXME: Make sqlite3 compatible
-        insert = """INSERT INTO %s (timestamp, old_image, new_image, 
+        insert = """INSERT INTO %s (timestamp, old_image, new_image,
            status, user, comment) VALUES
            (%%s, %%s, %%s, 'pending', %%s, %%s)""" % self.config['replacer_table']
- 
+
        page = wikipedia.Page(self.site, self.config['command_page'])
- 
+
        # Get last revision date
-        if self.cursor.execute("""SELECT timestamp FROM %s 
+        if self.cursor.execute("""SELECT timestamp FROM %s
                ORDER BY timestamp DESC LIMIT 1""" % \
                self.config['replacer_table']):
            since = mw_timestamp(self.cursor.fetchone()[0])
        else:
            since = None
- 
+
        if self.config.get('clean_list', False):
            username = config.sysopnames[self.site.family.name][self.site.lang]
        else:
            username = None
- 
+
        try:
            # Fetch revision history
            revisions = self.get_history(page.title(), since, username)
@@ -95,18 +95,18 @@
            #self.site.conn.close()
            #self.site.conn.connect()
            return time.sleep(self.config['timeout'])
- 
+
        # We're being killed
        if '{{stop}}' in text.lower():
            output(u'Found {{stop}} on command page. Not replacing anything.')
            return time.sleep(self.config['timeout'])
- 
+
        # Sort oldest first
        revisions.sort(key = lambda rev: rev['timestamp'])
- 
+
        # Find all commands
        replacements = self.template.finditer(text)
- 
+
        remove_from_list = []
        count = 0
        for replacement in replacements:
@@ -122,10 +122,10 @@
                remove_from_list.append(replacement.group(0))
                output('Replacing %s by %s: %s' % replacement.groups())
                count += 1
- 
+
        # Save all replaces to database
        self.database.commit()
- 
+
        if remove_from_list and self.config.get('clean_list', False):
            # Cleanup the command page
            while True:
@@ -144,10 +144,10 @@
            except wikipedia.EditConflict:
                # Try again
                text = page.get()
- 
+
    def get_history(self, title, since, username):
        """ Fetch the last 50 revisions using the API """
- 
+
        address = self.site.api_address()
        predata = [
            ('action', 'query'),
@@ -170,10 +170,10 @@
        if 'missing' in page:
            raise Exception('Missing page!')
        return page.get('revisions', [])
- 
+
    def examine_revision_history(self, revisions, replacement, username):
        """ Find out who is to blame for a replacement """
- 
+
        for revision in revisions:
            if replacement.group(0) in revision['*']:
                db_time = db_timestamp(revision['timestamp'])
@@ -182,52 +182,52 @@
                return (db_time, strip_image(replacement.group(1)),
                    strip_image(replacement.group(2)),
                    revision['user'], replacement.group(3))
- 
+
        output('Warning! Could not find out who did %s' % \
            repr(replacement.group(0)), False)
        return
- 
+
    def read_finished_replacements(self):
-        """ Find out which replacements have been completed and add them to 
+        """ Find out which replacements have been completed and add them to
        the reporters queue. """
- 
+
        self.cursor.execute('START TRANSACTION WITH CONSISTENT SNAPSHOT')
        self.cursor.execute("""SELECT old_image, new_image, user, comment
            FROM %s WHERE status = 'done' AND timestamp >= %i""" % \
            (self.config['replacer_table'], self.first_revision))
        finished_images = list(self.cursor)
-        self.cursor.execute("""UPDATE %s SET status = 'reported' 
+        self.cursor.execute("""UPDATE %s SET status = 'reported'
            WHERE status = 'done' AND timestamp >= %i""" % \
            (self.config['replacer_table'], self.first_revision))
        self.cursor.commit()
- 
+
        for old_image, new_image, user, comment in finished_images:
-            self.cursor.execute("""SELECT wiki, namespace, page_title 
-                FROM %s WHERE img = %%s AND status <> 'ok'""" % 
+            self.cursor.execute("""SELECT wiki, namespace, page_title
+                FROM %s WHERE img = %%s AND status <> 'ok'""" %
                self.config['log_table'], (old_image, ))
            not_ok = [(wiki, namespace, page_title.decode('utf-8', 'ignore'))
                for wiki, namespace, page_title in self.cursor]
- 
+
            if not comment: comment = ''
- 
+
            self.reporters.append((old_image.decode('utf-8', 'ignore'),
-                new_image.decode('utf-8', 'ignore'), 
-                user.decode('utf-8', 'ignore'), 
+                new_image.decode('utf-8', 'ignore'),
+                user.decode('utf-8', 'ignore'),
                comment.decode('utf-8', 'ignore'), not_ok))
- 
- 
+
+
    def start(self):
        while True:
            self.read_replace_log()
            if self.config.get('replacer_report_replacements', False):
                self.read_finished_replacements()
- 
+
            # Replacer should not loop as often as delinker
            time.sleep(self.config['timeout'] * 2)
- 
+
    def allowed_replacement(self, replacement):
        """ Method to prevent World War III """
- 
+
        for source, target in self.disallowed_replacements:
            if source.search(replacement.group(1)) and \
                    target.search(replacement.group(2)):
@@ -236,14 +236,14 @@
 class Reporter(threadpool.Thread):
    """ Asynchronous worker to report finished replacements to
    file pages. """
- 
+
    def __init__(self, pool, site, config):
        self.site = wikipedia.getSite(site.lang, site.family, site.user, True)
        self.config = config
- 
+
        threadpool.Thread.__init__(self, pool)
- 
+
    def do(self, args):
        try:
            self.report(args)
@@ -254,7 +254,7 @@
            sys.stderr.flush()
            self.exit()
            os.kill(0, signal.SIGTERM)
- 
+
    def report(self, (old_image, new_image, user, comment, not_ok)):
        not_ok_items = []
        for wiki, namespace, page_title in not_ok:
@@ -265,7 +265,7 @@
                namespace_name = namespace_name + u':'
            else:
                namespace_name = u''
- 
+
            if unicode(site) == unicode(self.site):
                if (namespace, page_title) != (6, old_image):
                    not_ok_items.append(u'[[:%s%s]]' % \
@@ -273,13 +273,13 @@
            else:
                not_ok_items.append(u'[[:%s:%s%s]]' %
                    (site_prefix(site), namespace_name, page_title))
- 
+
        template = u'{{%s|new_image=%s|user=%s|comment=%s|not_ok=%s}}' % \
            (self.config['replacer_report_template'],
-            new_image, user, comment, 
+            new_image, user, comment,
            self.config.get('replacer_report_seperator', u', ').join(not_ok_items))
        page = wikipedia.Page(self.site, u'Image:' + old_image)
- 
+
        try:
            text = page.get()
        except wikipedia.NoPage:
@@ -289,7 +289,7 @@
            output(u'Warning! %s is a redirect; not reporting replacement!' % old_image)
            return
        try:
-            page.put(u'%s\n%s' % (template, text), 
+            page.put(u'%s\n%s' % (template, text),
                comment = u'This image has been replaced by ' + new_image)
        except wikipedia.PageNotSaved, e:
            output(u'Warning! Unable to report replacement to %s.' % old_image, False)
@@ -301,11 +301,11 @@
        else:
            output(u'Reporting replacement of %s by %s.' % \
                (old_image, new_image))
- 
+
 def main():
    global R
- 
+
    import sys, traceback
    wikipedia.handleArgs()
    output(u'Running ' + __version__)
@@ -327,5 +327,5 @@
        except: pass
        wikipedia.stopme()
- 
+
 if __name__ == '__main__':
    main()
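The replacer above watches a command page for template invocations of the form {{<replace_template>|old image|new image|reason=...}} (see self.template in __init__). A toy version of that command parsing, using a made-up template name in place of config['replace_template'] and a cleaned-up variant of the pattern:

    # -*- coding: utf-8 -*-
    import re

    # 'universal replace' is a hypothetical name; the real one is configured.
    template = re.compile(r'\{\{universal replace\|([^|]*?)\|([^|]*?)'
        r'(?:\|reason=(.*?))?\}\}')

    text = u'{{universal replace|Old.jpg|New.jpg|reason=duplicate file}}'
    for match in template.finditer(text):
        old_image, new_image, reason = match.groups()
        print '%s -> %s (%s)' % (old_image, new_image, reason)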
Modified: trunk/pywikipedia/commonsdelinker/threadpool.py
===================================================================
--- trunk/pywikipedia/commonsdelinker/threadpool.py	2011-03-13 12:19:20 UTC (rev 9052)
+++ trunk/pywikipedia/commonsdelinker/threadpool.py	2011-03-13 12:24:49 UTC (rev 9053)
@@ -1,19 +1,19 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """
-This module implements a threadpool which allows scripts that require 
+This module implements a threadpool which allows scripts that require
 performing concurrent jobs, an efficient and thread safe way to do this.
- 
-The two classes available are ThreadPool and Thread. ThreadPool is the 
+
+The two classes available are ThreadPool and Thread. ThreadPool is the
 controller class and contains a collection of Thread objects, which must be
 subclassed. Any thread can add a job to the ThreadPool by calling its append()
 method. The pool will add this task to the jobqueue and activate a sleeping
 thread, if available. In case no thread is directly available, the job will
 be handled by the first free thread.
- 
+
 The Thread class must be subclassed and passed to the ThreadPool's constructor.
-The subclass should implement a do(args) method, which will receive as its 
+The subclass should implement a do(args) method, which will receive as its
 argument the job. Please note that providing mutable variables to the jobqueue
 may cause thread unsafety!
 """
@@ -24,9 +24,9 @@
 #
 __version__ = '$Id$'
 #
- 
+
 import sys, threading, os
- 
+
 class ThreadPool(dict):
    pools = []
    def __init__(self, worker, max_threads, *args, **kwargs):
@@ -36,7 +36,7 @@
        self.jobQueue = []
        self.worker = worker
        self.threads = []
- 
+
        self.max_threads = max_threads
        self.args = args
        self.kwargs = kwargs
@@ -50,7 +50,7 @@
            self.jobQueue.append(job)
            # The amount of workers needed to be unlocked
            unlock_workers = len(self.jobQueue)
- 
+
            for event in self.itervalues():
                if not event.isSet():
                    event.set()
@@ -62,7 +62,7 @@
            if counter == 0 and len(self.threads) < self.max_threads:
                self.add_thread()
                self.start()
- 
+
    def add_thread(self):
        self.jobLock.acquire()
        try:
@@ -71,7 +71,7 @@
                self[id(thread)] = threading.Event()
        finally:
            self.jobLock.release()
- 
+
    def start(self):
        for thread in self.threads:
            if not thread.isAlive():
@@ -92,23 +92,23 @@
        threading.Thread.__init__(self)
        self.pool = pool
        self.quit = False
- 
+
    def run(self):
        while True:
            # No try..finally: lock.release() here:
            # The lock might be released twice, in case
-            # the thread waits for an event, a race 
+            # the thread waits for an event, a race
            # condition might occur where a lock is released
-            # that is acquired by another thread. 
+            # that is acquired by another thread.
            self.pool.jobLock.acquire()
- 
+
            if self.quit and not self.pool.jobQueue:
                # Only return once the jobQueue is empty.
                self.pool.jobLock.release()
                return
- 
+
            if not self.pool.jobQueue:
-                # In case no job is available, wait for the pool 
+                # In case no job is available, wait for the pool
                # to call and do not start a busy while loop.
                event = self.pool[id(self)]
                self.pool.jobLock.release()
@@ -119,9 +119,9 @@
                continue
            job = self.pool.jobQueue.pop(0)
            self.pool.jobLock.release()
- 
+
            self.do(job)
- 
+
    def exit(self):
        self.pool.jobLock.acquire()
        try:
@@ -133,7 +133,7 @@
                self.pool.threads.remove(self)
        finally:
            self.pool.jobLock.release()
- 
+
    def starve(self):
        pass
@@ -146,12 +146,12 @@
    import signal
    for pool in ThreadPool.pools:
        pool.exit()
- 
+
    if signalnum == signal.SIGINT:
        raise KeyboardInterrupt
    if signalnum == signal.SIGTERM:
        raise SystemExit
- 
+
 def terminate():
    # Maybe not a good idea, will also kill child processes
    import signal
@@ -160,24 +160,24 @@
 if __name__ == '__main__':
    import time
    # Test cases
- 
+
    class Worker(Thread):
        def do(self, args):
            print 'Working', self
            time.sleep(10)
            print 'Done', self
- 
+
    pool = ThreadPool(Worker)
    print 'Spawning 5 threads'
    [pool.add_thread() for i in xrange(5)]
    pool.start()
- 
+
    print 'Doing 25 jobs'
    for i in xrange(25):
        print 'Job', i
        pool.append(i)
        time.sleep(i % 6)
- 
+
    for thread in pool.threads:
        thread.exit()
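In use, the contract described in the threadpool docstring (subclass Thread, implement do(job), hand the class to ThreadPool, feed jobs with append()) looks roughly like the sketch below. Note, as an aside, that the in-file test case visible above still calls ThreadPool(Worker) without the max_threads argument the constructor now takes, so it would need something like ThreadPool(Worker, 5) to run:

    import time
    from threadpool import ThreadPool, Thread

    class Printer(Thread):
        def do(self, job):
            # do() receives one job object previously passed to pool.append()
            print 'processing', job
            time.sleep(1)

    pool = ThreadPool(Printer, 4)  # spawn at most 4 worker threads
    pool.start()
    for i in xrange(10):
        pool.append(i)  # wakes a sleeping worker or spawns a new one

    # Ask the workers to drain the queue and exit; copy the list because
    # exit() may remove a thread from pool.threads while we iterate.
    for thread in list(pool.threads):
        thread.exit()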