Revision: 5838 Author: russblau Date: 2008-08-23 16:23:23 +0000 (Sat, 23 Aug 2008)
Log Message: ----------- New category redirect bot: converts hard redirects to soft, fixes double-redirects, reports redirect loops, and moves contents of redirected category into target category. Localizable for all wikis.
Added Paths: ----------- trunk/pywikipedia/category_redirect.py
Added: trunk/pywikipedia/category_redirect.py =================================================================== --- trunk/pywikipedia/category_redirect.py (rev 0) +++ trunk/pywikipedia/category_redirect.py 2008-08-23 16:23:23 UTC (rev 5838) @@ -0,0 +1,644 @@ +# -*- coding: utf-8 -*- +"""This bot will move pages out of redirected categories + +Usage: category-redirect.py [options] + +The following command-line options can be used with this bot: + +-often Only scan those redirected categories that have been + identified as often-populated (only useful if the site + has such a category defined) + +""" + +import wikipedia, catlib +import pagegenerators +import simplejson +import cPickle +import math +import re +import sys, traceback +import threading, Queue +import time +from datetime import datetime, timedelta + +class APIError(Exception): + """The wiki API returned an error message.""" + def __init__(self, errordict): + """Save error dict returned by MW API.""" + self.errors = errordict + + def __str__(self): + return "%(code)s: %(info)s" % self.errors + + +class ThreadList(list): + """A simple threadpool class to limit the number of simultaneous threads. + + Any threading.Thread object can be added to the pool using the append() + method. If the maximum number of simultaneous threads has not been + reached, the Thread object will be started immediately; if not, the + append() call will block until the thread is able to start. + + >>> pool = ThreadList(limit=10) + >>> def work(): + ... time.sleep(1) + ... + >>> for x in xrange(20): + ... pool.append(threading.Thread(target=work)) + ... + + """ + def __init__(self, limit=sys.maxint, *args): + self.limit = limit + list.__init__(self, *args) + for item in list(self): + if not isinstance(threading.Thread, item): + raise TypeError("Cannot add '%s' to ThreadList" % type(item)) + + def active_count(self): + """Return the number of alive threads, and delete all non-alive ones.""" + count = 0 + for item in list(self): + if item.isAlive(): + count += 1 + else: + self.remove(item) + return count + + def append(self, thd): + if not isinstance(thd, threading.Thread): + raise TypeError("Cannot append '%s' to ThreadList" % type(thd)) + while self.active_count() >= self.limit: + time.sleep(2) + list.append(self, thd) + thd.start() + + +class CategoryRedirectBot(object): + def __init__(self, often): + self.cooldown = 6 # days + self.often = often + + # Localization: + + # Category that contains all redirected category pages + self.cat_redirect_cat = { + 'wikipedia': { + 'en': "Category:Wikipedia category redirects", + 'no': "Kategori:Wikipedia omdirigertekategorier", + 'simple': "Category:Category redirects", + }, + 'commons': { + 'commons': "Category:Non-empty category redirects" + } + } + + # Category that contains frequently-used redirected category pages + self.often_redirect_cat = { + 'wikipedia': { + 'en': "Category:Often-populated Wikipedia category redirects", + }, + } + + # List of all templates that are used to mark category redirects + # (put the most preferred form first) + self.redir_templates = { + 'wikipedia': { + 'en': ("Category redirect", + "Category redirect3", + "Categoryredirect", + "Empty category", + "CR", + "Catredirect", + "Emptycat", + "Emptycategory", + "Empty cat", + "Seecat",), + 'no': ("Kategoriomdirigering",), + 'simple': ("Category redirect", + "Catredirect"), + }, + 'commons': { + 'commons': (u'Category redirect', + u'Categoryredirect', + u'See cat', + u'Seecat', + u'Catredirect', + u'Cat redirect', + u'CatRed', + u'Catredir',), + } + } + + self.move_comment = { + '_default': +u"Robot: moving pages out of redirected category", + 'no': +u"Robot: Flytter sider ut av omdirigeringskategori", + 'commons': +u'Robot: Changing category link (following [[Template:Category redirect|category redirect]])' + } + + self.redir_comment = { + '_default': +u"Robot: adding category redirect template for maintenance", + 'no': +u"Robot: Legger til vedlikeholdsmal for kategoriomdirigering", + } + + self.dbl_redir_comment = { + '_default': u"Robot: fixing double-redirect", + 'no': u"Robot: Ordner doble omdirigeringer", + } + + self.maint_comment = { + '_default': u"Category redirect maintenance bot", + 'no': u"Bot for vedlikehold av kategoriomdirigeringer", + } + + def change_category(self, article, oldCat, newCat, comment=None, + sortKey=None): + """Given an article in category oldCat, moves it to category newCat. + Moves subcategories of oldCat as well. oldCat and newCat should be + Category objects. If newCat is None, the category will be removed. + + This is a copy of portions of catlib.change_category() with the + added capability to post a talk page message on pages that cannot be + fixed due to protection. + + """ + cats = article.categories(get_redirect=True) + + oldtext = article.get(get_redirect=True) + newtext = wikipedia.replaceCategoryInPlace(oldtext, oldCat, newCat) + if newtext == oldtext: + wikipedia.output( + u'No changes in made in page %s.' % article.aslink()) + return False + try: + article.put(newtext, comment) + return True + except wikipedia.EditConflict: + wikipedia.output( + u'Skipping %s because of edit conflict' % article.aslink()) + except wikipedia.LockedPage: + wikipedia.output(u'Skipping locked page %s' % article.aslink()) + if not article.isTalkPage and article.namespace != 2: + # no messages on user pages or non-talk pages + talkpage = article.toggleTalkPage() + try: + talktext = talk.get() + except wikipedia.IsRedirectPage: + return False + except wikipedia.NoPage: + talktext = u"" + if not talk.isTalkPage(): + return False + talktext = talktext + u""" +== Category link == +{{editprotected}} +* This protected page has been detected in [[%s]], but that category has \ +been redirected to [[%s]]. Please update the category link. --~~~~ +""" % (oldCat.aslink(textlink=True), newCat.aslink(textlink=True)) + # NEEDS LOCALIZATION + try: + talkpage.put(talktext, + u"Robot: Category-redirect notification on protected page", + minorEdit=False) # NEEDS LOCALIZATION + wikipedia.output( + u"Left protected page notification on %s" + % talkpage.aslink()) + except wikipedia.PageNotSaved: + wikipedia.output( + u"Protected page notification on %s failed" + % talkpage.aslink()) + + except wikipedia.SpamfilterError, error: + wikipedia.output( + u'Changing page %s blocked by spam filter (URL=%s)' + % (article.aslink(), error.url)) + except wikipedia.NoUsername: + wikipedia.output( + u"Page %s not saved; sysop privileges required." + % article.aslink()) + except wikipedia.PageNotSaved, error: + wikipedia.output(u"Saving page %s failed: %s" + % (article.aslink(), error.message)) + return False + + def move_contents(self, oldCatTitle, newCatTitle, editSummary): + """The worker function that moves pages out of oldCat into newCat""" + while True: + try: + oldCat = catlib.Category(self.site, + self.catprefix + oldCatTitle) + newCat = catlib.Category(self.site, + self.catprefix + newCatTitle) + + # Move articles + found, moved = 0, 0 + for result in self.query_results(list="categorymembers", + cmtitle=oldCat.title(), + cmprop="title|sortkey", + cmlimit="max"): + found += len(result['categorymembers']) + for item in result['categorymembers']: + article = wikipedia.Page(self.site, item['title']) + changed = self.change_category(article, oldCat, newCat, + comment=editSummary) + if changed: moved += 1 + + # pass 2: look for template doc pages + for result in self.query_results(list="categorymembers", + cmtitle=oldCat.title(), + cmprop="title|sortkey", + cmnamespace="10", + cmlimit="max"): + for item in result['categorymembers']: + doc = wikipedia.Page(self.site, item['title']+"/doc") + try: + old_text = doc.get() + except wikipedia.Error: + continue + changed = self.change_category(article, oldCat, newCat, + comment=editSummary) + if changed: moved += 1 + + if found: + wikipedia.output(u"%s: %s found, %s moved" + % (oldCat.title(), found, moved)) + #Dummy edit to refresh the page, shouldn't show up in any logs. + try: + oldCat.put(oldCat.get()) + except: + self.log_text.append(u'* Dummy edit at %s failed' + % oldCat.aslink(textlink=True)) + self.result_queue.put((oldCatTitle, found, moved)) + return + except wikipedia.ServerError: + wikipedia.output(u"Server error: retrying in 5 seconds...") + time.sleep(5) + continue + except: + self.result_queue.put((oldCatTitle, None, None)) + raise + + def readyToEdit(self, cat): + """Return True if cat not edited during cooldown period, else False.""" + dateformat ="%Y%m%d%H%M%S" + today = datetime.now() + deadline = today + timedelta(days=-self.cooldown) + if cat.editTime() is None: + raise RuntimeError + return (deadline.strftime(dateformat) > cat.editTime()) + + def query_results(self, **data): + """Iterate results from API action=query, using data as parameters.""" + addr = self.site.apipath() + querydata = {'action': 'query', + 'format': 'json', + 'maxlag': str(wikipedia.config.maxlag)} + querydata.update(data) + if not querydata.has_key("action")\ + or not querydata['action'] == 'query': + raise ValueError( + "query_results: 'action' set to value other than 'query'" + ) + waited = 0 + while True: + response, data = self.site.postForm(addr, querydata) + if data.startswith(u"unknown_action"): + e = {'code': data[:14], 'info': data[16:]} + raise APIError(e) + try: + result = simplejson.loads(data) + except ValueError: + # if the result isn't valid JSON, there must be a server + # problem. Wait a few seconds and try again + # TODO: warn user; if the server is down, this could + # cause an infinite loop + wikipedia.output("Invalid API response received; retrying...") + time.sleep(5) + continue + if type(result) is dict and result.has_key("error"): + if result['error']['code'] == "maxlag": + print "Pausing due to server lag.\r", + time.sleep(5) + waited += 5 + if waited % 30 == 0: + wikipedia.output( + u"(Waited %i seconds due to server lag.)" + % waited) + continue + else: + # raise error + raise APIError(result['error']) + waited = 0 + if type(result) is list: + # query returned no results + return + assert type(result) is dict, \ + "Unexpected result of type '%s' received." % type(result) + assert result.has_key("query"), \ + "No 'query' response found, result keys = %s" % result.keys() + yield result['query'] + if result.has_key("query-continue"): + assert len(result['query-continue'].keys()) == 1, \ + "More than one query-continue key returned: %s" \ + % result['query-continue'].keys() + query_type = result['query-continue'].keys()[0] + assert (query_type in querydata.keys() + or query_type in querydata.values()), \ + "Site returned unknown query-continue type '%s'"\ + % query_type + querydata.update(result['query-continue'][query_type]) + else: + return + + def run(self): + """Run the bot""" + self.site = wikipedia.getSite() + self.catprefix = self.site.namespace(14)+":" + self.result_queue = Queue.Queue() + self.log_text = [] + + user = self.site.loggedInAs() + redirect_magicwords = ["redirect"] + other_words = self.site.redirect() + if other_words: + redirect_magicwords.extend(other_words) + + problems = [] + + problem_page = wikipedia.Page(self.site, + u"User:%(user)s/category redirect problems" % locals()) + l = time.localtime() + today = "%04d-%02d-%02d" % l[:3] + log_page = wikipedia.Page(self.site, + u"User:%(user)s/category redirect logs/%(today)s" + % locals()) + + datafile = wikipedia.config.datafilepath( + "%s-catmovebot-data" % self.site.dbName()) + try: + inp = open(datafile, "rb") + record = cPickle.load(inp) + inp.close() + except IOError: + record = {} + if record: + cPickle.dump(record, open(datafile + ".bak", "wb")) + + # Set up regexes for later scanning + template_list = self.redir_templates[self.site.family.name + ][self.site.lang] + # regex to match soft category redirects + template_regexes = [ + re.compile( +ur"{{\s*(?:%(prefix)s\s*:\s*)?%(template)s\s*|(\s*%(catns)s\s*:\s*)?([^}]+)}}" + % {'prefix': self.site.namespace(10).lower(), + 'template': item.replace(" ", "[ _]+"), + 'catns': self.site.namespace(14)}, + re.I) + for item in template_list + ] + template_regex = re.compile( +ur"{{\s*(?:%(prefix)s\s*:\s*)?(?:%(template)s)\s*|(\s*%(catns)s\s*:\s*)?([^}]+)}}" + % {'prefix': self.site.namespace(10).lower(), + 'template': "|".join(item.replace(" ", "[ _]+") + for item in template_list), + 'catns': self.site.namespace(14)}, + re.I) + # regex to match hard redirects to category pages + catredir_regex = re.compile( +ur'\s*#(?:%(redir)s)\s*:?\s*[[\s*:?%(catns)s\s*:(.*?)]]\s*' + % {'redir': "|".join(redirect_magicwords), + 'catns': self.site.namespace(14)}, + re.I) + # regex to match all other hard redirects + redir_regex = re.compile(ur"(?i)\s*#(?:%s)\s*:?\s*[[(.*?)]]" + % "|".join(redirect_magicwords), + re.I) + + # check for hard-redirected categories that are not already marked + # with an appropriate template + comment = wikipedia.translate(self.site.lang, self.redir_comment) + for result in self.query_results(list='allpages', + apnamespace='14', # Category: + apfrom='!', + apfilterredir='redirects', + aplimit='max'): + gen = (wikipedia.Page(self.site, page_item['title']) + for page_item in result['allpages']) + for page in pagegenerators.PreloadingGenerator(gen, 120): + text = page.get(get_redirect=True) + if re.search(template_regex, text): + # this is already a soft-redirect, so skip it (for now) + continue + m = catredir_regex.match(text) + if m: + # this is a hard-redirect to a category page + newtext = (u"{{%(template)s|%(cat)s}}" + % {'cat': m.group(1), + 'template': template_list[0]}) + try: + page.put(newtext, comment, minorEdit=True) + self.log_text.append(u"* Added {{tl|%s}} to %s" + % (template_list[0], + page.aslink(textlink=True))) + except wikipedia.Error, e: + self.log_text.append( + u"* Failed to add {{tl|%s}} to %s (%s)" + % (template_list[0], + page.aslink(textlink=True), + e)) + else: + r = redir_regex.match(text) + if r: + problems.append( + u"# %s is a hard redirect to [[:%s]]" + % (page.aslink(textlink=True), + r.group(1))) + else: + problems.append( + u"# %s is a hard redirect; unable to extract target." + % page.aslink(textlink=True)) + + wikipedia.output("Done checking hard-redirect category pages.") + + comment = wikipedia.translate(self.site.lang, self.move_comment) + scan_data = { + u'action': 'query', + u'list': 'embeddedin', + u'einamespace': '14', # Category: + u'eilimit': 'max', + u'format': 'json' + } + counts = {} + destmap = {} + catmap = {} + catlist = [] + catpages = [] + if self.often: + target = self.often_redirect_cat[self.site.family.name + ][self.site.lang] + else: + target = self.cat_redirect_cat[self.site.family.name + ][self.site.lang] + + # get and preload all members of the category-redirect category + for result in self.query_results(generator=u'categorymembers', + gcmtitle=target, + gcmnamespace=u'14', # CATEGORY + gcmlimit=u'max', + prop='info'): + for catdata in result['pages'].values(): + catpages.append(wikipedia.Page(self.site, catdata['title'])) + + wikipedia.output(u"") + wikipedia.output(u"Preloading %s category redirect pages" + % len(catpages)) + thread_limit = int(math.log(len(catpages), 2)) + for cat in pagegenerators.PreloadingGenerator(catpages, 120): + cat_title = cat.titleWithoutNamespace() + if "Wikipedia category redirect" in cat_title: +# self.log_text.append("* Ignoring %s%s" % (self.catprefix, cat_title)) + continue + try: + text = cat.get(get_redirect=True) + except wikipedia.Error: + self.log_text.append(u"* Could not load %s%s; ignoring" + % (self.catprefix, cat_title)) + continue + match = template_regex.search(text) + if match is None: + self.log_text.append(u"* False positive: %s" % cat_title) + continue + catlist.append(cat) + destination = match.group(2) + target = catlib.Category(self.site, self.catprefix+destination) + destmap.setdefault(target, []).append(cat) + catmap[cat] = destination + if match.group(1): + # category redirect target starts with "Category:" - fix it + text = text[ :match.start(1)] + text[match.end(1): ] + cat.put(text, + u"Robot: fixing category redirect parameter format") + self.log_text.append( + u"* Removed category prefix from parameter in %s" + % cat.aslink(textlink=True)) + + wikipedia.output(u"") + wikipedia.output(u"Checking %s destination categories" % len(destmap)) + for dest in pagegenerators.PreloadingGenerator(destmap.keys(), 120): + if not dest.exists(): + for d in destmap[dest]: + problems.append("# %s redirects to %s" + % (d.aslink(textlink=True), + dest.aslink(textlink=True))) + catlist.remove(d) + # do a null edit on d to make it appear in the + # "needs repair" category (if this wiki has one) + try: + d.put(d.get(get_redirect=True)) + except: + pass + if dest in catlist: + for d in destmap[dest]: + # is catmap[dest] also a redirect? + newcat = catlib.Category(self.site, + self.catprefix+catmap[dest]) + while newcat in catlist: + if newcat == d or newcat == dest: + self.log_text.append(u"* Redirect loop from %s" + % newcat.aslink(textlink=True)) + break + newcat = catlib.Category(self.site, + self.catprefix+catmap[newcat]) + else: + self.log_text.append( + u"* Fixed double-redirect: %s -> %s -> %s" + % (d.aslink(textlink=True), + dest.aslink(textlink=True), + newcat.aslink(textlink=True))) + oldtext = d.get(get_redirect=True) + # remove the old redirect from the old text, + # leaving behind any non-redirect text + oldtext = template_regex.sub("", oldtext) + newtext = (u"{{category redirect|%(ncat)s}}" + % {'ncat': newcat.titleWithoutNamespace()}) + newtext = newtext + oldtext.strip() + try: + d.put(newtext, + wikipedia.translate(self.site.lang, + self.dbl_redir_comment), + minorEdit=True) + except wikipedia.Error, e: + self.log_text.append("** Failed: %s" % str(e)) + + wikipedia.output(u"") + wikipedia.output(u"Moving pages out of %s redirected categories." + % len(catlist)) + threadpool = ThreadList(limit=thread_limit) + + for cat in catlist: + cat_title = cat.titleWithoutNamespace() + if not self.readyToEdit(cat): + counts[cat_title] = None + continue + threadpool.append(threading.Thread(target=self.move_contents, + args=(cat_title, catmap[cat]), + kwargs=dict(editSummary=comment))) + while len(counts) < len(catlist): + title, found, moved = self.result_queue.get() + if found is None: + self.log_text.append( + u"* [[:%s%s]]: error in move_contents thread" + % (self.catprefix, title)) + else: + if found: + self.log_text.append( + u"* [[:%s%s]]: %d found, %d moved" + % (self.catprefix, title, found, moved)) + counts[title] = found + + if not self.often: + for cat in record.keys(): + if cat not in counts.keys(): + del record[cat] + for cat in counts.keys(): + if counts[cat] is not None: + if counts[cat]: + record.setdefault(cat, {})[today] = counts[cat] + else: + record.setdefault(cat, {}) + cPickle.dump(record, open(datafile, "wb")) + + wikipedia.setAction(wikipedia.translate(self.site.lang, + self.maint_comment)) + log_page.put("\n".join(self.log_text)) + problem_page.put("\n".join(problems)) + + +def main(*args): + try: + a = wikipedia.handleArgs(*args) + if "-often" in a: + a.remove("-often") + often = True + else: + often = False + if len(a) == 1: + raise RuntimeError('Unrecognized argument "%s"' % a[0]) + elif a: + raise RuntimeError('Unrecognized arguments: ' + + " ".join(('"%s"' % arg) for arg in a)) + bot = CategoryRedirectBot(often) + bot.run() + finally: + wikipedia.stopme() + + +if __name__ == "__main__": + main()
pywikipedia-l@lists.wikimedia.org