Revision: 6229 Author: russblau Date: 2009-01-08 16:13:30 +0000 (Thu, 08 Jan 2009)
Log Message: ----------- Branch for conversion to new framework.
Added Paths: ----------- branches/rewrite/pywikibot/scripts/category.py
Copied: branches/rewrite/pywikibot/scripts/category.py (from rev 6214, trunk/pywikipedia/category.py) =================================================================== --- branches/rewrite/pywikibot/scripts/category.py (rev 0) +++ branches/rewrite/pywikibot/scripts/category.py 2009-01-08 16:13:30 UTC (rev 6229) @@ -0,0 +1,999 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Scripts to manage categories. + +Syntax: python category.py action [-option] + +where action can be one of these: + * add - mass-add a category to a list of pages + * remove - remove category tag from all pages in a category + * move - move all pages in a category to another category + * tidy - tidy up a category by moving its articles into subcategories + * tree - show a tree of subcategories of a given category + * listify - make a list of all of the articles that are in a category + +and option can be one of these: + * -person - sort persons by their last name (for action 'add') + * -rebuild - reset the database + * -from: - The category to move from (for the move option) + Also, the category to remove from in the remove option + Also, the category to make a list of in the listify option + * -to: - The category to move to (for the move option) + - Also, the name of the list to make in the listify option + NOTE: If the category names have spaces in them you may need to use + a special syntax in your shell so that the names aren't treated as + separate parameters. For instance, in BASH, use single quotes, + e.g. -from:'Polar bears' + * -batch - Don't prompt to delete emptied categories (do it + automatically). + * -summary: - Pick a custom edit summary for the bot. + * -inplace - Use this flag to change categories in place rather than + rearranging them. + * -nodelsum - An option for remove, this specifies not to use the custom + edit summary as the deletion reason. Instead, it uses the + default deletion reason for the language, which is "Category + was disbanded" in English. 
+ * -overwrite - An option for listify, this overwrites the current page with + the list even if something is already there. + * -showimages - An option for listify, this displays images rather than + linking them in the list. + * -talkpages - An option for listify, this outputs the links to talk pages + of the pages to be listified in addition to the pages + themselves. + * -recurse - Recurse through all subcategories of categories. + * -match - Only work on pages whose titles match the given regex (for + move and remove actions). + +If action is "add", the following options are supported: + +¶ms; + +For the actions tidy and tree, the bot will store the category structure +locally in category.dump. This saves time and server load, but if it uses +these data later, they may be outdated; use the -rebuild parameter in this +case. + +For example, to create a new category from a list of persons, type: + + python category.py add -person + +and follow the on-screen instructions. + +Or to do it all from the command-line, use the following syntax: + + python category.py move -from:US -to:'United States' + +This will move all pages in the category US to the category United States. + +""" + +# +# (C) Rob W.W. Hooft, 2004 +# (C) Daniel Herding, 2004 +# (C) Anreas J Schwab, 2007 +# +__version__ = '$Id$' +# +# Distributed under the terms of the MIT license. +# +import os, re, sys, pickle, bz2 +import wikipedia, catlib, config, pagegenerators + +# This is required for the text that is shown when you run this script +# with the parameter -help. 
+docuReplacements = { + '¶ms;': pagegenerators.parameterHelp +} +# Summary messages +msg_add={ + 'ar':u'روبوت: إضافة [[تصنيف:%s]]', + 'bat-smg':u'Robots: Pridedama [[Kateguorėjė:%s]]', + 'ca':u'Robot: Afegint [[Categoria:%s]]', + 'da':u'Robot: Tilføjer [[Kategori:%s]]', + 'de':u'Bot: Ergänze [[Kategorie:%s]]', + 'en':u'Robot: Adding [[Category:%s]]', + 'es':u'Bot: Añadida [[Categoría:%s]]', + 'id':u'Bot: Menambahkan [[Kategori:%s]]', + 'fi':u'Botti lisäsi luokkaan [[Luokka:%s]]', + 'fr':u'Robot : ajoute [[Catégorie:%s]]', + 'he':u'בוט: מוסיף [[קטגוריה:%s]]', + 'ia':u'Robot: Addition de [[Categoria:%s]]', + 'is':u'Vélmenni: Bæti við [[Flokkur:%s]]', + 'it':u'Bot: Aggiungo [[Categoria:%s]]', + 'ja':u'ロボットによる: カテゴリ追加 [[Category:%s]]', + 'kk':u'Бот: [[Санат:%s]] үстеді', + 'ko': u'로봇: [[분류:%s]] 추가', + 'ksh':u'Bot: [[Saachjropp:%s]] erinjedonn', + 'lb': u'Bot: Derbäi setzen [[Kategorie:%s]]', + 'lt':u'robotas: Pridedama [[Kategorija:%s]]', + 'nds':u'Kat-Bot: [[Kategorie:%s]] rin', + 'nds-nl':u'bot: [[kattegerie:%s]] derbie edaon', + 'nl':u'Bot: [[Categorie:%s]] toegevoegd', + 'no':u'Robot: Legger til [[Kategori:%s]]', + 'nn':u'robot: la til [[Kategori:%s]]', + 'pl':u'Robot dodaje [[Kategoria:%s]]', + 'pt':u'Bot: Adicionando [[Categoria:%s]]', + 'ru':u'Робот: добавление [[Категория:%s]]', + 'sr':u'Бот: Додаје [[Категорија:%s]]', + 'sv':u'Robot: Lägger till [[Kategori:%s]]', + 'zh':u'機器人:新增目錄 [[Category:%s]]', + } + +msg_change={ + 'ar':u'روبوت: تغيير %s', + 'ca':u'Robot: Canviant %s', + 'da':u'Robot: Ændrer %s', + 'de':u'Bot: Ändere %s', + 'en':u'Robot: Changing %s', + 'es':u'Bot: Cambiada %s', + 'id':u'Bot: Mengganti %s', + 'fi':u'Botti muutti luokan %s', + 'fr':u'Robot : modifie [[%s]]', + 'he':u'בוט: משנה %s', + 'ia':u'Robot: Modification de %s', + 'is':u'Vélmenni: Breyti flokknum [[%s]]', + 'it':u'Bot: Modifico %s', + 'lt':u'robotas: Keičiama %s', + 'ja':u'ロボットによる: カテゴリ変更 [[%s]]', + 'kk':u'Бот: %s дегенді түзетті', + 'ko': u'로봇: %s 수정', + 'ksh':u'Bot: %s 
ußjewääßelt', + 'nds':u'Kat-Bot: %s utwesselt', + 'nds-nl':u'bot: wieziging %s', + 'nl':u'Bot: Wijziging %s', + 'no':u'Robot: Endrer %s', + 'nn':u'robot: endra %s', + 'pt':u'Bot: Modificando [[%s]]', + 'pl':u'Robot przenosi %s', + 'ru':u'Робот: изменение %s', + 'sr':u'Бот: Измена категорије %s', + 'sv':u'Robot: Ändrar %s', + 'zh':u'機器人:變更目錄 [[%s]]', + } + +deletion_reason_move = { + 'ar':u'روبوت: التصنيف نقل إلى [[:تصنيف:%s|%s]]', + 'bat-smg':u'Robots: Kateguorėjė bova parvadėnta i [[:Kateguorėjė:%s|%s]]', + 'ca':u'Robot: La categoria s'ha mogut a [[:Categoria:%s|%s]]', + 'da':u'Robot: Kategori flyttet til [[:Category:%s|%s]]', + 'de':u'Bot: Kategorie wurde nach [[:Category:%s|%s]] verschoben', + 'en':u'Robot: Category was moved to [[:Category:%s|%s]]', + 'es':u'Robot: La categoría ha sido movida a [[:Category:%s|%s]]', + 'fi':u'Botti siirsi luokan nimelle [[:Luokka:%s|%s]]', + 'fr':u'Robot : catégorie déplacée sur [[:Category:%s|%s]]', + 'he':u'בוט: הקטגוריה הועברה לשם [[:קטגוריה:%s|%s]]', + 'ia':u'Robot: Categoria transferite a [[:Category:%s|%s]]', + 'id':u'Bot: Kategori dipindahkan ke [[:Category:%s|%s]]', + 'it':u'Bot: La categoria è stata sostituita da [[:Categoria:%s|%s]]', + 'ja':u'ロボットによる: カテゴリ [[:Category:%s]]へ移動', + 'kk':u'Бот: Санат [[:Санат:%s|%s]] дегенге жылжытылды', + 'ko': u'로봇: 분류가 [[:분류:%s|%s]]로 옮겨짐', + 'ksh':u'Bot: Saachjropp noh [[:Category:%s|%s]] jeschovve', + 'lb': u'Bot: Kategorie gouf gréckelt: Nei [[:Kategorie:%s|%s]]', + 'lt':u'robotas: Kategorija pervadinta į [[:Category:%s|%s]]', + 'nds':u'Kat-Bot: Kategorie na [[:Category:%s|%s]] schaven', + 'nds-nl':u'Bot: kattegerie is herneumd naor [[:Kattegerie:%s|%s]]', + 'nl':u'Bot: Categorie is hernoemd naar [[:Category:%s|%s]]', + 'no':u'Robot: Kategorien ble flyttet til [[:Category:%s|%s]]', + 'nn':u'robot: kategorien blei flytta til [[:Kategori:%s|%s]]', + 'pt':u'Bot: Categoria [[:Category:%s|%s]] foi movida', + 'pl':u'Robot przenosi kategorię do [[:Category:%s|%s]]', + 'ru':u'Робот: 
class CategoryDatabase:
    '''
    Temporary knowledge base that stores, for each category, the contained
    subcategories and articles as well as its supercategories, so that
    category pages do not need to be loaded over and over again.

    Attributes:
     * catContentDB - dict; keys are categories, values are 2-tuples
                      (list of subcategories, list of articles)
     * superclassDB - dict; keys are categories, values are lists of
                      supercategories
    '''
    def __init__(self, rebuild = False, filename = 'category.dump.bz2'):
        '''
        Load the database from a dump file, or start with an empty one.

        Parameters:
         * rebuild  - if True, do not read the dump and start empty
         * filename - bz2-compressed pickle file to read; a name that is not
                      absolute is passed through
                      wikipedia.config.datafilepath()

        If reading the dump fails for any ordinary reason, the database is
        simply rebuilt (i.e. starts empty).
        '''
        if rebuild:
            self.rebuild()
            return
        try:
            if not os.path.isabs(filename):
                filename = wikipedia.config.datafilepath(filename)
            f = bz2.BZ2File(filename, 'r')
            try:
                wikipedia.output(u'Reading dump from %s'
                                 % wikipedia.config.shortpath(filename))
                # NOTE: unpickling can execute arbitrary code, so the dump
                # file must only ever come from a trusted location.
                databases = pickle.load(f)
            finally:
                # close the file even when the dump is corrupt
                f.close()
            # keys are categories, values are 2-tuples with lists as entries.
            self.catContentDB = databases['catContentDB']
            # like the above, but for supercategories
            self.superclassDB = databases['superclassDB']
        except Exception:
            # If something goes wrong, just rebuild the database.
            # (Not a bare except: Ctrl-C and SystemExit must still get
            # through.)
            self.rebuild()

    def rebuild(self):
        '''Forget everything that has been cached so far.'''
        self.catContentDB = {}
        self.superclassDB = {}

    def _getContent(self, cat):
        '''
        Return the (subcategories, articles) tuple for cat, asking the
        category object itself only if the tuple is not cached yet.
        '''
        if cat not in self.catContentDB:
            self.catContentDB[cat] = (cat.subcategoriesList(),
                                      cat.articlesList())
        return self.catContentDB[cat]

    def getSubcats(self, supercat):
        '''
        For a given supercategory, return a list of Categorys for all its
        subcategories.
        Saves this list in a temporary database so that it won't be loaded
        from the server next time it's required.
        '''
        return self._getContent(supercat)[0]

    def getArticles(self, cat):
        '''
        For a given category, return a list of Pages for all its articles.
        Saves this list in a temporary database so that it won't be loaded
        from the server next time it's required.
        '''
        return self._getContent(cat)[1]

    def getSupercats(self, subcat):
        '''
        For a given category, return the list of its supercategories.
        The list is cached in the same way as for getSubcats().
        '''
        if subcat not in self.superclassDB:
            self.superclassDB[subcat] = subcat.supercategoriesList()
        return self.superclassDB[subcat]

    def dump(self, filename = 'category.dump.bz2'):
        '''
        Saves the contents of the dictionaries superclassDB and catContentDB
        to disk.

        Parameters:
         * filename - bz2-compressed pickle file to write; a name that is
                      not absolute is passed through
                      wikipedia.config.datafilepath()
        '''
        if not os.path.isabs(filename):
            filename = wikipedia.config.datafilepath(filename)
        wikipedia.output(u'Dumping to %s, please wait...'
                         % wikipedia.config.shortpath(filename))
        databases = {
            'catContentDB': self.catContentDB,
            'superclassDB': self.superclassDB
        }
        f = bz2.BZ2File(filename, 'w')
        try:
            # store dump to disk in binary format
            pickle.dump(databases, f, protocol=pickle.HIGHEST_PROTOCOL)
        except pickle.PicklingError:
            # Dumping stays best-effort, but the user should know that the
            # cache was not saved.
            wikipedia.output(u'WARNING: could not pickle the category '
                             u'database; dump is incomplete.')
        finally:
            f.close()

def sorted_by_last_name(catlink, pagelink):
    '''Return a Category with key that sorts persons by their last names.

    Parameters: catlink  - The Category to be linked
                pagelink - the Page to be placed in the category

    Trailing words in brackets will be removed. Example: If catlink is
    [[Category:Author]] and pagelink is a Page to [[Alexandre Dumas
    (senior)]], this function will return this Category:
    [[Category:Author|Dumas, Alexandre]]

    A title that consists of a single word gets no explicit sort key.
    '''
    page_name = pagelink.title()
    site = pagelink.site()
    # regular expression that matches a name followed by a space and
    # disambiguation brackets. Group 1 is the name without the rest.
    # The parentheses must be escaped: unescaped they form a plain group and
    # the pattern would cut off the last word of every multi-word title.
    bracketsR = re.compile(r'(.*) \(.+?\)')
    match_object = bracketsR.match(page_name)
    if match_object:
        page_name = match_object.group(1)
    split_string = page_name.split(' ')
    if len(split_string) > 1:
        # pull last part of the name to the beginning, and append the
        # rest after a comma; e.g., "John von Neumann" becomes
        # "Neumann, John von"
        sorted_key = split_string[-1] + ', ' + ' '.join(split_string[:-1])
        # give explicit sort key
        return wikipedia.Page(site, catlink.title() + '|' + sorted_key)
    return wikipedia.Page(site, catlink.title())
class CategoryMoveRobot:
    """Robot to move pages from one category to another.

    Parameters:
     * oldCatTitle - title of the category to move from; 'Category:' is
                     prepended by this class
     * newCatTitle - title of the category to move to; 'Category:' is
                     prepended by this class
     * batchMode   - if True, the old category (and its moved talk page) is
                     deleted without asking for confirmation
     * editSummary - custom edit summary; if empty, the translated
                     msg_change message is used
     * inPlace     - passed through to catlib.change_category()
     * moveCatPage - if True and the old category page exists, its contents
                     are copied to the new category page
     * deleteEmptySourceCat - if True, the old category page is deleted
                     after a successful copy, provided it is empty
     * titleRegex  - if given, only members whose title matches this regex
                     (re.search) are moved
    """
    def __init__(self, oldCatTitle, newCatTitle, batchMode=False,
                 editSummary='', inPlace=False, moveCatPage=True,
                 deleteEmptySourceCat=True, titleRegex=None):
        site = wikipedia.getSite()
        self.editSummary = editSummary
        self.oldCat = catlib.Category(site, 'Category:' + oldCatTitle)
        self.newCatTitle = newCatTitle
        self.inPlace = inPlace
        self.moveCatPage = moveCatPage
        self.batchMode = batchMode
        self.deleteEmptySourceCat = deleteEmptySourceCat
        self.titleRegex = titleRegex
        # set edit summary message
        if self.editSummary:
            wikipedia.setAction(self.editSummary)
        else:
            wikipedia.setAction(wikipedia.translate(site, msg_change)
                                % self.oldCat.title())

    def _deletionReason(self, site):
        '''Return the translated "category was moved" reason.

        Computed on demand only, exactly where the reason is needed.
        '''
        return wikipedia.translate(site, deletion_reason_move) \
               % (self.newCatTitle, self.newCatTitle)

    def _moveMembers(self, gen, newCat):
        '''Recategorize every page yielded by gen from oldCat to newCat.

        Pages whose title does not match titleRegex (if one is set) are
        left alone.
        '''
        for page in pagegenerators.PreloadingGenerator(gen):
            if not self.titleRegex or re.search(self.titleRegex,
                                                page.title()):
                catlib.change_category(page, self.oldCat, newCat,
                                       inPlace=self.inPlace)

    def run(self):
        '''Copy the category page, move all members, delete the old page.'''
        site = wikipedia.getSite()
        newCat = catlib.Category(site, 'Category:' + self.newCatTitle)

        # Copy the category contents to the new category page
        copied = False
        oldMovedTalk = None
        if self.oldCat.exists() and self.moveCatPage:
            copied = self.oldCat.copyAndKeep(
                            self.newCatTitle,
                            wikipedia.translate(site, cfd_templates))
            # Also move the talk page
            if copied:
                reason = self._deletionReason(site)
                oldTalk = self.oldCat.toggleTalkPage()
                if oldTalk.exists():
                    newTalkTitle = newCat.toggleTalkPage().title()
                    try:
                        talkMoved = oldTalk.move(newTalkTitle, reason)
                    except (wikipedia.NoPage, wikipedia.PageNotSaved) as e:
                        # in order:
                        # Source talk does not exist, or
                        # Target talk already exists
                        # NOTE(review): e.message is kept from the original;
                        # confirm the wikipedia exception classes provide it.
                        wikipedia.output(e.message)
                    else:
                        if talkMoved:
                            oldMovedTalk = oldTalk

        # Move articles
        self._moveMembers(
            pagegenerators.CategorizedPageGenerator(self.oldCat,
                                                    recurse=False),
            newCat)

        # Move subcategories
        self._moveMembers(
            pagegenerators.SubCategoriesPageGenerator(self.oldCat,
                                                      recurse=False),
            newCat)

        # Delete the old category and its moved talk page
        if copied and self.deleteEmptySourceCat:
            if self.oldCat.isEmpty():
                reason = self._deletionReason(site)
                confirm = not self.batchMode
                self.oldCat.delete(reason, confirm, mark = True)
                if oldMovedTalk is not None:
                    oldMovedTalk.delete(reason, confirm, mark = True)
            else:
                wikipedia.output(u"Couldn't delete %s - not empty."
                                 % self.oldCat.title())
(article.title(), article.toggleTalkPage().title()) + else: + listString = listString + "*[[:%s]]\n" % article.title() + if self.list.exists() and not self.overwrite: + wikipedia.output(u'Page %s already exists, aborting.' % self.list.title()) + else: + self.list.put(listString) + +class CategoryRemoveRobot: + ''' + Removes the category tag from all pages in a given category and from the + category pages of all subcategories, without prompting. + Does not remove category tags pointing at subcategories. + ''' + deletion_reason_remove = { + 'ar':u'روبوت: التصنيف تم الاستغناء عنه', + 'ca':u'Robot: La categoria s'ha eliminat', + 'da':u'Robot: Kategorien blev opløst', + 'de':u'Bot: Kategorie wurde aufgelöst', + 'en':u'Robot: Category was disbanded', + 'es':u'Robot: La categoría ha sido eliminada', + 'fi':u'Botti tyhjensi luokan', + 'he':u'בוט: הקטגוריה פורקה', + 'ia':u'Robot: Categoria esseva dissolvite', + 'kk':u'Бот: Санат тарқатылды', + 'ksh':u'Bot: de Saachjropp is nu opjelööß', + 'nds':u'Kat-Bot: Kategorie is nu oplööst', + 'nds-nl':u'Bot: kattegerie besteet neet meer', + 'nl':u'Bot: Categorie is opgeheven', + 'no':u'Robot: Kategorien ble oppløst', + 'nn':u'robot: kategorien blei løyst opp', + 'pt':u'Bot: Categoria foi unida', + 'ru':u'Робот: категория расформирована', + 'sv':u'Robot: Kategorin upplöstes', + 'zh':u'機器人:本目錄已解散', + } + + msg_remove={ + 'ar':u'روبوت: إزالة من %s', + 'bat-smg':u'Robots: Trėnama ėš %s', + 'ca':u'Robot: Eliminant de %s', + 'da':u'Robot: Fjerner fra %s', + 'de':u'Bot: Entferne aus %s', + 'en':u'Robot: Removing from %s', + 'es':u'Bot: Eliminada de la %s', + 'fi':u'Botti poisti luokasta %s', + 'he':u'בוט: מסיר את הדף מהקטגוריה %s', + 'ia':u'Robot: Eliminate de %s', + 'is':u'Vélmenni: Fjarlægi [[Flokkur:%s]]', + 'kk':u'Бот: %s дегеннен аластатты', + 'ksh':u'Bot: uß de %s ußjedraare', + 'lb': u'Bot: Ewech huele vun %s', + 'nds':u'Kat-Bot: rut ut %s', + 'nds-nl':u'Bot: vort-ehaold uut %s', + 'nl':u'Bot: Verwijderd uit %s', + 'no':u'Robot: 
class CategoryTidyRobot:
    """
    Script to help a human to tidy up a category by moving its articles into
    subcategories

    Specify the category name on the command line. The program will pick up the
    page, and look for all subcategories and supercategories, and show them with
    a number adjacent to them. It will then automatically loop over all pages
    in the category. It will ask you to type the number of the appropriate
    replacement, and perform the change robotically.

    If you don't want to move the article to a subcategory or supercategory, but to
    another category, you can use the 'j' (jump) command.

    Typing 's' will leave the complete page unchanged.

    Typing '?' will show you the first few bytes of the current page, helping
    you to find out what the article is about and in which other categories it
    currently is.

    Important:
     * this bot is written to work with the MonoBook skin, so make sure your bot
       account uses this skin

    Parameters:
     * catTitle - title of the category to tidy; 'Category:' is prepended
                  by this class
     * catDB    - object providing getSubcats() and getSupercats(), used to
                  look up the neighbours of a category
    """
    def __init__(self, catTitle, catDB):
        self.catTitle = catTitle
        self.catDB = catDB

    def _pickCategory(self, candidates, number):
        '''
        Return candidates[int(number)], or None if number is not the index
        of an existing entry (not a number, negative, or too large).
        '''
        try:
            index = int(number)
        except ValueError:
            return None
        if 0 <= index < len(candidates):
            return candidates[index]
        return None

    def _showContext(self, full_text, length):
        '''Print the first length characters of full_text between blank lines.'''
        wikipedia.output(u'')
        wikipedia.output(full_text[:length])
        wikipedia.output(u'')

    def move_to_category(self, article, original_cat, current_cat):
        '''
        Given an article which is in category original_cat, ask the user if
        it should be moved to one of original_cat's subcategories.
        Recursively run through subcategories' subcategories.
        NOTE: current_cat is only used for internal recursion. You should
              always use current_cat = original_cat.

        All menu text is written with wikipedia.output(), so that it goes
        through the same output routine as the page text and titles.
        A number that does not refer to a listed category is treated like
        any other unknown command: the user is prompted again.
        '''
        wikipedia.output(u'')
        # Show the title of the page where the link was found.
        # Highlight the title in purple.
        wikipedia.output(u'Treating page \03{lightpurple}%s\03{default}, currently in \03{lightpurple}%s\03{default}' % (article.title(), current_cat.title()))

        # Determine a reasonable amount of context to print
        try:
            full_text = article.get(get_redirect = True)
        except wikipedia.NoPage:
            wikipedia.output(u'Page %s not found.' % article.title())
            return
        try:
            contextLength = full_text.index('\n\n')
        except ValueError: # substring not found
            contextLength = 500
        if full_text.startswith(u'[['): # probably an image
            # Add extra paragraph.
            contextLength = full_text.find('\n\n', contextLength+2)
        if contextLength > 1000 or contextLength < 0:
            contextLength = 500
        self._showContext(full_text, contextLength)

        subcatlist = self.catDB.getSubcats(current_cat)
        supercatlist = self.catDB.getSupercats(current_cat)
        wikipedia.output(u'')
        if not subcatlist:
            wikipedia.output(u'This category has no subcategories.')
            wikipedia.output(u'')
        if not supercatlist:
            wikipedia.output(u'This category has no supercategories.')
            wikipedia.output(u'')
        # show supercategories and subcategories as possible choices
        # (with numbers)
        for i, supercat in enumerate(supercatlist):
            # layout: we don't expect a cat to have more than 10 supercats
            wikipedia.output(u'u%d - Move up to %s' % (i, supercat.title()))
        for i, subcat in enumerate(subcatlist):
            # layout: we don't expect a cat to have more than 100 subcats
            wikipedia.output(u'%2d - Move down to %s' % (i, subcat.title()))
        wikipedia.output(u' j - Jump to another category')
        wikipedia.output(u' s - Skip this article')
        wikipedia.output(u' r - Remove this category tag')
        wikipedia.output(u' ? - Print first part of the page (longer and longer)')
        wikipedia.output(u'Enter - Save category as %s' % current_cat.title())

        # Keep prompting until the user gives a command that settles what
        # happens to this article.
        while True:
            wikipedia.output(u'')
            choice = wikipedia.input(u'Choice:')
            if choice in ('s', 'S'):
                return
            if choice == '':
                wikipedia.output(u'Saving category as %s' % current_cat.title())
                if current_cat == original_cat:
                    wikipedia.output(u'No changes necessary.')
                else:
                    catlib.change_category(article, original_cat, current_cat)
                return
            if choice in ('j', 'J'):
                newCatTitle = wikipedia.input(u'Please enter the category the article should be moved to:')
                newCat = catlib.Category(wikipedia.getSite(), 'Category:' + newCatTitle)
                # recurse into chosen category
                self.move_to_category(article, original_cat, newCat)
                return
            if choice in ('r', 'R'):
                # remove the category tag
                catlib.change_category(article, original_cat, None)
                return
            if choice == '?':
                contextLength += 500
                self._showContext(full_text, contextLength)
                # if categories possibly weren't visible, show them additionally
                # (maybe this should always be shown?)
                if len(full_text) > contextLength:
                    wikipedia.output(u'')
                    wikipedia.output(u'Original categories: ')
                    for cat in article.categories():
                        wikipedia.output(u'* %s' % cat.title())
                continue
            if choice.startswith('u'):
                target = self._pickCategory(supercatlist, choice[1:])
            else:
                target = self._pickCategory(subcatlist, choice)
            if target is None:
                # user entered an unknown command or a number that is not
                # in the list. Prompt again.
                continue
            # recurse into the chosen super- or subcategory
            self.move_to_category(article, original_cat, target)
            return

    def run(self):
        '''Walk through all articles of the category and treat each one.'''
        cat = catlib.Category(wikipedia.getSite(), 'Category:' + self.catTitle)

        # get edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg_change) % cat.title())

        articles = cat.articlesList(recurse = False)
        if not articles:
            # self.catTitle, not a bare catTitle: no such local or global
            # name is defined in this method.
            wikipedia.output(u'There are no articles in category ' + self.catTitle)
            return
        preloadingGen = pagegenerators.PreloadingGenerator(iter(articles))
        for article in preloadingGen:
            wikipedia.output(u'\n===================================================================')
            self.move_to_category(article, cat, cat)
+ + Parameters: + * cat - the Category of the node we're currently opening + * currentDepth - the current level in the tree (for recursion) + * parent - the Category of the category we're coming from + ''' + + # Translations to say that the current category is in more categories than + # the one we're coming from + also_in_cats = { + 'ar': u'(أيضا في %s)', + 'ca': u'(també a %s)', + 'da': u'(også i %s)', + 'de': u'(auch in %s)', + 'en': u'(also in %s)', + 'es': u'(también en %s)', + 'fi': u'(myös luokassa %s)', + 'fr': u'(également dans %s)', + 'he': u'(גם בקטגוריות %s)', + 'ia': u'(equalmente in %s)', + 'is': u'(einnig í %s)', + 'kk': u'(тағы да %s дегенде)', + 'nds-nl': u'(oek in %s)', + 'nl': u'(ook in %s)', + 'no': u'(også i %s)', + 'nn': u'(òg i %s)', + 'pt': u'(também em %s)', + 'ru': u'(также в %s)', + 'sv': u'(också i %s)', + 'ср': u'(такође у %s)', + 'zh': u'(也在 %s)', + } + + result = u'#' * currentDepth + result += '[[:%s|%s]]' % (cat.title(), cat.title().split(':', 1)[1]) + result += ' (%d)' % len(self.catDB.getArticles(cat)) + # We will remove an element of this array, but will need the original array + # later, so we create a shallow copy with [:] + supercats = self.catDB.getSupercats(cat)[:] + # Find out which other cats are supercats of the current cat + try: + supercats.remove(parent) + except: + pass + if supercats != []: + supercat_names = [] + for i in range(len(supercats)): + # create a list of wiki links to the supercategories + supercat_names.append('[[:%s|%s]]' % (supercats[i].title(), supercats[i].title().split(':', 1)[1])) + # print this list, separated with commas, using translations given in also_in_cats + result += ' ' + wikipedia.translate(wikipedia.getSite(), also_in_cats) % ', '.join(supercat_names) + result += '\n' + if currentDepth < self.maxDepth: + for subcat in self.catDB.getSubcats(cat): + # recurse into subdirectories + result += self.treeview(subcat, currentDepth + 1, parent = cat) + else: + if self.catDB.getSubcats(cat) != 
if __name__ == "__main__":
    # Script entry point: parse the command line, run the robot that belongs
    # to the requested action, and always write the category cache back to
    # disk afterwards.
    fromGiven = False
    toGiven = False
    batchMode = False
    editSummary = ''
    inPlace = False
    overwrite = False
    showImages = False
    talkPages = False
    recurse = False
    titleRegex = None

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # The generator gives the pages that should be worked upon.
    # It has to stay a module-level name: add_category() reads it as a global.
    gen = None

    # If this is set to true then the custom edit summary given for removing
    # categories from articles will also be used as the deletion reason.
    useSummaryForDeletion = True
    action = None
    sort_by_last_name = False
    # Bound before the try block so that the cleanup code below can tell
    # whether the database was ever created.
    catDB = None
    try:
        catDB = CategoryDatabase()
        for arg in wikipedia.handleArgs():
            if arg in ('add', 'remove', 'move', 'tidy', 'tree', 'listify'):
                action = arg
            elif arg == '-person':
                sort_by_last_name = True
            elif arg == '-rebuild':
                catDB.rebuild()
            elif arg.startswith('-from:'):
                oldCatTitle = arg[len('-from:'):].replace('_', ' ')
                fromGiven = True
            elif arg.startswith('-to:'):
                newCatTitle = arg[len('-to:'):].replace('_', ' ')
                toGiven = True
            elif arg == '-batch':
                batchMode = True
            elif arg == '-inplace':
                inPlace = True
            elif arg == '-delsum':
                # This parameter is kept for historical reasons, as it was
                # not previously the default option.
                pass
            elif arg == '-nodelsum':
                useSummaryForDeletion = False
            elif arg == '-overwrite':
                overwrite = True
            elif arg == '-showimages':
                showImages = True
            elif arg.startswith('-summary:'):
                editSummary = arg[len('-summary:'):]
            elif arg.startswith('-match'):
                if len(arg) == len('-match'):
                    titleRegex = wikipedia.input(u'Which regular expression should affected objects match?')
                else:
                    titleRegex = arg[len('-match:'):]
            elif arg == '-talkpages':
                talkPages = True
            elif arg == '-recurse':
                recurse = True
            else:
                gen = genFactory.handleArg(arg)

        if action == 'add':
            if not gen:
                gen = genFactory.handleArg('-links') # default for backwards compatibility
            # The preloading generator is responsible for downloading multiple
            # pages from the wiki simultaneously.
            gen = pagegenerators.PreloadingGenerator(gen)
            add_category(sort_by_last_name)
        elif action == 'remove':
            if not fromGiven:
                oldCatTitle = wikipedia.input(u'Please enter the name of the category that should be removed:')
            # -match is documented for both move and remove, so the regex
            # has to be handed to the remove robot as well.
            bot = CategoryRemoveRobot(oldCatTitle, batchMode, editSummary, useSummaryForDeletion, titleRegex = titleRegex, inPlace = inPlace)
            bot.run()
        elif action == 'move':
            if not fromGiven:
                oldCatTitle = wikipedia.input(u'Please enter the old name of the category:')
            if not toGiven:
                newCatTitle = wikipedia.input(u'Please enter the new name of the category:')
            bot = CategoryMoveRobot(oldCatTitle, newCatTitle, batchMode, editSummary, inPlace, titleRegex = titleRegex)
            bot.run()
        elif action == 'tidy':
            catTitle = wikipedia.input(u'Which category do you want to tidy up?')
            bot = CategoryTidyRobot(catTitle, catDB)
            bot.run()
        elif action == 'tree':
            catTitle = wikipedia.input(u'For which category do you want to create a tree view?')
            filename = wikipedia.input(u'Please enter the name of the file where the tree should be saved, or press enter to simply show the tree:')
            bot = CategoryTreeRobot(catTitle, catDB, filename)
            bot.run()
        elif action == 'listify':
            if not fromGiven:
                oldCatTitle = wikipedia.input(u'Please enter the name of the category to listify:')
            if not toGiven:
                newCatTitle = wikipedia.input(u'Please enter the name of the list to create:')
            bot = CategoryListifyRobot(oldCatTitle, newCatTitle, editSummary, overwrite, showImages, subCats = True, talkPages = talkPages, recurse = recurse)
            bot.run()
        else:
            wikipedia.showHelp('category')
    finally:
        try:
            # Nothing to save if creating the database itself failed.
            if catDB is not None:
                catDB.dump()
        finally:
            # stopme() must run even when writing the dump fails.
            wikipedia.stopme()
pywikipedia-l@lists.wikimedia.org