Revision: 6256 Author: russblau Date: 2009-01-14 15:17:47 +0000 (Wed, 14 Jan 2009)
Log Message: ----------- Branch for port to rewrite
Added Paths: ----------- branches/rewrite/pywikibot/scripts/solve_disambiguation.py
Copied: branches/rewrite/pywikibot/scripts/solve_disambiguation.py (from rev 6255, trunk/pywikipedia/solve_disambiguation.py) =================================================================== --- branches/rewrite/pywikibot/scripts/solve_disambiguation.py (rev 0) +++ branches/rewrite/pywikibot/scripts/solve_disambiguation.py 2009-01-14 15:17:47 UTC (rev 6256) @@ -0,0 +1,1012 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +Script to help a human solve disambiguations by presenting a set of options. + +Specify the disambiguation page on the command line, or enter it at the +prompt after starting the program. (If the disambiguation page title starts +with a '-', you cannot name it on the command line, but you can enter it at +the prompt.) The program will pick up the page, and look for all +alternative links, and show them with a number adjacent to them. It will +then automatically loop over all pages referring to the disambiguation page, +and show 30 characters of context on each side of the reference to help you +make the decision between the alternatives. It will ask you to type the +number of the appropriate replacement, and perform the change. + +It is possible to choose to replace only the link (just type the number) or +replace both link and link-text (type 'r' followed by the number). + +Multiple references in one page will be scanned in order, but typing 'n' (next) +on any one of them will leave the complete page unchanged. To leave only some reference unchanged, use the 's' (skip) option. + +Command line options: + + -pos:XXXX adds XXXX as an alternative disambiguation + + -just only use the alternatives given on the command line, do not + read the page for other possibilities + + -primary "primary topic" disambiguation (Begriffsklärung nach Modell 2). + That's titles where one topic is much more important, the + disambiguation page is saved somewhere else, and the important + topic gets the nice name. + + -primary:XY like the above, but use XY as the only alternative, instead of + searching for alternatives in [[Keyword (disambiguation)]]. + Note: this is the same as -primary -just -pos:XY + + -file:XYZ reads a list of pages from a text file. XYZ is the name of the + file from which the list is taken. If XYZ is not given, the user is asked for a filename. + Page titles should be inside [[double brackets]]. + The -pos parameter won't work if -file is used. + + -always:XY instead of asking the user what to do, always perform the same + action. For example, XY can be "r0", "u" or "2". Be careful with + this option, and check the changes made by the bot. Note that + some choices for XY don't make sense and will result in a loop, + e.g. "l" or "m". + + -main only check pages in the main namespace, not in the talk, + wikipedia, user, etc. namespaces. + + -start:XY goes through all disambiguation pages in the category on your wiki + that is defined (to the bot) as the category containing disambiguation + pages, starting at XY. If only '-start' or '-start:' is given, it starts + at the beginning. + + -min:XX (XX being a number) only work on disambiguation pages for which + at least XX are to be worked on. + +To complete a move of a page, one can use: + + python solve_disambiguation.py -just -pos:New_Name Old_Name +""" +# +# (C) Rob W.W. Hooft, 2003 +# (C) Daniel Herding, 2004 +# (C) Andre Engels, 2003-2004 +# (C) WikiWichtel, 2004 +# +# Distributed under the terms of the MIT license. +# +__version__='$Id$' +# +# Standard library imports +import re, sys, codecs + +# Application specific imports +import wikipedia, pagegenerators, editarticle + +# Summary message when working on disambiguation pages +msg = { + 'ar': u'توضيح بمساعدة روبوت: %s - غير الوصلة أو الوصلات إلى %s', + 'cs': u'Odstranění linku na rozcestník [[%s]] s použitím robota - Změněn(y) odkaz(y) na %s', + 'en': u'Robot-assisted disambiguation: %s - Changed link(s) to %s', + 'es': u'Bot:Desambiguación asistida: %s - Cambiando enlace(s) para %s', + 'da': u'Retter flertydigt link til: %s - Ændrede link(s) til %s', + 'de': u'Bot-unterstützte Begriffsklärung: %s - Link(s) ersetzt durch %s', + 'fi': u'Täsmennystä botin avulla: %s korvattiin link(e)illä %s', + 'fr': u'Homonymie résolue à l'aide du robot: %s - Modifications du (des) lien(s) pour %s', + 'he': u'תיקון קישור לדף פירושונים באמצעות בוט: %s', + 'hu': u'Bottal végzett egyértelműsítés: %s –> %s', + 'ia': u'Disambiguation assistite per robot: %s - Changed link(s) to %s', + 'it': u'Sistemazione automatica della disambigua: %s - Inversione di redirect %s', + 'lt': u'Nuorodų į nukrepiamąjį straipsnį keitimas: %s - Pakeistos nuorodos į %s', + 'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Changed link(s) to %s', + 'ko': u'로봇의 도움을 받아 동음이의 처리 : [[%s]] - %s 문서로 링크 걸음', + 'nl': u'Robot-geholpen doorverwijzing: [[%s]] - Link(s) veranderd naar %s', + 'no': u'bot: Retter lenke til peker: %s - Endret lenke(r) til %s', + 'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Zmieniono link(i) %s', + 'pt': u'Desambiguação assistida por bot: %s link(s) mudado(s) para %s', + 'ru': u'Разрешение значений с помощью бота: %s - Changed link(s) to %s', + 'sr': u'Решавање вишезначних одредница помоћу бота: %s - Changed link(s) to %s', + 'sv': u'Länkar direkt till rätt artikel för: %s - Bytte länk(ar) till %s', + } + +# Summary message when working on disambiguation pages and the link is removed +msg_unlink = { + 'ar': u'توضيح بمساعدة روبوت: %s - أزال الوصلة أو الوصلات.', + 'cs': u'Odstranění linku na rozcestník [[%s]] s použitím robota - Odstraněn(y) odkaz(y)', + 'en': u'Robot-assisted disambiguation: %s - Removed link(s).', + 'da': u'Retter flertydigt link til: %s - Fjernede link(s)', + 'de': u'Bot-unterstützte Begriffsklärung: %s - Link(s) entfernt', + 'fi': u'Täsmennystä botin avulla: %s - poistettiin linkkejä.', + 'fr': u'Homonymie résolue à l'aide du robot: %s - Retrait du (des) lien(s)', + 'he': u'הסרת קישור לדף פירושונים באמצעות בוט: %s', + 'hu': u'Bottal végzett egyértelműsítés: %s – hivatkozások eltávolítása', + 'ia': u'Disambiguation assistite per robot: %s - Removed link(s).', + 'it': u'Sistemazione automatica della disambigua: %s - Collegamenti rimossi', + 'lt': u'Nuorodų į nukrepiamąjį straipsnį keitimas: %s - Pašalintos nuorodos', + 'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Removed link(s).', + 'ko': u'로봇의 도움을 받아 동음이의 처리: [[%s]] - 링크 제거', + 'nl': u'Robot-geholpen doorverwijzing: [[%s]] - Link(s) weggehaald.', + 'no': u'bot: Retter lenke til peker: %s - Fjernet lenke(r)', + 'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Usunięto link(i)', + 'pt': u'Desambiguação assistida por bot: %s link(s) removido(s)', + 'ru': u'Разрешение значений с помощью бота: %s - Removed link(s)', + 'sr': u'Решавање вишезначних одредница помоћу бота: %s - Removed link(s)', + 'sv': u'Länkar direkt till rätt artikel för: %s - Tog bort länk(ar)', + } + +# Summary message when working on redirects +msg_redir = { + 'ar': u'توضيح بمساعدة روبوت: %s - غير الوصلة أو الوصلات إلى %s', + 'cs': u'Robot opravil přesměrování na %s - Změněn(y) odkaz(y) na %s', + 'en': u'Robot-assisted disambiguation: %s - Changed link(s) to %s', + 'da': u'Retter flertydigt link til: %s - Ændrede link(s) til %s', + 'de': u'Bot-unterstützte Redirectauflösung: %s - Link(s) ersetzt durch %s', + 'fi': u'Täsmennystä botin avulla: %s korvattiin link(e)illä %s', + 'fr': u'Correction de lien vers redirect: %s - Modifications du (des) lien(s) pour %s', + 'he': u'תיקון קישור לדף פירושונים באמצעות בוט: %s שונה ל%s', + 'hu': u'Bottal végzett egyértelműsítés: %s –> %s', + 'ia': u'Resolution de redirectiones assistite per robot: %s - Changed link(s) to %s', + 'it': u'Sistemazione automatica del redirect: %s - Inversione di redirect %s', + 'lt': u'Nuorodų į peradresavimo straipsnį keitimas: %s - Pakeistos nuorodos į %s', + 'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Changed link(s) to %s', + 'ko': u'로봇의 도움을 받아 동음이의 처리: [[%s]] - %s 문서로 링크 걸음', + 'nl': u'Robot-geholpen redirect-oplossing: [[%s]] - Link(s) veranderd naar %s', + 'no': u'bot: Endrer omdirigeringslenke: %s - Endret lenke(r) til %s', + 'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Zmieniono link(i) %s', + 'pt': u'Desambiguação assistida por bot: %s link(s) mudados para %s', + 'ru': u'Разрешение значений с помощью бота: %s - Changed link(s) to %s', + 'sr': u'Решавање вишезначних одредница помоћу бота: %s - Changed link(s) to %s', + 'sv': u'Länkar direkt till rätt artikel för: %s - Bytte länk(ar) till %s', + } + +# Summary message when working on redirects and the link is removed +msg_redir_unlink = { + 'ar': u'توضيح بمساعدة روبوت: %s - أزال الوصلة أو الوصلات', + 'cs': u'Robot opravil přesměrování na %s - Odstraněn(y) odkaz(y)', + 'en': u'Robot-assisted disambiguation: %s - Removed link(s)', + 'da': u'Retter flertydigt link til: %s - Fjernede link(s)', + 'de': u'Bot-unterstützte Redirectauflösung: %s - Link(s) entfernt', + 'fr': u'Correction de lien vers redirect: %s - Retrait du (des) lien(s)', + 'fi': u'Täsmennystä botin avulla: %s - poistettiin linkkejä', + 'he': u'הסרת קישור לדף פירושונים באמצעות בוט: %s', + 'hu': u'Bottal támogatott egyértelműsítés: %s – hivatkozások eltávolítása', + 'ia': u'Resolution de redirectiones assistite per robot: %s - Removed link(s).', + 'it': u'Sistemazione automatica del redirect: %s - Collegamenti rimossi', + 'lt': u'Nuorodų į peradresavimo straipsnį keitimas: %s - Pašalintos nuorodos', + 'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Removed link(s).', + 'ko': u'로봇의 도움을 받아 동음이의 처리: [[%s]] - 링크 제거', + 'nl': u'Robot-geholpen redirect-oplossing: [[%s]] - Link(s) weggehaald', + 'no': u'bot: Endrer omdirigeringslenke: %s - Fjernet lenke(r)', + 'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Usunięto link(i)', + 'pt': u'Desambiguação assistida por bot: %s link(s) removidos', + 'ru': u'Разрешение значений с помощью бота: %s - Removed link(s)', + 'sr': u'Решавање вишезначних одредница помоћу бота: %s - Removed link(s)', + 'sv': u'Länkar direkt till rätt artikel för: %s - Tog bort länk(ar)', + } + +# Summary message to (unknown) +unknown_msg = { + 'ar' : u'(غير معروف)', + 'en' : u'(unknown)', + 'fi' : u'(tuntematon)', + 'hu' : u'(ismeretlen)', + 'pt' : u'(desconhecido)', + } + +# disambiguation page name format for "primary topic" disambiguations +# (Begriffsklärungen nach Modell 2) +primary_topic_format = { + 'ar': u'%s_(توضيح)', + 'cs': u'%s_(rozcestník)', + 'de': u'%s_(Begriffsklärung)', + 'en': u'%s_(disambiguation)', + 'fi': u'%s_(täsmennyssivu)', + 'hu': u'%s_(egyértelműsítő lap)', + 'ia': u'%s_(disambiguation)', + 'it': u'%s_(disambigua)', + 'lt': u'%s_(reikšmės)', + 'kk': u'%s_(айрық)', + 'ko': u'%s_(동음이의)', + 'nl': u'%s_(doorverwijspagina)', + 'no': u'%s_(peker)', + 'pl': u'%s_(ujednoznacznienie)', + 'pt': u'%s_(desambiguação)', + 'he': u'%s_(פירושונים)', + 'ru': u'%s_(значения)', + 'sr': u'%s_(вишезначна одредница)', + 'sv': u'%s_(olika betydelser)', + } + +# List pages that will be ignored if they got a link to a disambiguation +# page. An example is a page listing disambiguations articles. +# Special chars should be encoded with unicode (\x##) and space used +# instead of _ + +ignore_title = { + 'wikipedia': { + 'ar': [ + u'تصنيف:صفحات توضيح', + ], + 'cs': [ + u'Wikipedie:Chybějící interwiki/.+', + u'Wikipedie:Rozcestníky', + u'Wikipedie diskuse:Rozcestníky', + u'Wikipedie:Seznam nejvíce odkazovaných rozcestníků', + u'Wikipedie:Seznam rozcestníků/první typ', + u'Wikipedie:Seznam rozcestníků/druhý typ', + u'Wikipedista:Zirland/okres', + ], + 'da': [ + u'Wikipedia:Links til sider med flertydige titler' + ], + 'de': [ + u'Benutzer:Katharina/Begriffsklärungen', + u'Benutzer:Kirschblut/.+buchstabenkürzel', + u'Benutzer:Noisper/Dingliste/[A-Z]', + u'Benutzer:SirJective/.+', + u'Benutzer:SrbBot/Index/.+', + u'Benutzer Diskussion:.+', + u'GISLexikon ([A-Z])', + u'Lehnwort', + u'Liste griechischer Wortstämme in deutschen Fremdwörtern', + u'Liste von Gräzismen', + u'Portal:Abkürzungen/.+', + u'Wikipedia:Archiv:.+', + u'Wikipedia:Artikelwünsche/Ding-Liste/[A-Z]', + u'Wikipedia:Begriffsklärung.*', + u'Wikipedia:Dreibuchstabenkürzel von [A-Z][A-Z][A-Z] bis [A-Z][A-Z][A-Z]', + u'Wikipedia:Interwiki-Konflikte', + u'Wikipedia:Kurze Artikel', + u'Wikipedia:Liste aller 2-Buchstaben-Kombinationen', + u'Wikipedia:Liste mathematischer Themen/BKS', + u'Wikipedia:Liste mathematischer Themen/Redirects', + u'Wikipedia:Löschkandidaten/.+', + u'Wikipedia:Qualitätsoffensive/UNO', #requested by Benutzer:Addicted + u'Wikipedia:WikiProjekt Altertumswissenschaft/.+', + u'Wikipedia:WikiProjekt Verwaiste Seiten/Begriffsklärungen', + ], + 'en': [ + u'Wikipedia:Links to disambiguating pages', + u'Wikipedia:Disambiguation pages with links', + u'Wikipedia:Multiple-place names ([A-Z])', + u'Wikipedia:Non-unique personal name', + u"User:Jerzy/Disambiguation Pages i've Editted", + u'User:Gareth Owen/inprogress', + u'TLAs from [A-Z][A-Z][A-Z] to [A-Z][A-Z][A-Z]', + u'List of all two-letter combinations', + u'User:Daniel Quinlan/redirects.+', + u'User:Oliver Pereira/stuff', + u'Wikipedia:French Wikipedia language links', + u'Wikipedia:Polish language links', + u'Wikipedia:Undisambiguated abbreviations/.+', + u'List of acronyms and initialisms', + u'Wikipedia:Usemod article histories', + u'User:Pizza Puzzle/stuff', + u'List of generic names of political parties', + u'Talk:List of initialisms/marked', + u'Talk:List of initialisms/sorted', + u'Talk:Programming language', + u'Talk:SAMPA/To do', + u"Wikipedia:Outline of Roget's Thesaurus", + u'User:Wik/Articles', + u'User:Egil/Sandbox', + u'Wikipedia talk:Make only links relevant to the context', + u'Wikipedia:Common words, searching for which is not possible', + ], + 'fi': [ + u'Wikipedia:Luettelo täsmennyssivuista', + u'Wikipedia:Luettelo (täsmennyssivuista)', + u'Wikipedia:Täsmennyssivu', + ], + 'fr': [ + u'Wikipédia:Liens aux pages d'homonymie', + u'Wikipédia:Homonymie', + u'Wikipédia:Homonymie/Homonymes dynastiques', + u'Wikipédia:Prise de décision, noms des membres de dynasties/liste des dynastiens', + u'Liste de toutes les combinaisons de deux lettres', + u'Wikipédia:Log d'upload/.*', + u'Sigles de trois lettres de [A-Z]AA à [A-Z]ZZ', + u'Wikipédia:Pages sans interwiki,.' + ], + 'fy': [ + u'Wikipedy:Fangnet', + ], + 'ia': [ + u'Categoria:Disambiguation', + u'Wikipedia:.+', + u'Usator:.+', + u'Discussion Usator:.+', + ], + 'it': [ + u'Aiuto:Disambigua/Disorfanamento', + u'Discussioni utente:.+', + u'Utente:Civvì/disorfanamento', + ], + 'kk': [ + u'Санат:Айрықты бет', + ], + 'ko': [ + u'위키백과:(동음이의) 문서의 목록', + u'위키백과:동음이의어 문서의 목록', + ], + 'lt': [ + u'Wikipedia:Rodomi nukreipiamieji straipsniai', + ], + 'nl': [ + u"Gebruiker:.*", + u"Overleg gebruiker:.+[aA]rchief.*", + u"Overleg gebruiker:Pven", + u"Portaal:.+[aA]rchief.*", + u"Wikipedia:Humor en onzin.*", + u"Wikipedia:Links naar doorverwijspagina's/Winkeldochters.*", + u"Wikipedia:Project aanmelding bij startpagina's", + u"Wikipedia:Wikiproject Roemeense gemeenten/Doorverwijspagina's", + u'Categorie:Doorverwijspagina', + u'Lijst van Nederlandse namen van pausen', + u'Overleg Wikipedia:Discussie spelling 2005', + u'Overleg Wikipedia:Doorverwijspagina', + u'Overleg Wikipedia:Logboek.*', + u'Wikipedia:Logboek.*', + u'Overleg gebruiker:Sybren/test.*', + u'Overleg gebruiker:[0-9][0-9]?[0-9]?.[0-9][0-9]?[0-9]?.[0-9][0-9]?[0-9]?.[0-9][0-9]?[0-9]?', + u'Overleg:Lage Landen (staatkunde)', + u'Wikipedia:.*[aA]rchief.*', + u'Wikipedia:Doorverwijspagina', + u'Wikipedia:Lijst van alle tweeletter-combinaties', + u'Wikipedia:Onderhoudspagina', + u'Wikipedia:Ongelijke redirects', + u'Wikipedia:Protection log', + u'Wikipedia:Te verwijderen.*', + u'Wikipedia:Top 1000 van meest bekeken artikelen', + u'Wikipedia:Wikipedianen met een encyclopedisch artikel', + u'Wikipedia:Woorden die niet als zoekterm gebruikt kunnen worden', + ], + 'pl': [ + u'Wikipedysta:.+', + u'Dyskusja.+:.+', + ], + 'pt': [ + u'Usuário:.+', + u'Usuário Discussão:.+', + u'Discussão:.+', + u'Lista de combinações de duas letras', + u'Wikipedia:Lista de páginas de desambiguação.+', + u'Wikipedia:Páginas para eliminar/.+', + ], + 'ru': [ + u'Категория:Disambig', + u'Википедия:Страницы разрешения неоднозначностей', + u'Википедия:Вики-уборка/Статьи без языковых ссылок', + u'Википедия:Страницы с пометкой «(значения)»', + u'Список общерусских фамилий', + ], + }, + 'memoryalpha': { + 'en': [ + u'Memory Alpha:Links to disambiguating pages' + ], + 'de': [ + u'Memory Alpha:Liste der Wortklärungsseiten' + ], + }, +} + +def firstcap(string): + return string[0].upper()+string[1:] + +def correctcap(link, text): + # If text links to a page with title link uncapitalized, uncapitalize link, otherwise capitalize it + linkupper = link.title() + linklower = linkupper[0].lower() + linkupper[1:] + if text.find("[[%s]]"%linklower) > -1 or text.find("[[%s|"%linklower) > -1: + return linklower + else: + return linkupper + +class ReferringPageGeneratorWithIgnore: + def __init__(self, disambPage, primary=False, minimum = 0): + self.disambPage = disambPage + # if run with the -primary argument, enable the ignore manager + self.primaryIgnoreManager = PrimaryIgnoreManager(disambPage, + enabled=primary) + self.minimum = minimum + + def __iter__(self): + # TODO: start yielding before all referring pages have been found + refs = [page for page in self.disambPage.getReferences(follow_redirects = False, withTemplateInclusion = False)] + wikipedia.output(u"Found %d references." % len(refs)) + # Remove ignorables + if ignore_title.has_key(self.disambPage.site().family.name) and ignore_title[self.disambPage.site().family.name].has_key(self.disambPage.site().lang): + for ig in ignore_title[self.disambPage.site().family.name][self.disambPage.site().lang]: + for i in range(len(refs)-1, -1, -1): + if re.match(ig, refs[i].title()): + if wikipedia.verbose: + wikipedia.output('Ignoring page %s' + % refs[i].title()) + del refs[i] + elif self.primaryIgnoreManager.isIgnored(refs[i]): + #wikipedia.output('Ignoring page %s because it was skipped before' % refs[i].title()) + del refs[i] + if len(refs) < self.minimum: + wikipedia.output(u"Found only %d pages to work on; skipping." % len(refs)) + return + wikipedia.output(u"Will work on %d pages." % len(refs)) + for ref in refs: + yield ref + +class PrimaryIgnoreManager(object): + ''' + If run with the -primary argument, reads from a file which pages should + not be worked on; these are the ones where the user pressed n last time. + If run without the -primary argument, doesn't ignore any pages. + ''' + def __init__(self, disambPage, enabled = False): + self.disambPage = disambPage + self.enabled = enabled + + self.ignorelist = [] + filename = wikipedia.config.datafilepath('disambiguations', + self.disambPage.titleForFilename() + '.txt') + try: + # The file is stored in the disambiguation/ subdir. Create if necessary. + f = codecs.open(filename, 'r', 'utf-8') + for line in f.readlines(): + # remove trailing newlines and carriage returns + while line[-1] in ['\n', '\r']: + line = line[:-1] + #skip empty lines + if line != '': + self.ignorelist.append(line) + f.close() + except IOError: + pass + + def isIgnored(self, refPage): + return self.enabled and refPage.urlname() in self.ignorelist + + def ignore(self, refPage): + if self.enabled: + # Skip this occurence next time. + filename = wikipedia.config.datafilepath('disambiguations', + self.disambPage.urlname() + '.txt') + try: + # Open file for appending. If none exists yet, create a new one. + # The file is stored in the disambiguation/ subdir. Create if necessary. + f = codecs.open(filename, 'a', 'utf-8') + f.write(refPage.urlname() + '\n') + f.close() + except IOError: + pass + + +class DisambiguationRobot(object): + ignore_contents = { + 'de':(u'{{[Ii]nuse}}', + u'{{[Ll]öschen}}', + ), + 'fi':(u'{{[Tt]yöstetään}}', + ), + 'kk':(u'{{[Ii]nuse}}', + u'{{[Pp]rocessing}}', + ), + 'nl':(u'{{wiu2}}', + u'{{nuweg}}', + ), + 'ru':(u'{{[Ii]nuse}}', + u'{{[Pp]rocessing}}', + ), + } + + primary_redir_template = { + # Page.templates() format, first letter uppercase + 'hu': u'Egyért-redir', + } + + def __init__(self, always, alternatives, getAlternatives, generator, primary, main_only, minimum = 0): + self.always = always + self.alternatives = alternatives + self.getAlternatives = getAlternatives + self.generator = generator + self.primary = primary + self.main_only = main_only + self.minimum = minimum + + self.mysite = wikipedia.getSite() + self.mylang = self.mysite.language() + self.comment = None + + self.setupRegexes() + + def checkContents(self, text): + ''' + For a given text, returns False if none of the regular + expressions given in the dictionary at the top of this class + matches a substring of the text. + Otherwise returns the substring which is matched by one of + the regular expressions. + ''' + for ig in self.ignore_contents_regexes: + match = ig.search(text) + if match: + return match.group() + return None + + def makeAlternativesUnique(self): + # remove duplicate entries + result={} + for i in self.alternatives: + result[i]=None + self.alternatives = result.keys() + + def listAlternatives(self): + list = u'\n' + for i in range(len(self.alternatives)): + list += (u"%3i - %s\n" % (i, self.alternatives[i])) + wikipedia.output(list) + + def setupRegexes(self): + # compile regular expressions + self.ignore_contents_regexes = [] + if self.ignore_contents.has_key(self.mylang): + for ig in self.ignore_contents[self.mylang]: + self.ignore_contents_regexes.append(re.compile(ig)) + + linktrail = self.mysite.linktrail() + self.trailR = re.compile(linktrail) + # The regular expression which finds links. Results consist of four groups: + # group title is the target page title, that is, everything before | or ]. + # group section is the page section. It'll include the # to make life easier for us. + # group label is the alternative link title, that's everything between | and ]. + # group linktrail is the link trail, that's letters after ]] which are part of the word. + # note that the definition of 'letter' varies from language to language. + self.linkR = re.compile(r'[[(?P<title>[^]|#]*)(?P<section>#[^]|]*)?(|(?P<label>[^]]*))?]](?P<linktrail>' + linktrail + ')') + + def treat(self, refPage, disambPage): + """ + Parameters: + disambPage - The disambiguation page or redirect we don't want anything + to link on + refPage - A page linking to disambPage + Returns False if the user pressed q to completely quit the program. + Otherwise, returns True. + """ + # TODO: break this function up into subroutines! + + include = False + unlink = False + new_targets = [] + try: + text=refPage.get(throttle=False) + ignoreReason = self.checkContents(text) + if ignoreReason: + wikipedia.output('\n\nSkipping %s because it contains %s.\n\n' % (refPage.title(), ignoreReason)) + else: + include = True + except wikipedia.IsRedirectPage: + wikipedia.output(u'%s is a redirect to %s' % (refPage.title(), disambPage.title())) + if disambPage.isRedirectPage(): + target = self.alternatives[0] + choice = wikipedia.inputChoice(u'Do you want to make redirect %s point to %s?' % (refPage.title(), target), ['yes', 'no'], ['y', 'N'], 'N') + if choice == 'y': + redir_text = '#%s [[%s]]' % (self.mysite.redirect(default=True), target) + try: + refPage.put_async(redir_text,comment=self.comment) + except wikipedia.PageNotSaved, error: + wikipedia.output(u'Page not saved: %s' % error.args) + else: + choice = wikipedia.inputChoice(u'Do you want to work on pages linking to %s?' % refPage.title(), ['yes', 'no', 'change redirect'], ['y', 'N', 'c'], 'N') + if choice == 'y': + gen = ReferringPageGeneratorWithIgnore(refPage, self.primary) + preloadingGen = pagegenerators.PreloadingGenerator(gen) + for refPage2 in preloadingGen: + # run until the user selected 'quit' + if not self.treat(refPage2, refPage): + break + elif choice == 'c': + text=refPage.get(throttle=False,get_redirect=True) + include = "redirect" + except wikipedia.NoPage: + wikipedia.output(u'Page [[%s]] does not seem to exist?! Skipping.' % refPage.title()) + include = False + if include in (True, "redirect"): + # make a backup of the original text so we can show the changes later + original_text = text + n = 0 + curpos = 0 + edited = False + # This loop will run until we have finished the current page + while True: + m = self.linkR.search(text, pos = curpos) + if not m: + if n == 0: + wikipedia.output(u"No changes necessary in %s" % refPage.title()) + return True + else: + # stop loop and save page + break + # Make sure that next time around we will not find this same hit. + curpos = m.start() + 1 + # ignore interwiki links and links to sections of the same page + if m.group('title') == '' or self.mysite.isInterwikiLink(m.group('title')): + continue + else: + try: + linkPage = wikipedia.Page(disambPage.site(), m.group('title')) + # Check whether the link found is to disambPage. + except wikipedia.InvalidTitle: + continue + if linkPage != disambPage: + continue + + n += 1 + # how many bytes should be displayed around the current link + context = 60 + # This loop will run while the user doesn't choose an option + # that will actually change the page + while True: + # Show the title of the page where the link was found. + # Highlight the title in purple. + wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % refPage.title()) + + # at the beginning of the link, start red color. + # at the end of the link, reset the color to default + wikipedia.output(text[max(0, m.start() - context) : m.start()] + '\03{lightred}' + text[m.start() : m.end()] + '\03{default}' + text[m.end() : m.end() + context]) + + if not self.always: + if edited: + choice = wikipedia.input(u"Option (#, r#, s=skip link, e=edit page, n=next page, u=unlink, q=quit\n" + " m=more context, l=list, a=add new, x=save in this form):") + else: + choice = wikipedia.input(u"Option (#, r#, s=skip link, e=edit page, n=next page, u=unlink, q=quit\n" + " m=more context, d=show disambiguation page, l=list, a=add new):") + else: + choice = self.always + if choice in ['a', 'A']: + newAlternative = wikipedia.input(u'New alternative:') + self.alternatives.append(newAlternative) + self.listAlternatives() + elif choice in ['e', 'E']: + editor = editarticle.TextEditor() + newText = editor.edit(text, jumpIndex = m.start(), highlight = disambPage.title()) + # if user didn't press Cancel + if newText and newText != text: + text = newText + break + elif choice in ['d', 'D']: + editor = editarticle.TextEditor() + if disambPage.isRedirectPage(): + disambredir = disambPage.getRedirectTarget() + disambigText = editor.edit(disambredir.get(), jumpIndex = m.start(), highlight = disambredir.title()) + else: + disambigText = editor.edit(disambPage.get(), jumpIndex = m.start(), highlight = disambPage.title()) + elif choice in ['l', 'L']: + self.listAlternatives() + elif choice in ['m', 'M']: + # show more text around the link we're working on + context *= 2 + else: + break + + if choice in ['e', 'E']: + # user has edited the page and then pressed 'OK' + edited = True + curpos = 0 + continue + elif choice in ['n', 'N']: + # skip this page + if self.primary: + # If run with the -primary argument, skip this occurence next time. + self.primaryIgnoreManager.ignore(refPage) + return True + elif choice in ['q', 'Q']: + # quit the program + return False + elif choice in ['s', 'S']: + # Next link on this page + n -= 1 + continue + elif choice in ['x', 'X'] and edited: + # Save the page as is + break + + # The link looks like this: + # [[page_title|link_text]]trailing_chars + page_title = m.group('title') + link_text = m.group('label') + + if not link_text: + # or like this: [[page_title]]trailing_chars + link_text = page_title + if m.group('section') == None: + section = '' + else: + section = m.group('section') + trailing_chars = m.group('linktrail') + if trailing_chars: + link_text += trailing_chars + + if choice in ['u', 'U']: + # unlink - we remove the section if there's any + text = text[:m.start()] + link_text + text[m.end():] + unlink = True + continue + else: + if len(choice)>0 and choice[0] == 'r': + # we want to throw away the original link text + replaceit = True + choice = choice[1:] + elif include == "redirect": + replaceit = True + else: + replaceit = False + + try: + choice=int(choice) + except ValueError: + wikipedia.output(u"Unknown option") + # step back to ask the user again what to do with the current link + curpos -= 1 + continue + if choice >= len(self.alternatives) or choice < 0: + wikipedia.output(u"Choice out of range. Please select a number between 0 and %i." % (len(self.alternatives) - 1)) + # show list of possible choices + self.listAlternatives() + # step back to ask the user again what to do with the current link + curpos -= 1 + continue + new_page_title = self.alternatives[choice] + repPl = wikipedia.Page(disambPage.site(), new_page_title) + if (new_page_title[0].isupper()) or (link_text[0].isupper()): + new_page_title = repPl.title() + else: + new_page_title = repPl.title() + new_page_title = new_page_title[0].lower() + new_page_title[1:] + if new_page_title not in new_targets: + new_targets.append(new_page_title) + if replaceit and trailing_chars: + newlink = "[[%s%s]]%s" % (new_page_title, section, trailing_chars) + elif replaceit or (new_page_title == link_text and not section): + newlink = "[[%s]]" % new_page_title + # check if we can create a link with trailing characters instead of a pipelink + elif len(new_page_title) <= len(link_text) and firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title) and re.sub(self.trailR, '', link_text[len(new_page_title):]) == '' and not section: + newlink = "[[%s]]%s" % (link_text[:len(new_page_title)], link_text[len(new_page_title):]) + else: + newlink = "[[%s%s|%s]]" % (new_page_title, section, link_text) + text = text[:m.start()] + newlink + text[m.end():] + continue + + wikipedia.output(text[max(0,m.start()-30):m.end()+30]) + if text == original_text: + wikipedia.output(u'\nNo changes have been made:\n') + else: + wikipedia.output(u'\nThe following changes have been made:\n') + wikipedia.showDiff(original_text, text) + wikipedia.output(u'') + # save the page + self.setSummaryMessage(disambPage, new_targets, unlink) + try: + refPage.put_async(text,comment=self.comment) + except wikipedia.LockedPage: + wikipedia.output(u'Page not saved: page is locked') + except wikipedia.PageNotSaved, error: + wikipedia.output(u'Page not saved: %s' % error.args) + return True + + def findAlternatives(self, disambPage): + if disambPage.isRedirectPage() and not self.primary: + if self.primary_redir_template.has_key(disambPage.site().lang) and self.primary_redir_template[disambPage.site().lang] in disambPage.templates(get_redirect = True): + baseTerm = disambPage.title() + for template in disambPage.templatesWithParams(get_redirect = True): + if template[0] == self.primary_redir_template[disambPage.site().lang] and len(template[1]) > 0: + baseTerm = template[1][1] + disambTitle = primary_topic_format[self.mylang] % baseTerm + try: + disambPage2 = wikipedia.Page(self.mysite, disambTitle) + links = disambPage2.linkedPages() + links = [correctcap(l,disambPage2.get()) for l in links] + except wikipedia.NoPage: + wikipedia.output(u"No page at %s, using redirect target." % disambTitle) + links = disambPage.linkedPages()[:1] + links = [correctcap(l,disambPage.get(get_redirect = True)) for l in links] + self.alternatives += links + else: + try: + target = disambPage.getRedirectTarget().title() + self.alternatives.append(target) + except wikipedia.NoPage: + wikipedia.output(u"The specified page was not found.") + user_input = wikipedia.input(u"""\ +Please enter the name of the page where the redirect should have pointed at, +or press enter to quit:""") + if user_input == "": + sys.exit(1) + else: + self.alternatives.append(user_input) + except wikipedia.IsNotRedirectPage: + wikipedia.output( + u"The specified page is not a redirect. Skipping.") + return False + elif self.getAlternatives: + try: + if self.primary: + try: + disambPage2 = wikipedia.Page(self.mysite, + primary_topic_format[self.mylang] + % disambPage.title() + ) + links = disambPage2.linkedPages() + links = [correctcap(l,disambPage2.get()) for l in links] + except wikipedia.NoPage: + wikipedia.output(u"Page does not exist, using the first link in page %s." % disambPage.title()) + links = disambPage.linkedPages()[:1] + links = [correctcap(l,disambPage.get()) for l in links] + else: + try: + links = disambPage.linkedPages() + links = [correctcap(l,disambPage.get()) for l in links] + except wikipedia.NoPage: + wikipedia.output(u"Page does not exist, skipping.") + return False + except wikipedia.IsRedirectPage: + wikipedia.output(u"Page is a redirect, skipping.") + return False + self.alternatives += links + return True + + def setSummaryMessage(self, disambPage, new_targets = [], unlink = False): + # make list of new targets + targets = '' + for page_title in new_targets: + targets += u'[[%s]], ' % page_title + # remove last comma + targets = targets[:-2] + + if not targets: + targets = wikipedia.translate(self.mysite, unknown_msg) + + # first check whether user has customized the edit comment + if wikipedia.config.disambiguation_comment.has_key(self.mysite.family.name) and wikipedia.config.disambiguation_comment[self.mysite.family.name].has_key(self.mylang): + try: + self.comment = wikipedia.translate(self.mysite, + wikipedia.config.disambiguation_comment[ + self.mysite.family.name] + ) % (disambPage.title(), targets) + #Backwards compatibility, type error probably caused by too many arguments for format string + except TypeError: + self.comment = wikipedia.translate(self.mysite, + wikipedia.config.disambiguation_comment[ + self.mysite.family.name] + ) % disambPage.title() + elif disambPage.isRedirectPage(): + # when working on redirects, there's another summary message + if unlink and not new_targets: + self.comment = wikipedia.translate(self.mysite, msg_redir_unlink) % disambPage.title() + else: + self.comment = wikipedia.translate(self.mysite, msg_redir) % (disambPage.title(), targets) + else: + if unlink and not new_targets: + self.comment = wikipedia.translate(self.mysite, msg_unlink) % disambPage.title() + else: + self.comment = wikipedia.translate(self.mysite, msg) % (disambPage.title(), targets) + + def run(self): + if self.main_only: + if not ignore_title.has_key(self.mysite.family.name): + ignore_title[self.mysite.family.name] = {} + if not ignore_title[self.mysite.family.name].has_key(self.mylang): + ignore_title[self.mysite.family.name][self.mylang] = [] + ignore_title[self.mysite.family.name][self.mylang] += [ + u'%s:' % namespace for namespace in self.mysite.namespaces()] + + for disambPage in self.generator: + self.primaryIgnoreManager = PrimaryIgnoreManager(disambPage, enabled=self.primary) + + if not self.findAlternatives(disambPage): + continue + + self.makeAlternativesUnique() + # sort possible choices + if wikipedia.config.sort_ignore_case: + self.alternatives.sort(lambda x,y: cmp(x.lower(), y.lower())) + else: + self.alternatives.sort() + self.listAlternatives() + + gen = ReferringPageGeneratorWithIgnore(disambPage, self.primary, minimum = self.minimum) + preloadingGen = pagegenerators.PreloadingGenerator(gen) + for refPage in preloadingGen: + if not self.primaryIgnoreManager.isIgnored(refPage): + # run until the user selected 'quit' + if not self.treat(refPage, disambPage): + break + + # clear alternatives before working on next disambiguation page + self.alternatives = [] + +def main(): + # the option that's always selected when the bot wonders what to do with + # a link. If it's None, the user is prompted (default behaviour). + always = None + alternatives = [] + getAlternatives = True + # if the -file argument is used, page titles are dumped in this array. + # otherwise it will only contain one page. + generator = None + # This temporary array is used to read the page title if one single + # page to work on is specified by the arguments. + pageTitle = [] + primary = False + main_only = False + + # For sorting the linked pages, case can be ignored + ignoreCase = False + minimum = 0 + + for arg in wikipedia.handleArgs(): + if arg.startswith('-primary:'): + primary = True + getAlternatives = False + alternatives.append(arg[9:]) + elif arg == '-primary': + primary = True + elif arg.startswith('-always:'): + always = arg[8:] + elif arg.startswith('-file'): + if len(arg) == 5: + generator = pagegenerators.TextfilePageGenerator(filename = None) + else: + generator = pagegenerators.TextfilePageGenerator(filename = arg[6:]) + elif arg.startswith('-pos:'): + if arg[5]!=':': + mysite = wikipedia.getSite() + page = wikipedia.Page(mysite, arg[5:]) + if page.exists(): + alternatives.append(page.title()) + else: + answer = wikipedia.inputChoice(u'Possibility %s does not actually exist. Use it anyway?' + % page.title(), ['yes', 'no'], ['y', 'N'], 'N') + if answer == 'y': + alternatives.append(page.title()) + else: + alternatives.append(arg[5:]) + elif arg == '-just': + getAlternatives = False + elif arg == '-main': + main_only = True + elif arg.startswith('-min:'): + minimum = int(arg[5:]) + elif arg.startswith('-start'): + try: + if len(arg) <= len('-start:'): + generator = pagegenerators.CategorizedPageGenerator(wikipedia.getSite().disambcategory()) + else: + generator = pagegenerators.CategorizedPageGenerator(wikipedia.getSite().disambcategory(), start = arg[7:]) + generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0]) + except wikipedia.NoPage: + print "Disambiguation category for your wiki is not known." + raise + elif arg.startswith("-"): + print "Unrecognized command line argument: %s" % arg + # show help text and exit + wikipedia.showHelp() + else: + pageTitle.append(arg) + + # if the disambiguation page is given as a command line argument, + # connect the title's parts with spaces + if pageTitle != []: + pageTitle = ' '.join(pageTitle) + page = wikipedia.Page(wikipedia.getSite(), pageTitle) + generator = iter([page]) + + # if no disambiguation pages was given as an argument, and none was + # read from a file, query the user + if not generator: + pageTitle = wikipedia.input(u'On which disambiguation page do you want to work?') + page = wikipedia.Page(wikipedia.getSite(), pageTitle) + generator = iter([page]) + + bot = DisambiguationRobot(always, alternatives, getAlternatives, generator, primary, main_only, minimum = minimum) + bot.run() + + + +if __name__ == "__main__": + try: + main() + finally: + wikipedia.stopme()
pywikipedia-l@lists.wikimedia.org