[Pywikipedia-l] SVN: [6256] branches/rewrite/pywikibot/scripts/solve_disambiguation.py

russblau at svn.wikimedia.org russblau at svn.wikimedia.org
Wed Jan 14 15:17:47 UTC 2009


Revision: 6256
Author:   russblau
Date:     2009-01-14 15:17:47 +0000 (Wed, 14 Jan 2009)

Log Message:
-----------
Branch for port to rewrite

Added Paths:
-----------
    branches/rewrite/pywikibot/scripts/solve_disambiguation.py

Copied: branches/rewrite/pywikibot/scripts/solve_disambiguation.py (from rev 6255, trunk/pywikipedia/solve_disambiguation.py)
===================================================================
--- branches/rewrite/pywikibot/scripts/solve_disambiguation.py	                        (rev 0)
+++ branches/rewrite/pywikibot/scripts/solve_disambiguation.py	2009-01-14 15:17:47 UTC (rev 6256)
@@ -0,0 +1,1012 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+Script to help a human solve disambiguations by presenting a set of options.
+
+Specify the disambiguation page on the command line, or enter it at the
+prompt after starting the program. (If the disambiguation page title starts
+with a '-', you cannot name it on the command line, but you can enter it at
+the prompt.)  The program will pick up the page, and look for all
+alternative links, and show them with a number adjacent to them.  It will
+then automatically loop over all pages referring to the disambiguation page,
+and show 60 characters of context on each side of the reference to help you
+make the decision between the alternatives.  It will ask you to type the
+number of the appropriate replacement, and perform the change.
+
+It is possible to choose to replace only the link (just type the number) or
+replace both link and link-text (type 'r' followed by the number).
+
+Multiple references in one page will be scanned in order, but typing 'n' (next)
+on any one of them will leave the complete page unchanged. To leave only a
+single reference unchanged, use the 's' (skip) option.
+
+Command line options:
+
+   -pos:XXXX   adds XXXX as an alternative disambiguation
+
+   -just       only use the alternatives given on the command line, do not
+               read the page for other possibilities
+
+   -primary    "primary topic" disambiguation (Begriffsklärung nach Modell 2).
+               This applies to titles where one topic is much more important:
+               the disambiguation page is saved somewhere else, and the
+               important topic gets the nice name.
+
+   -primary:XY like the above, but use XY as the only alternative, instead of
+               searching for alternatives in [[Keyword (disambiguation)]].
+               Note: this is the same as -primary -just -pos:XY
+
+   -file:XYZ   reads a list of pages from a text file. XYZ is the name of the
+               file from which the list is taken. If XYZ is not given, the user is asked for a filename.
+               Page titles should be inside [[double brackets]].
+               The -pos parameter won't work if -file is used.
+
+   -always:XY  instead of asking the user what to do, always perform the same
+               action. For example, XY can be "r0", "u" or "2". Be careful with
+               this option, and check the changes made by the bot. Note that
+               some choices for XY don't make sense and will result in a loop,
+               e.g. "l" or "m".
+
+   -main       only check pages in the main namespace, not in the talk,
+               wikipedia, user, etc. namespaces.
+
+   -start:XY   goes through all disambiguation pages in the category on your wiki
+               that is defined (to the bot) as the category containing disambiguation
+               pages, starting at XY. If only '-start' or '-start:' is given, it starts
+               at the beginning.
+               
+   -min:XX     (XX being a number) only work on disambiguation pages for which
+               at least XX referring pages are to be worked on.
+
+To complete a move of a page, one can use:
+
+    python solve_disambiguation.py -just -pos:New_Name Old_Name
+"""
+#
+# (C) Rob W.W. Hooft, 2003
+# (C) Daniel Herding, 2004
+# (C) Andre Engels, 2003-2004
+# (C) WikiWichtel, 2004
+#
+# Distributed under the terms of the MIT license.
+#
+__version__='$Id$'
+#
+# Standard library imports
+import re, sys, codecs
+
+# Application specific imports
+import wikipedia, pagegenerators, editarticle
+
+# Summary message when working on disambiguation pages
+msg = {
+    'ar': u'توضيح بمساعدة روبوت: %s - غير الوصلة أو الوصلات إلى %s',
+    'cs': u'Odstranění linku na rozcestník [[%s]] s použitím robota - Změněn(y) odkaz(y) na %s',
+    'en': u'Robot-assisted disambiguation: %s - Changed link(s) to %s',
+    'es': u'Bot:Desambiguación asistida: %s - Cambiando enlace(s) para %s',
+    'da': u'Retter flertydigt link til: %s - Ændrede link(s) til %s',
+    'de': u'Bot-unterstützte Begriffsklärung: %s - Link(s) ersetzt durch %s',
+    'fi': u'Täsmennystä botin avulla: %s korvattiin link(e)illä %s',
+    'fr': u'Homonymie résolue à l\'aide du robot: %s - Modifications du (des) lien(s) pour %s',
+    'he': u'תיקון קישור לדף פירושונים באמצעות בוט: %s',
+    'hu': u'Bottal végzett egyértelműsítés: %s –> %s',
+    'ia': u'Disambiguation assistite per robot: %s - Changed link(s) to %s',
+    'it': u'Sistemazione automatica della disambigua: %s - Inversione di redirect %s',
+    'lt': u'Nuorodų į nukrepiamąjį straipsnį keitimas: %s - Pakeistos nuorodos į %s',
+    'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Changed link(s) to %s',
+    'ko': u'로봇의 도움을 받아 동음이의 처리 : [[%s]] - %s 문서로 링크 걸음',
+    'nl': u'Robot-geholpen doorverwijzing: [[%s]] - Link(s) veranderd naar %s',
+    'no': u'bot: Retter lenke til peker: %s - Endret lenke(r) til %s',
+    'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Zmieniono link(i) %s',
+    'pt': u'Desambiguação assistida por bot: %s link(s) mudado(s) para %s',
+    'ru': u'Разрешение значений с помощью бота: %s - Changed link(s) to %s',
+    'sr': u'Решавање вишезначних одредница помоћу бота: %s - Changed link(s) to %s',
+    'sv': u'Länkar direkt till rätt artikel för: %s - Bytte länk(ar) till %s',
+    }
+
+# Summary message when working on disambiguation pages and the link is removed
+msg_unlink = {
+    'ar': u'توضيح بمساعدة روبوت: %s - أزال الوصلة أو الوصلات.',
+    'cs': u'Odstranění linku na rozcestník [[%s]] s použitím robota - Odstraněn(y) odkaz(y)',
+    'en': u'Robot-assisted disambiguation: %s - Removed link(s).',
+    'da': u'Retter flertydigt link til: %s - Fjernede link(s)',
+    'de': u'Bot-unterstützte Begriffsklärung: %s - Link(s) entfernt',
+    'fi': u'Täsmennystä botin avulla: %s - poistettiin linkkejä.',
+    'fr': u'Homonymie résolue à l\'aide du robot: %s - Retrait du (des) lien(s)',
+    'he': u'הסרת קישור לדף פירושונים באמצעות בוט: %s',
+    'hu': u'Bottal végzett egyértelműsítés: %s – hivatkozások eltávolítása',
+    'ia': u'Disambiguation assistite per robot: %s - Removed link(s).',
+    'it': u'Sistemazione automatica della disambigua: %s - Collegamenti rimossi',
+    'lt': u'Nuorodų į nukrepiamąjį straipsnį keitimas: %s - Pašalintos nuorodos',
+    'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Removed link(s).',
+    'ko': u'로봇의 도움을 받아 동음이의 처리: [[%s]] - 링크 제거',
+    'nl': u'Robot-geholpen doorverwijzing: [[%s]] - Link(s) weggehaald.',
+    'no': u'bot: Retter lenke til peker: %s - Fjernet lenke(r)',
+    'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Usunięto link(i)',
+    'pt': u'Desambiguação assistida por bot: %s link(s) removido(s)',
+    'ru': u'Разрешение значений с помощью бота: %s - Removed link(s)',
+    'sr': u'Решавање вишезначних одредница помоћу бота: %s - Removed link(s)',
+    'sv': u'Länkar direkt till rätt artikel för: %s - Tog bort länk(ar)',
+    }
+
+# Summary message when working on redirects
+msg_redir = {
+    'ar': u'توضيح بمساعدة روبوت: %s - غير الوصلة أو الوصلات إلى %s',
+    'cs': u'Robot opravil přesměrování na %s - Změněn(y) odkaz(y) na %s',
+    'en': u'Robot-assisted disambiguation: %s - Changed link(s) to %s',
+    'da': u'Retter flertydigt link til: %s - Ændrede link(s) til %s',
+    'de': u'Bot-unterstützte Redirectauflösung: %s - Link(s) ersetzt durch %s',
+    'fi': u'Täsmennystä botin avulla: %s korvattiin link(e)illä %s',
+    'fr': u'Correction de lien vers redirect: %s - Modifications du (des) lien(s) pour %s',
+    'he': u'תיקון קישור לדף פירושונים באמצעות בוט: %s שונה ל%s',
+    'hu': u'Bottal végzett egyértelműsítés: %s –> %s',
+    'ia': u'Resolution de redirectiones assistite per robot: %s - Changed link(s) to %s',
+    'it': u'Sistemazione automatica del redirect: %s - Inversione di redirect %s',
+    'lt': u'Nuorodų į peradresavimo straipsnį keitimas: %s - Pakeistos nuorodos į %s',
+    'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Changed link(s) to %s',
+    'ko': u'로봇의 도움을 받아 동음이의 처리: [[%s]] - %s 문서로 링크 걸음',
+    'nl': u'Robot-geholpen redirect-oplossing: [[%s]] - Link(s) veranderd naar %s',
+    'no': u'bot: Endrer omdirigeringslenke: %s - Endret lenke(r) til %s',
+    'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Zmieniono link(i) %s',
+    'pt': u'Desambiguação assistida por bot: %s link(s) mudados para %s',
+    'ru': u'Разрешение значений с помощью бота: %s - Changed link(s) to %s',
+    'sr': u'Решавање вишезначних одредница помоћу бота: %s - Changed link(s) to %s',
+    'sv': u'Länkar direkt till rätt artikel för: %s - Bytte länk(ar) till %s',
+    }
+
+# Summary message when working on redirects and the link is removed
+msg_redir_unlink = {
+    'ar': u'توضيح بمساعدة روبوت: %s - أزال الوصلة أو الوصلات',
+    'cs': u'Robot opravil přesměrování na %s - Odstraněn(y) odkaz(y)',
+    'en': u'Robot-assisted disambiguation: %s - Removed link(s)',
+    'da': u'Retter flertydigt link til: %s - Fjernede link(s)',
+    'de': u'Bot-unterstützte Redirectauflösung: %s - Link(s) entfernt',
+    'fr': u'Correction de lien vers redirect: %s - Retrait du (des) lien(s)',
+    'fi': u'Täsmennystä botin avulla: %s - poistettiin linkkejä',
+    'he': u'הסרת קישור לדף פירושונים באמצעות בוט: %s',
+    'hu': u'Bottal támogatott egyértelműsítés: %s – hivatkozások eltávolítása',
+    'ia': u'Resolution de redirectiones assistite per robot: %s - Removed link(s).',
+    'it': u'Sistemazione automatica del redirect: %s - Collegamenti rimossi',
+    'lt': u'Nuorodų į peradresavimo straipsnį keitimas: %s - Pašalintos nuorodos',
+    'kk': u'Айрықты мағыналарды бот көмегімен шешу: %s - Removed link(s).',
+    'ko': u'로봇의 도움을 받아 동음이의 처리: [[%s]] - 링크 제거',
+    'nl': u'Robot-geholpen redirect-oplossing: [[%s]] - Link(s) weggehaald',
+    'no': u'bot: Endrer omdirigeringslenke: %s - Fjernet lenke(r)',
+    'pl': u'Wspomagane przez robota ujednoznacznienie: %s - Usunięto link(i)',
+    'pt': u'Desambiguação assistida por bot: %s link(s) removidos',
+    'ru': u'Разрешение значений с помощью бота: %s - Removed link(s)',
+    'sr': u'Решавање вишезначних одредница помоћу бота: %s - Removed link(s)',
+    'sv': u'Länkar direkt till rätt artikel för: %s - Tog bort länk(ar)',
+    }
+
+# Summary message to (unknown)
+unknown_msg = {
+    'ar' : u'(غير معروف)',
+    'en' : u'(unknown)',
+    'fi' : u'(tuntematon)',
+    'hu' : u'(ismeretlen)',
+    'pt' : u'(desconhecido)',
+    }
+
+# disambiguation page name format for "primary topic" disambiguations
+# (Begriffsklärungen nach Modell 2)
+primary_topic_format = {
+    'ar': u'%s_(توضيح)',
+    'cs': u'%s_(rozcestník)',
+    'de': u'%s_(Begriffsklärung)',
+    'en': u'%s_(disambiguation)',
+    'fi': u'%s_(täsmennyssivu)',
+    'hu': u'%s_(egyértelműsítő lap)',
+    'ia': u'%s_(disambiguation)',
+    'it': u'%s_(disambigua)',
+    'lt': u'%s_(reikšmės)',
+    'kk': u'%s_(айрық)',
+    'ko': u'%s_(동음이의)',
+    'nl': u'%s_(doorverwijspagina)',
+    'no': u'%s_(peker)',
+    'pl': u'%s_(ujednoznacznienie)',
+    'pt': u'%s_(desambiguação)',
+    'he': u'%s_(פירושונים)',
+    'ru': u'%s_(значения)',
+    'sr': u'%s_(вишезначна одредница)',
+    'sv': u'%s_(olika betydelser)',
+    }
+
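+# Pages that will be ignored even though they link to a disambiguation
+# page. An example is a page listing disambiguation articles.
+# Each entry is a regular expression that is matched (with re.match, i.e.
+# anchored at the start) against the title of the referring page.
+# Special chars should be encoded with unicode (\x##) and space used
+# instead of _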
+
+ignore_title = {
+    'wikipedia': {
+        'ar': [
+            u'تصنيف:صفحات توضيح',
+        ],
+        'cs': [
+            u'Wikipedie:Chybějící interwiki/.+',
+            u'Wikipedie:Rozcestníky',
+            u'Wikipedie diskuse:Rozcestníky',
+            u'Wikipedie:Seznam nejvíce odkazovaných rozcestníků',
+            u'Wikipedie:Seznam rozcestníků/první typ',
+            u'Wikipedie:Seznam rozcestníků/druhý typ',
+            u'Wikipedista:Zirland/okres',
+        ],
+        'da': [
+            u'Wikipedia:Links til sider med flertydige titler'
+        ],
+        'de': [
+            u'Benutzer:Katharina/Begriffsklärungen',
+            u'Benutzer:Kirschblut/.+buchstabenkürzel',
+            u'Benutzer:Noisper/Dingliste/[A-Z]',
+            u'Benutzer:SirJective/.+',
+            u'Benutzer:SrbBot/Index/.+',
+            u'Benutzer Diskussion:.+',
+            u'GISLexikon \([A-Z]\)',
+            u'Lehnwort',
+            u'Liste griechischer Wortstämme in deutschen Fremdwörtern',
+            u'Liste von Gräzismen',
+            u'Portal:Abkürzungen/.+',
+            u'Wikipedia:Archiv:.+',
+            u'Wikipedia:Artikelwünsche/Ding-Liste/[A-Z]',
+            u'Wikipedia:Begriffsklärung.*',
+            u'Wikipedia:Dreibuchstabenkürzel von [A-Z][A-Z][A-Z] bis [A-Z][A-Z][A-Z]',
+            u'Wikipedia:Interwiki-Konflikte',
+            u'Wikipedia:Kurze Artikel',
+            u'Wikipedia:Liste aller 2-Buchstaben-Kombinationen',
+            u'Wikipedia:Liste mathematischer Themen/BKS',
+            u'Wikipedia:Liste mathematischer Themen/Redirects',
+            u'Wikipedia:Löschkandidaten/.+',
+            u'Wikipedia:Qualitätsoffensive/UNO', #requested by Benutzer:Addicted
+            u'Wikipedia:WikiProjekt Altertumswissenschaft/.+',
+            u'Wikipedia:WikiProjekt Verwaiste Seiten/Begriffsklärungen',
+        ],
+         'en': [
+            u'Wikipedia:Links to disambiguating pages',
+            u'Wikipedia:Disambiguation pages with links',
+            u'Wikipedia:Multiple-place names \([A-Z]\)',
+            u'Wikipedia:Non-unique personal name',
+            u"User:Jerzy/Disambiguation Pages i've Editted",
+            u'User:Gareth Owen/inprogress',
+            u'TLAs from [A-Z][A-Z][A-Z] to [A-Z][A-Z][A-Z]',
+            u'List of all two-letter combinations',
+            u'User:Daniel Quinlan/redirects.+',
+            u'User:Oliver Pereira/stuff',
+            u'Wikipedia:French Wikipedia language links',
+            u'Wikipedia:Polish language links',
+            u'Wikipedia:Undisambiguated abbreviations/.+',
+            u'List of acronyms and initialisms',
+            u'Wikipedia:Usemod article histories',
+            u'User:Pizza Puzzle/stuff',
+            u'List of generic names of political parties',
+            u'Talk:List of initialisms/marked',
+            u'Talk:List of initialisms/sorted',
+            u'Talk:Programming language',
+            u'Talk:SAMPA/To do',
+            u"Wikipedia:Outline of Roget's Thesaurus",
+            u'User:Wik/Articles',
+            u'User:Egil/Sandbox',
+            u'Wikipedia talk:Make only links relevant to the context',
+            u'Wikipedia:Common words, searching for which is not possible',
+        ],
+        'fi': [
+            u'Wikipedia:Luettelo täsmennyssivuista',
+            u'Wikipedia:Luettelo (täsmennyssivuista)',
+            u'Wikipedia:Täsmennyssivu',
+        ],
+        'fr': [
+            u'Wikipédia:Liens aux pages d\'homonymie',
+            u'Wikipédia:Homonymie',
+            u'Wikipédia:Homonymie/Homonymes dynastiques',
+            u'Wikipédia:Prise de décision, noms des membres de dynasties/liste des dynastiens',
+            u'Liste de toutes les combinaisons de deux lettres',
+            u'Wikipédia:Log d\'upload/.*',
+            u'Sigles de trois lettres de [A-Z]AA à [A-Z]ZZ',
+            u'Wikipédia:Pages sans interwiki,.'
+        ],
+        'fy': [
+            u'Wikipedy:Fangnet',
+        ],
+        'ia': [
+            u'Categoria:Disambiguation',
+            u'Wikipedia:.+',
+            u'Usator:.+',
+            u'Discussion Usator:.+',
+        ],
+        'it': [
+            u'Aiuto:Disambigua/Disorfanamento',
+            u'Discussioni utente:.+',
+            u'Utente:Civvì/disorfanamento',
+        ],
+        'kk': [
+            u'Санат:Айрықты бет',
+        ],
+        'ko': [
+            u'위키백과:(동음이의) 문서의 목록',
+            u'위키백과:동음이의어 문서의 목록',
+        ],
+        'lt': [
+            u'Wikipedia:Rodomi nukreipiamieji straipsniai',
+        ],
+        'nl': [
+            u"Gebruiker:.*",
+            u"Overleg gebruiker:.+[aA]rchief.*",
+            u"Overleg gebruiker:Pven",
+            u"Portaal:.+[aA]rchief.*",
+            u"Wikipedia:Humor en onzin.*",
+            u"Wikipedia:Links naar doorverwijspagina's/Winkeldochters.*",
+            u"Wikipedia:Project aanmelding bij startpagina's",
+            u"Wikipedia:Wikiproject Roemeense gemeenten/Doorverwijspagina's",
+            u'Categorie:Doorverwijspagina',
+            u'Lijst van Nederlandse namen van pausen',
+            u'Overleg Wikipedia:Discussie spelling 2005',
+            u'Overleg Wikipedia:Doorverwijspagina',
+            u'Overleg Wikipedia:Logboek.*',
+            u'Wikipedia:Logboek.*',
+            u'Overleg gebruiker:Sybren/test.*',
+            u'Overleg gebruiker:[0-9][0-9]?[0-9]?\.[0-9][0-9]?[0-9]?\.[0-9][0-9]?[0-9]?\.[0-9][0-9]?[0-9]?',
+            u'Overleg:Lage Landen (staatkunde)',
+            u'Wikipedia:.*[aA]rchief.*',
+            u'Wikipedia:Doorverwijspagina',
+            u'Wikipedia:Lijst van alle tweeletter-combinaties',
+            u'Wikipedia:Onderhoudspagina',
+            u'Wikipedia:Ongelijke redirects',
+            u'Wikipedia:Protection log',
+            u'Wikipedia:Te verwijderen.*',
+            u'Wikipedia:Top 1000 van meest bekeken artikelen',
+            u'Wikipedia:Wikipedianen met een encyclopedisch artikel',
+            u'Wikipedia:Woorden die niet als zoekterm gebruikt kunnen worden',
+         ],
+        'pl': [
+            u'Wikipedysta:.+',
+            u'Dyskusja.+:.+',
+         ],
+        'pt': [
+            u'Usuário:.+',
+            u'Usuário Discussão:.+',
+            u'Discussão:.+',
+            u'Lista de combinações de duas letras',
+            u'Wikipedia:Lista de páginas de desambiguação.+',
+            u'Wikipedia:Páginas para eliminar/.+',
+        ],
+        'ru': [
+            u'Категория:Disambig',
+            u'Википедия:Страницы разрешения неоднозначностей',
+            u'Википедия:Вики-уборка/Статьи без языковых ссылок',
+            u'Википедия:Страницы с пометкой «(значения)»',
+            u'Список общерусских фамилий',
+        ],
+    },
+    'memoryalpha': {
+        'en': [
+            u'Memory Alpha:Links to disambiguating pages'
+        ],
+        'de': [
+            u'Memory Alpha:Liste der Wortklärungsseiten'
+        ],
+    },
+}
+
+def firstcap(string):
+    """
+    Return string with its first character converted to upper case.
+
+    The remaining characters are left untouched. An empty string is
+    returned unchanged.
+    """
+    # Slicing instead of indexing avoids an IndexError on empty input.
+    return string[:1].upper() + string[1:]
+
+def correctcap(link, text):
+    """
+    Return the title of link in the capitalization that text links to.
+
+    Parameters:
+        link - An object with a title() method
+        text - The wiki text that is searched for links
+    If text contains a link to the title with a lower-case first letter
+    (either [[title]] or [[title|...), that lower-case variant is returned.
+    Otherwise the title is returned exactly as link.title() gives it.
+    """
+    capitalized = link.title()
+    uncapitalized = capitalized[0].lower() + capitalized[1:]
+    # A plain link and the start of a piped link both count as a hit.
+    for linkFormat in ("[[%s]]", "[[%s|"):
+        if (linkFormat % uncapitalized) in text:
+            return uncapitalized
+    return capitalized
+
+class ReferringPageGeneratorWithIgnore:
+    """
+    Iterable over the pages that refer to a disambiguation page.
+
+    A referring page is left out when its title matches one of the
+    ignore_title patterns for the page's site, or when the
+    PrimaryIgnoreManager reports it as ignored.
+    """
+    def __init__(self, disambPage, primary=False, minimum = 0):
+        """
+        Parameters:
+            disambPage - The page whose referring pages are wanted
+            primary    - Passed to PrimaryIgnoreManager as its 'enabled' flag
+            minimum    - If fewer referring pages than this remain after
+                         filtering, none of them is yielded
+        """
+        self.disambPage = disambPage
+        # if run with the -primary argument, enable the ignore manager
+        self.primaryIgnoreManager = PrimaryIgnoreManager(disambPage,
+                                                         enabled=primary)
+        self.minimum = minimum
+
+    def __iter__(self):
+        """Yield every referring page that is not ignored."""
+        # TODO: start yielding before all referring pages have been found
+        refs = [page for page in self.disambPage.getReferences(follow_redirects = False, withTemplateInclusion = False)]
+        wikipedia.output(u"Found %d references." % len(refs))
+        # Remove ignorables
+        site = self.disambPage.site()
+        # Title patterns for this family and language; empty if none exist.
+        patterns = ignore_title.get(site.family.name, {}).get(site.lang, [])
+        remaining = []
+        for ref in refs:
+            title = ref.title()
+            titleIgnored = False
+            for ig in patterns:
+                if re.match(ig, title):
+                    titleIgnored = True
+                    break
+            if titleIgnored:
+                if wikipedia.verbose:
+                    wikipedia.output('Ignoring page %s' % title)
+            elif not self.primaryIgnoreManager.isIgnored(ref):
+                # The skip list is consulted for every page, so it also
+                # takes effect on sites without any ignore_title patterns.
+                remaining.append(ref)
+        refs = remaining
+        if len(refs) < self.minimum:
+            wikipedia.output(u"Found only %d pages to work on; skipping." % len(refs))
+            return
+        wikipedia.output(u"Will work on %d pages." % len(refs))
+        for ref in refs:
+            yield ref
+
+class PrimaryIgnoreManager(object):
+    '''
+    If run with the -primary argument, reads from a file which pages should
+    not be worked on; these are the ones where the user pressed n last time.
+    If run without the -primary argument, doesn't ignore any pages.
+    '''
+    def __init__(self, disambPage, enabled = False):
+        '''
+        Parameters:
+            disambPage - The disambiguation page the ignore list belongs to
+            enabled    - If False, isIgnored() always returns False and
+                         ignore() does nothing
+        A missing or unreadable ignore file results in an empty list.
+        '''
+        self.disambPage = disambPage
+        self.enabled = enabled
+
+        self.ignorelist = []
+        try:
+            f = codecs.open(self._filename(), 'r', 'utf-8')
+        except IOError:
+            # No ignore list has been saved for this page yet.
+            return
+        try:
+            try:
+                for line in f.readlines():
+                    # remove trailing newlines and carriage returns;
+                    # rstrip() is safe for lines that become empty
+                    line = line.rstrip('\r\n')
+                    # skip empty lines
+                    if line != '':
+                        self.ignorelist.append(line)
+            except IOError:
+                pass
+        finally:
+            f.close()
+
+    def _filename(self):
+        # Path of the ignore list in the disambiguations/ subdir. Reading
+        # and writing share it so that saved entries are found again.
+        return wikipedia.config.datafilepath('disambiguations',
+                self.disambPage.titleForFilename() + '.txt')
+
+    def isIgnored(self, refPage):
+        '''
+        Return True if the manager is enabled and refPage.urlname() is in
+        the ignore list.
+        '''
+        return self.enabled and refPage.urlname() in self.ignorelist
+
+    def ignore(self, refPage):
+        '''
+        If enabled, append refPage.urlname() to the ignore file so that the
+        page is skipped next time. Failure to write is reported, not raised.
+        '''
+        if self.enabled:
+            # Skip this occurrence next time.
+            try:
+                # Open file for appending. If none exists yet, create a new one.
+                f = codecs.open(self._filename(), 'a', 'utf-8')
+                try:
+                    f.write(refPage.urlname() + '\n')
+                finally:
+                    f.close()
+            except IOError:
+                # Best effort only: report the problem and carry on.
+                wikipedia.output(u'Could not update the ignore list for %s'
+                                 % self.disambPage.title())
+
+
+class DisambiguationRobot(object):
+    # Regular expressions, per language code, that mark a referring page as
+    # one to leave alone. setupRegexes() compiles the entries for the
+    # current language, and checkContents() searches the page text for them.
+    ignore_contents = {
+        'de':(u'{{[Ii]nuse}}',
+              u'{{[Ll]öschen}}',
+            ),
+        'fi':(u'{{[Tt]yöstetään}}',
+            ),
+        'kk':(u'{{[Ii]nuse}}',
+              u'{{[Pp]rocessing}}',
+            ),
+        'nl':(u'{{wiu2}}',
+              u'{{nuweg}}',
+            ),
+        'ru':(u'{{[Ii]nuse}}',
+              u'{{[Pp]rocessing}}',
+            ),
+    }
+    
+    # Template name per language code.
+    # NOTE(review): presumably the template that marks redirects to a
+    # primary-topic disambiguation page; its use is not visible here.
+    primary_redir_template = {
+        # Page.templates() format, first letter uppercase
+        'hu': u'Egyért-redir',
+    }
+    
+    def __init__(self, always, alternatives, getAlternatives, generator, primary, main_only, minimum = 0):
+        """
+        Parameters:
+            always          - If true, used as the answer to every prompt
+                              in treat() instead of asking the user
+            alternatives    - List of possible replacement targets
+            getAlternatives - Stored for later use (presumably whether
+                              alternatives are read from the page itself)
+            generator       - Stored for later use (presumably yields the
+                              disambiguation pages to work on)
+            primary         - Whether to use the ignore list of pages that
+                              were skipped with 'n'
+            main_only       - Stored for later use (see -main option)
+            minimum         - Stored for later use (see -min option)
+        """
+        # Site and language have to be known before the regexes are built.
+        self.mysite = wikipedia.getSite()
+        self.mylang = self.mysite.language()
+        self.comment = None
+
+        # Plain copies of the constructor arguments.
+        self.minimum = minimum
+        self.main_only = main_only
+        self.primary = primary
+        self.generator = generator
+        self.getAlternatives = getAlternatives
+        self.alternatives = alternatives
+        self.always = always
+
+        self.setupRegexes()
+
+    def checkContents(self, text):
+        '''
+        Search text with each regular expression that setupRegexes()
+        compiled from the ignore_contents dictionary.
+
+        Returns the substring matched by the first expression that
+        matches anywhere in text, or None if no expression matches.
+        '''
+        hits = (regex.search(text) for regex in self.ignore_contents_regexes)
+        for hit in hits:
+            if hit:
+                return hit.group()
+        return None
+
+    def makeAlternativesUnique(self):
+        """
+        Remove duplicate entries from self.alternatives.
+
+        The first occurrence of each alternative is kept, in its original
+        position, and the result is always a list.
+        """
+        seen = set()
+        unique = []
+        for alternative in self.alternatives:
+            if alternative not in seen:
+                seen.add(alternative)
+                unique.append(alternative)
+        self.alternatives = unique
+
+    def listAlternatives(self):
+        """
+        Output the alternatives, one per line, each preceded by its index.
+
+        The index is the position in self.alternatives, starting at 0.
+        """
+        lines = [u"%3i - %s\n" % (number, alternative)
+                 for number, alternative in enumerate(self.alternatives)]
+        wikipedia.output(u'\n' + u''.join(lines))
+
+    def setupRegexes(self):
+        """
+        Compile the regular expressions this robot works with.
+
+        Sets:
+            ignore_contents_regexes - The compiled ignore_contents entries
+                                      for self.mylang; empty if there are none
+            trailR - The compiled link trail pattern of self.mysite
+            linkR  - The pattern that finds wiki links, see below
+        """
+        patterns = self.ignore_contents.get(self.mylang, ())
+        self.ignore_contents_regexes = [re.compile(pattern) for pattern in patterns]
+
+        trail = self.mysite.linktrail()
+        self.trailR = re.compile(trail)
+        # linkR finds links. Its named groups are:
+        #   title     - the target page title: everything before | or ].
+        #   section   - the page section, including the leading #.
+        #   label     - the alternative link title: everything between | and ].
+        #   linktrail - the letters after ]] which are part of the word. What
+        #               counts as a 'letter' varies from language to language.
+        self.linkR = re.compile(
+            r'\[\[(?P<title>[^\]\|#]*)'
+            r'(?P<section>#[^\]\|]*)?'
+            r'(\|(?P<label>[^\]]*))?'
+            r'\]\](?P<linktrail>' + trail + ')')
+
+    def treat(self, refPage, disambPage):
+        """
+        Parameters:
+            disambPage - The disambiguation page or redirect we don't want anything
+                     to link on
+            refPage - A page linking to disambPage
+        Returns False if the user pressed q to completely quit the program.
+        Otherwise, returns True.
+        """
+        # TODO: break this function up into subroutines!
+
+        include = False
+        unlink = False
+        new_targets = []
+        try:
+            text=refPage.get(throttle=False)
+            ignoreReason = self.checkContents(text)
+            if ignoreReason:
+                wikipedia.output('\n\nSkipping %s because it contains %s.\n\n' % (refPage.title(), ignoreReason))
+            else:
+                include = True
+        except wikipedia.IsRedirectPage:
+            wikipedia.output(u'%s is a redirect to %s' % (refPage.title(), disambPage.title()))
+            if disambPage.isRedirectPage():
+                target = self.alternatives[0]
+                choice = wikipedia.inputChoice(u'Do you want to make redirect %s point to %s?' % (refPage.title(), target), ['yes', 'no'], ['y', 'N'], 'N')
+                if choice == 'y':
+                    redir_text = '#%s [[%s]]' % (self.mysite.redirect(default=True), target)
+                    try:
+                        refPage.put_async(redir_text,comment=self.comment)
+                    except wikipedia.PageNotSaved, error:
+                        wikipedia.output(u'Page not saved: %s' % error.args)
+            else:
+                choice = wikipedia.inputChoice(u'Do you want to work on pages linking to %s?' % refPage.title(), ['yes', 'no', 'change redirect'], ['y', 'N', 'c'], 'N')
+                if choice == 'y':
+                    gen = ReferringPageGeneratorWithIgnore(refPage, self.primary)
+                    preloadingGen = pagegenerators.PreloadingGenerator(gen)
+                    for refPage2 in preloadingGen:
+                        # run until the user selected 'quit'
+                        if not self.treat(refPage2, refPage):
+                            break
+                elif choice == 'c':
+                    text=refPage.get(throttle=False,get_redirect=True)
+                    include = "redirect"
+        except wikipedia.NoPage:
+            wikipedia.output(u'Page [[%s]] does not seem to exist?! Skipping.' % refPage.title())
+            include = False
+        if include in (True, "redirect"):
+            # make a backup of the original text so we can show the changes later
+            original_text = text
+            n = 0
+            curpos = 0
+            edited = False
+            # This loop will run until we have finished the current page
+            while True:
+                m = self.linkR.search(text, pos = curpos)
+                if not m:
+                    if n == 0:
+                        wikipedia.output(u"No changes necessary in %s" % refPage.title())
+                        return True
+                    else:
+                        # stop loop and save page
+                        break
+                # Make sure that next time around we will not find this same hit.
+                curpos = m.start() + 1
+                # ignore interwiki links and links to sections of the same page
+                if m.group('title') == '' or self.mysite.isInterwikiLink(m.group('title')):
+                    continue
+                else:
+                    try:
+                        linkPage = wikipedia.Page(disambPage.site(), m.group('title'))
+                        # Check whether the link found is to disambPage.
+                    except wikipedia.InvalidTitle:
+                        continue
+                    if linkPage != disambPage:
+                        continue
+
+                n += 1
+                # how many bytes should be displayed around the current link
+                context = 60
+                # This loop will run while the user doesn't choose an option
+                # that will actually change the page
+                while True:
+                    # Show the title of the page where the link was found.
+                    # Highlight the title in purple.
+                    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % refPage.title())
+
+                    # at the beginning of the link, start red color.
+                    # at the end of the link, reset the color to default
+                    wikipedia.output(text[max(0, m.start() - context) : m.start()] + '\03{lightred}' + text[m.start() : m.end()] + '\03{default}' + text[m.end() : m.end() + context])
+
+                    if not self.always:
+                        if edited:
+                            choice = wikipedia.input(u"Option (#, r#, s=skip link, e=edit page, n=next page, u=unlink, q=quit\n"
+                                               "        m=more context, l=list, a=add new, x=save in this form):")
+                        else:
+                            choice = wikipedia.input(u"Option (#, r#, s=skip link, e=edit page, n=next page, u=unlink, q=quit\n"
+                                               "        m=more context, d=show disambiguation page, l=list, a=add new):")
+                    else:
+                        choice = self.always
+                    if choice in ['a', 'A']:
+                        newAlternative = wikipedia.input(u'New alternative:')
+                        self.alternatives.append(newAlternative)
+                        self.listAlternatives()
+                    elif choice in ['e', 'E']:
+                        editor = editarticle.TextEditor()
+                        newText = editor.edit(text, jumpIndex = m.start(), highlight = disambPage.title())
+                        # if user didn't press Cancel
+                        if newText and newText != text:
+                            text = newText
+                            break
+                    elif choice in ['d', 'D']:
+                        editor = editarticle.TextEditor()
+                        if disambPage.isRedirectPage():
+                            disambredir = disambPage.getRedirectTarget()
+                            disambigText = editor.edit(disambredir.get(), jumpIndex = m.start(), highlight = disambredir.title())
+                        else:
+                            disambigText = editor.edit(disambPage.get(), jumpIndex = m.start(), highlight = disambPage.title())
+                    elif choice in ['l', 'L']:
+                        self.listAlternatives()
+                    elif choice in ['m', 'M']:
+                        # show more text around the link we're working on
+                        context *= 2
+                    else:
+                        break
+
+                if choice in ['e', 'E']:
+                    # user has edited the page and then pressed 'OK'
+                    edited = True
+                    curpos = 0
+                    continue
+                elif choice in ['n', 'N']:
+                    # skip this page
+                    if self.primary:
+                        # If run with the -primary argument, skip this occurrence next time.
+                        self.primaryIgnoreManager.ignore(refPage)
+                    return True
+                elif choice in ['q', 'Q']:
+                    # quit the program
+                    return False
+                elif choice in ['s', 'S']:
+                    # Next link on this page
+                    n -= 1
+                    continue
+                elif choice in ['x', 'X'] and edited:
+                    # Save the page as is
+                    break
+
+                # The link looks like this:
+                # [[page_title|link_text]]trailing_chars
+                page_title = m.group('title')
+                link_text = m.group('label')
+
+                if not link_text:
+                    # or like this: [[page_title]]trailing_chars
+                    link_text = page_title
+                if m.group('section') == None:
+                    section = ''
+                else:
+                    section = m.group('section')
+                trailing_chars = m.group('linktrail')
+                if trailing_chars:
+                    link_text += trailing_chars
+
+                if choice in ['u', 'U']:
+                    # unlink - we remove the section if there's any
+                    text = text[:m.start()] + link_text + text[m.end():]
+                    unlink = True
+                    continue
+                else:
+                    if len(choice)>0 and choice[0] == 'r':
+                        # we want to throw away the original link text
+                        replaceit = True
+                        choice = choice[1:]
+                    elif include == "redirect":
+                        replaceit = True
+                    else:
+                        replaceit = False
+
+                    try:
+                        choice=int(choice)
+                    except ValueError:
+                        wikipedia.output(u"Unknown option")
+                        # step back to ask the user again what to do with the current link
+                        curpos -= 1
+                        continue
+                    if choice >= len(self.alternatives) or choice < 0:
+                        wikipedia.output(u"Choice out of range. Please select a number between 0 and %i." % (len(self.alternatives) - 1))
+                        # show list of possible choices
+                        self.listAlternatives()
+                        # step back to ask the user again what to do with the current link
+                        curpos -= 1
+                        continue
+                    new_page_title = self.alternatives[choice]
+                    repPl = wikipedia.Page(disambPage.site(), new_page_title)
+                    if (new_page_title[0].isupper()) or (link_text[0].isupper()):
+                        new_page_title = repPl.title()
+                    else:
+                        new_page_title = repPl.title()
+                        new_page_title = new_page_title[0].lower() + new_page_title[1:]
+                    if new_page_title not in new_targets:
+                        new_targets.append(new_page_title)
+                    if replaceit and trailing_chars:
+                        newlink = "[[%s%s]]%s" % (new_page_title, section, trailing_chars)
+                    elif replaceit or (new_page_title == link_text and not section):
+                        newlink = "[[%s]]" % new_page_title
+                    # check if we can create a link with trailing characters instead of a pipelink
+                    elif len(new_page_title) <= len(link_text) and firstcap(link_text[:len(new_page_title)]) == firstcap(new_page_title) and re.sub(self.trailR, '', link_text[len(new_page_title):]) == '' and not section:
+                        newlink = "[[%s]]%s" % (link_text[:len(new_page_title)], link_text[len(new_page_title):])
+                    else:
+                        newlink = "[[%s%s|%s]]" % (new_page_title, section, link_text)
+                    text = text[:m.start()] + newlink + text[m.end():]
+                    continue
+
+                wikipedia.output(text[max(0,m.start()-30):m.end()+30])
+            if text == original_text:
+                wikipedia.output(u'\nNo changes have been made:\n')
+            else:
+                wikipedia.output(u'\nThe following changes have been made:\n')
+                wikipedia.showDiff(original_text, text)
+                wikipedia.output(u'')
+                # save the page
+                self.setSummaryMessage(disambPage, new_targets, unlink)
+                try:
+                    refPage.put_async(text,comment=self.comment)
+                except wikipedia.LockedPage:
+                    wikipedia.output(u'Page not saved: page is locked')
+                except wikipedia.PageNotSaved, error:
+                    wikipedia.output(u'Page not saved: %s' % error.args)
+        return True
+
+    def findAlternatives(self, disambPage):
+        if disambPage.isRedirectPage() and not self.primary:
+            if self.primary_redir_template.has_key(disambPage.site().lang) and self.primary_redir_template[disambPage.site().lang] in disambPage.templates(get_redirect = True):
+                baseTerm = disambPage.title()
+                for template in disambPage.templatesWithParams(get_redirect = True):
+                    if template[0] == self.primary_redir_template[disambPage.site().lang] and len(template[1]) > 0:
+                        baseTerm = template[1][1]
+                disambTitle = primary_topic_format[self.mylang] % baseTerm
+                try:
+                    disambPage2 = wikipedia.Page(self.mysite, disambTitle)
+                    links = disambPage2.linkedPages()
+                    links = [correctcap(l,disambPage2.get()) for l in links]
+                except wikipedia.NoPage:
+                    wikipedia.output(u"No page at %s, using redirect target." % disambTitle)
+                    links = disambPage.linkedPages()[:1]
+                    links = [correctcap(l,disambPage.get(get_redirect = True)) for l in links]
+                self.alternatives += links
+            else:
+                try:
+                    target = disambPage.getRedirectTarget().title()
+                    self.alternatives.append(target)
+                except wikipedia.NoPage:
+                    wikipedia.output(u"The specified page was not found.")
+                    user_input = wikipedia.input(u"""\
+Please enter the name of the page where the redirect should have pointed at,
+or press enter to quit:""")
+                    if user_input == "":
+                        sys.exit(1)
+                    else:
+                        self.alternatives.append(user_input)
+                except wikipedia.IsNotRedirectPage:
+                    wikipedia.output(
+                        u"The specified page is not a redirect. Skipping.")
+                    return False
+        elif self.getAlternatives:
+            try:
+                if self.primary:
+                    try:
+                        disambPage2 = wikipedia.Page(self.mysite,
+                                        primary_topic_format[self.mylang]
+                                            % disambPage.title()
+                                    )
+                        links = disambPage2.linkedPages()
+                        links = [correctcap(l,disambPage2.get()) for l in links]
+                    except wikipedia.NoPage:
+                        wikipedia.output(u"Page does not exist, using the first link in page %s." % disambPage.title())
+                        links = disambPage.linkedPages()[:1]
+                        links = [correctcap(l,disambPage.get()) for l in links]
+                else:
+                    try:
+                        links = disambPage.linkedPages()
+                        links = [correctcap(l,disambPage.get()) for l in links]
+                    except wikipedia.NoPage:
+                        wikipedia.output(u"Page does not exist, skipping.")
+                        return False
+            except wikipedia.IsRedirectPage:
+                wikipedia.output(u"Page is a redirect, skipping.")
+                return False
+            self.alternatives += links
+        return True
+
+    def setSummaryMessage(self, disambPage, new_targets = [], unlink = False):
+        # make list of new targets
+        targets = ''
+        for page_title in new_targets:
+            targets += u'[[%s]], ' % page_title
+        # remove last comma
+        targets = targets[:-2]
+
+        if not targets:
+            targets = wikipedia.translate(self.mysite, unknown_msg)
+
+        # first check whether user has customized the edit comment
+        if wikipedia.config.disambiguation_comment.has_key(self.mysite.family.name)  and wikipedia.config.disambiguation_comment[self.mysite.family.name].has_key(self.mylang):
+            try:
+                self.comment = wikipedia.translate(self.mysite,
+                                wikipedia.config.disambiguation_comment[
+                                self.mysite.family.name]
+                                ) % (disambPage.title(), targets)
+            #Backwards compatibility, type error probably caused by too many arguments for format string
+            except TypeError:
+                self.comment = wikipedia.translate(self.mysite,
+                                wikipedia.config.disambiguation_comment[
+                                self.mysite.family.name]
+                                ) % disambPage.title()
+        elif disambPage.isRedirectPage():
+            # when working on redirects, there's another summary message
+            if unlink and not new_targets:
+                self.comment = wikipedia.translate(self.mysite, msg_redir_unlink) % disambPage.title()
+            else:
+                self.comment = wikipedia.translate(self.mysite, msg_redir) % (disambPage.title(), targets)
+        else:
+            if unlink and not new_targets:
+                self.comment = wikipedia.translate(self.mysite, msg_unlink) % disambPage.title()
+            else:
+                self.comment = wikipedia.translate(self.mysite, msg) % (disambPage.title(), targets)
+
+    def run(self):
+        if self.main_only:
+            if not ignore_title.has_key(self.mysite.family.name):
+                ignore_title[self.mysite.family.name] = {}
+            if not ignore_title[self.mysite.family.name].has_key(self.mylang):
+                ignore_title[self.mysite.family.name][self.mylang] = []
+            ignore_title[self.mysite.family.name][self.mylang] += [
+                u'%s:' % namespace for namespace in self.mysite.namespaces()]
+
+        for disambPage in self.generator:
+            self.primaryIgnoreManager = PrimaryIgnoreManager(disambPage, enabled=self.primary)
+
+            if not self.findAlternatives(disambPage):
+                continue
+
+            self.makeAlternativesUnique()
+            # sort possible choices
+            if wikipedia.config.sort_ignore_case:
+                self.alternatives.sort(lambda x,y: cmp(x.lower(), y.lower()))
+            else:
+                self.alternatives.sort()
+            self.listAlternatives()
+
+            gen = ReferringPageGeneratorWithIgnore(disambPage, self.primary, minimum = self.minimum)
+            preloadingGen = pagegenerators.PreloadingGenerator(gen)
+            for refPage in preloadingGen:
+                if not self.primaryIgnoreManager.isIgnored(refPage):
+                    # run until the user selected 'quit'
+                    if not self.treat(refPage, disambPage):
+                        break
+
+            # clear alternatives before working on next disambiguation page
+            self.alternatives = []
+
+def main():
+    # the option that's always selected when the bot wonders what to do with
+    # a link. If it's None, the user is prompted (default behaviour).
+    always = None
+    alternatives = []
+    getAlternatives = True
+    # if the -file argument is used, page titles are dumped in this array.
+    # otherwise it will only contain one page.
+    generator = None
+    # This temporary array is used to read the page title if one single
+    # page to work on is specified by the arguments.
+    pageTitle = []
+    primary = False
+    main_only = False
+
+    # For sorting the linked pages, case can be ignored
+    ignoreCase = False
+    minimum = 0
+
+    for arg in wikipedia.handleArgs():
+        if arg.startswith('-primary:'):
+            primary = True
+            getAlternatives = False
+            alternatives.append(arg[9:])
+        elif arg == '-primary':
+            primary = True
+        elif arg.startswith('-always:'):
+            always = arg[8:]
+        elif arg.startswith('-file'):
+            if len(arg) == 5:
+                generator = pagegenerators.TextfilePageGenerator(filename = None)
+            else:
+                generator = pagegenerators.TextfilePageGenerator(filename = arg[6:])
+        elif arg.startswith('-pos:'):
+            if arg[5]!=':':
+                mysite = wikipedia.getSite()
+                page = wikipedia.Page(mysite, arg[5:])
+                if page.exists():
+                    alternatives.append(page.title())
+                else:
+                    answer = wikipedia.inputChoice(u'Possibility %s does not actually exist. Use it anyway?'
+                             % page.title(), ['yes', 'no'], ['y', 'N'], 'N')
+                    if answer == 'y':
+                        alternatives.append(page.title())
+            else:
+                alternatives.append(arg[5:])
+        elif arg == '-just':
+            getAlternatives = False
+        elif arg == '-main':
+            main_only = True
+        elif arg.startswith('-min:'):
+            minimum = int(arg[5:])
+        elif arg.startswith('-start'):
+            try:
+                if len(arg) <= len('-start:'):
+                    generator = pagegenerators.CategorizedPageGenerator(wikipedia.getSite().disambcategory())
+                else:
+                    generator = pagegenerators.CategorizedPageGenerator(wikipedia.getSite().disambcategory(), start = arg[7:])
+                generator = pagegenerators.NamespaceFilterPageGenerator(generator, [0])
+            except wikipedia.NoPage:
+                print "Disambiguation category for your wiki is not known."
+                raise
+        elif arg.startswith("-"):
+            print "Unrecognized command line argument: %s" % arg
+            # show help text and exit
+            wikipedia.showHelp()
+        else:
+            pageTitle.append(arg)
+
+    # if the disambiguation page is given as a command line argument,
+    # connect the title's parts with spaces
+    if pageTitle != []:
+        pageTitle = ' '.join(pageTitle)
+        page = wikipedia.Page(wikipedia.getSite(), pageTitle)
+        generator = iter([page])
+
+    # if no disambiguation pages was given as an argument, and none was
+    # read from a file, query the user
+    if not generator:
+        pageTitle = wikipedia.input(u'On which disambiguation page do you want to work?')
+        page = wikipedia.Page(wikipedia.getSite(), pageTitle)
+        generator = iter([page])
+
+    bot = DisambiguationRobot(always, alternatives, getAlternatives, generator, primary, main_only, minimum = minimum)
+    bot.run()
+
+
+
+if __name__ == "__main__":
+    # Run the script only when executed directly.  The finally clause makes
+    # sure wikipedia.stopme() is called even if main() raises or exits.
+    try:
+        main()
+    finally:
+        wikipedia.stopme()





More information about the Pywikipedia-l mailing list