jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/372064 )
Change subject: [IMPR] Exclude links in disambiguation templates from possibilities
......................................................................
[IMPR] Exclude links in disambiguation templates from possibilities
Bug: T118719
Change-Id: I7f6762f9b7ebda4bbf04598fa15ba4f82266c9c6
---
M scripts/solve_disambiguation.py
A tests/solve_disambiguation_tests.py
2 files changed, 130 insertions(+), 7 deletions(-)
Approvals:
jenkins-bot: Verified
Xqt: Looks good to me, approved
diff --git a/scripts/solve_disambiguation.py b/scripts/solve_disambiguation.py
index 54efe23..7169f6d 100755
--- a/scripts/solve_disambiguation.py
+++ b/scripts/solve_disambiguation.py
@@ -71,7 +71,7 @@
# (C) Daniel Herding, 2004
# (C) Andre Engels, 2003-2004
# (C) WikiWichtel, 2004
-# (C) Pywikibot team, 2003-2017
+# (C) Pywikibot team, 2003-2018
#
# Distributed under the terms of the MIT license.
#
@@ -85,7 +85,7 @@
import pywikibot
from pywikibot import editor as editarticle
from pywikibot.tools import first_lower, first_upper as firstcap
-from pywikibot import pagegenerators, config, i18n
+from pywikibot import pagegenerators, config, i18n, textlib
from pywikibot.bot import (
Bot, QuitKeyboardInterrupt,
StandardOption, HighlightContextOption, ListOption, OutputProxyOption,
@@ -96,6 +96,53 @@
dn_template = {
'en': u'{{dn}}',
'fr': u'{{Lien vers un homonyme}}',
+}
+
+# Regexes of disambiguation template titles to exclude links from
+disamb_templates = {
+ 'wikipedia': {
+ 'bs': [r'[Čč]vor', r'[Dd]isambig'],
+ 'cs': [r'[Rr]ozcestník', r'[Rr]ozcestník[ _]-[ _][^\}]+'],
+ 'en': [r'[Dd]isambig-plants', r'[Dd]isambig(uation)?',
+ r'[Dd]isambiguation[ _]cleanup', r'[Gg]eodis',
+ r'[Hh]ndis-cleanup',
+ r'[Ll]etter-Number[ _]Combination[ _]Disambiguation',
+ r'[Mm]il-unit-dis', r'[Nn]umberdis', r'.+?[ _]disambiguation'],
+ 'haw': [r'[Hh]uaʻōlelo[ _]puana[ _]like'],
+ 'hr': [r'[Rr]azdvojba', r'[Dd]isambig'],
+ 'no': [r'[Pp]eker', r'[Ee]tternavn', r'[Dd]isambig',
+ r'[Tt]obokstavsforkortelse', r'[Tt]rebokstavsforkortelse',
+ r'[Ff]lertydig', r'[Pp]ekerside'],
+ 'nov': [r'[Dd]esambig'],
+ 'qr': [r"[Ss]ut'ichana[ _]qillqa", r'[Dd]isambig', r'SJM'],
+ 'rmy': [r'[Dd]udalipen'],
+ 'sk': [r'[Dd]isambig', r'[Rr]ozlišovacia[ _]stránka',
+ r'[Dd]isambiguation'],
+ 'sr': [r'[Dd]isambig(uation)?', r'ВЗО', r'[Вв]зо', r'[Вв]ишезначна',
+ r'[Вв]ишезначна[ _]одредница', r'[Вв]ишезначност',
+ r'[Vv]išeznačna[ _]odrednica-lat'],
+ 'tg': [r'Ибҳомзудоӣ', r'[Dd]isambig', r'Рафъи[ _]ибҳом',
+ r'[Dd]isambiguation'],
+ 'tr': [r'[Aa]nlam[ _]ayrım', r'[Dd]isambig', r'[Aa]nlam[ _]ayrımı',
+ r'[Kk]işi[ _]adları[ _]\(anlam[ _]ayrımı\)',
+ r'[Yy]erleşim[ _]yerleri[ _]\(anlam[ _]ayrımı\)',
+ r'[Kk]ısaltmalar[ _]\(anlam[ _]ayrımı\)',
+ r'[Cc]oğrafya[ _]\(anlam[ _]ayrımı\)',
+ r'[Yy]erleşim[ _]yerleri[ _]\(anlam[ _]ayrımı\)',
+ r'[Ss]ayılar[ _]\(anlam[ _]ayrımı\)',
+ r"ABD'deki[ _]iller[ _]\(anlam[ _]ayrımı\)"],
+ 'wo': [r'[Bb]okktekki'],
+ 'yi': [r'באדייטען'],
+ 'zea': [r'[Dd]p', r'[Dd]eurverwiespagina'],
+ 'zh-classical': [r'釋義', r'消歧義', r'[Dd]isambig'],
+ },
+ 'loveto': {
+ '1911': [r'[Dd]isamb'],
+ },
+ 'wowwiki': {
+ 'en': [r'[Dd]isambig', r'[Dd]isambig\/quest', r'[Dd]isambig\/quest2',
+ r'[Dd]isambig\/achievement2'],
+ },
}
# disambiguation page name format for "primary topic" disambiguations
@@ -989,6 +1036,35 @@
pywikibot.output(u'Page not saved: %s' % error.args)
return 'done'
+ def get_disambiguation_links(self, disambPage):
+ """Get links from disambPage excluding links from disamb_templates.
+
+ @param disambPage: the disambiguation page
+ @type disambPage: pywikibot.Page
+ @return: list of processed links
+ @rtype: list of str
+
+ """
+ site_disamb_templates = i18n.translate(self.site, disamb_templates)
+ if site_disamb_templates:
+ exceptions = ['nowiki', 'comment', 'category', 'file', 'interwiki']
+ stripped_text = disambPage.text
+ exc_regexes = textlib._get_regexes(exceptions, self.site)
+ for exc in exc_regexes:
+ stripped_text = exc.sub(r'', stripped_text)
+ for template in site_disamb_templates:
+ template_regex = re.compile(
+ r'\{\{ *(?:' + r':|'.join(self.site.namespaces[10]) +
+ r':)?' + template + r'\s*(\|[^\}]*)?\}\}'
+ )
+ stripped_text = template_regex.sub(r'', stripped_text)
+ disambPage.text = stripped_text
+ full_text = disambPage.expand_text()
+ links = re.findall(r'\[\[([^\]\|]+)(?:\|[^\]]*|)\]\]', full_text)
+ else:
+ links = disambPage.linkedPages()
+ return links
+
def findAlternatives(self, disambPage):
"""Extend self.alternatives using correctcap of
disambPage.linkedPages.
@@ -1013,12 +1089,12 @@
try:
disambPage2 = pywikibot.Page(
pywikibot.Link(disambTitle, self.mysite))
- links = disambPage2.linkedPages()
+ links = self.get_disambiguation_links(disambPage2)
links = [correctcap(l, disambPage2.get()) for l in links]
except pywikibot.NoPage:
pywikibot.output(u"No page at %s, using redirect target."
% disambTitle)
- links = disambPage.linkedPages()[:1]
+ links = self.get_disambiguation_links(disambPage)[:1]
links = [correctcap(l, disambPage.get(get_redirect=True))
for l in links]
self.alternatives += links
@@ -1049,19 +1125,19 @@
primary_topic_format[self.mylang]
% disambPage.title(),
self.mysite))
- links = disambPage2.linkedPages()
+ links = self.get_disambiguation_links(disambPage2)
links = [correctcap(l, disambPage2.get())
for l in links]
except pywikibot.NoPage:
pywikibot.output(
'Page does not exist; using first link in page %s.'
% disambPage.title())
- links = disambPage.linkedPages()[:1]
+ links = self.get_disambiguation_links(disambPage)[:1]
links = [correctcap(l, disambPage.get())
for l in links]
else:
try:
- links = disambPage.linkedPages()
+ links = self.get_disambiguation_links(disambPage)
links = [correctcap(l, disambPage.get())
for l in links]
except pywikibot.NoPage:
diff --git a/tests/solve_disambiguation_tests.py b/tests/solve_disambiguation_tests.py
new file mode 100644
index 0000000..9c86bb3
--- /dev/null
+++ b/tests/solve_disambiguation_tests.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+"""Test solve_disambiguation bot module."""
+#
+# (C) Pywikibot team, 2018
+#
+# Distributed under the terms of the MIT license.
+#
+from __future__ import absolute_import, unicode_literals
+
+import pywikibot
+
+from scripts.solve_disambiguation import DisambiguationRobot
+
+from tests.aspects import TestCase, unittest
+
+
+class TestGettingDisambigLinks(TestCase):
+ """Test getting disambiguation links."""
+
+ family = 'wikipedia'
+ code = 'en'
+
+ def test_get(self):
+ """Test getting disambiguation links."""
+ page = pywikibot.Page(self.site, 'foo')
+ bot = DisambiguationRobot(None, [], True, False, None, False, False,
+ minimum=0)
+ page.text = '* [[Link1]]\n* [[Link2]]'
+ newlinks = bot.get_disambiguation_links(page)
+ links = [
+ pywikibot.Link(self.site, 'Link1'),
+ pywikibot.Link(self.site, 'Link2')]
+ self.assertEqual(newlinks, links)
+
+ def test_get_without_templates(self):
+ """Test excluding links from disamb_templates."""
+ page = pywikibot.Page(self.site, 'foo')
+ bot = DisambiguationRobot(None, [], True, False, None, False, False,
+ minimum=0)
+ page.text = '* [[Link1]]\n{{Disambig}}'
+ newlinks = bot.get_disambiguation_links(page)
+ links = [pywikibot.Link(self.site, 'Link1')]
+ self.assertEqual(newlinks, links)
+
+
+if __name__ == '__main__':
+ unittest.main()
--
To view, visit
https://gerrit.wikimedia.org/r/372064
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7f6762f9b7ebda4bbf04598fa15ba4f82266c9c6
Gerrit-PatchSet: 23
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa(a)seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: JAn Dudík <jan.dudik(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zoranzoki21 <zorandori4444(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>