Xqt submitted this change.

View Change

Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
[IMPR] Only follow redirects in harvest_template.py if no wikibase item exists

This implements proposal no. 2 of T311883.

- Make _template_link_target a public staticmethod
- Also catch InvalidPageError raised by the linked_page.exists() call
- Do not follow a redirect in template_link_target if the redirect
page itself has a wikibase item
- Resolve the multiple return statements issue by using a single exit point
- Add test for this new behaviour

Bug: T311883
Change-Id: I87fe427009f9bbe5db2208d0ed850eb0d48bd505
---
M scripts/harvest_template.py
M tests/__init__.py
A tests/harvest_templates_tests.py
3 files changed, 74 insertions(+), 19 deletions(-)

diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index c1df570..a225b8d 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -106,7 +106,11 @@
from pywikibot import textlib
from pywikibot.backports import List
from pywikibot.bot import ConfigParserBot, OptionHandler, WikidataBot
-from pywikibot.exceptions import InvalidTitleError, NoPageError
+from pywikibot.exceptions import (
+ InvalidPageError,
+ InvalidTitleError,
+ NoPageError,
+)


willstop = False
@@ -203,15 +207,23 @@
titles.append(temp.title(with_ns=False))
return titles

- def _template_link_target(self, item, link_text
- ) -> Optional[pywikibot.ItemPage]:
+ @staticmethod
+ def template_link_target(item: pywikibot.ItemPage,
+ link_text: str) -> Optional[pywikibot.ItemPage]:
+ """Find the ItemPage target for a given link text.
+
+ .. versionchanged:: 7.4
+ Only follow the redirect target if redirect page has no
+ wikibase item.
+ """
link = pywikibot.Link(link_text)
linked_page = pywikibot.Page(link)
try:
exists = linked_page.exists()
- except InvalidTitleError:
- pywikibot.error('"{}" is not a valid title so it cannot be linked.'
- ' Skipping.'.format(link_text))
+ except (InvalidTitleError, InvalidPageError):
+ pywikibot.error('"{}" is not a valid title or the page itself is '
+ 'invalid so it cannot be linked. Skipping.'
+ .format(link_text))
return None

if not exists:
@@ -219,23 +231,24 @@
'Skipping.'.format(linked_page))
return None

- if linked_page.isRedirectPage():
- linked_page = linked_page.getRedirectTarget()
-
- try:
- linked_item = pywikibot.ItemPage.fromPage(linked_page)
- except NoPageError:
- linked_item = None
+ while True:
+ try:
+ linked_item = pywikibot.ItemPage.fromPage(linked_page)
+ except NoPageError:
+ if linked_page.isRedirectPage():
+ linked_page = linked_page.getRedirectTarget()
+ continue
+ linked_item = None
+ break

if not linked_item or not linked_item.exists():
pywikibot.output('{} does not have a wikidata item to link with. '
'Skipping.'.format(linked_page))
- return None
-
- if linked_item.title() == item.title():
+ linked_item = None
+ elif linked_item.title() == item.title():
pywikibot.output('{} links to itself. Skipping.'
.format(linked_page))
- return None
+ linked_item = None

return linked_item

@@ -295,7 +308,7 @@
for match in pywikibot.link_regex.finditer(value):
matched = True
link_text = match.group(1)
- linked_item = self._template_link_target(
+ linked_item = self.template_link_target(
item, link_text)
added = False
if linked_item:
@@ -321,7 +334,7 @@
.format(claim.getID(), field, value))
continue

- linked_item = self._template_link_target(item, value)
+ linked_item = self.template_link_target(item, value)
if not linked_item:
continue

diff --git a/tests/__init__.py b/tests/__init__.py
index 39ce2b0..fa97e19 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -148,6 +148,7 @@
'fixing_redirects',
'generate_family_file',
'generate_user_files',
+ 'harvest_templates',
'interwikidata',
'l10n',
'patrolbot',
diff --git a/tests/harvest_templates_tests.py b/tests/harvest_templates_tests.py
new file mode 100644
index 0000000..4ea7a6e
--- /dev/null
+++ b/tests/harvest_templates_tests.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python3
+"""Tests for scripts/harvest_template.py."""
+#
+# (C) Pywikibot team, 2022
+#
+# Distributed under the terms of the MIT license.
+#
+import unittest
+from contextlib import suppress
+
+from pywikibot import ItemPage
+from scripts.harvest_template import HarvestRobot
+
+from tests.aspects import ScriptMainTestCase
+
+
+class TestHarvestRobot(ScriptMainTestCase):
+
+ """Test HarvestRobot."""
+
+ family = 'wikipedia'
+ code = 'cs'
+
+ def test_template_link_target(self):
+ """Test template_link_target static method."""
+ tests = [
+ ('Pes', 'Q144'),
+ ('Imaginární číslo', 'Q9165172'),
+ ('Sequana', 'Q472766'),
+ ]
+ for link, item in tests:
+ with self.subTest(link=link, item=item):
+ dummy_item = ItemPage(self.site.data_repository(), 'Q1')
+ target = HarvestRobot.template_link_target(dummy_item, link)
+ self.assertIsInstance(target, ItemPage)
+ self.assertEqual(target.title(), item)
+
+
+if __name__ == '__main__': # pragma: no cover
+ with suppress(SystemExit):
+ unittest.main()

To view, visit change 810472. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I87fe427009f9bbe5db2208d0ed850eb0d48bd505
Gerrit-Change-Number: 810472
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Fomafix <fomafix@googlemail.com>
Gerrit-Reviewer: JAn Dudík <jan.dudik@gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged