Xqt submitted this change.

Approvals: Xqt: Verified; Looks good to me, approved
[IMPR] Add handlers for various claim types

- Add handlers for wikibase-item, string, external-id, url and commonsMedia
- Dispatch handlers in HarvestRobot.treat_field() (see the sketch below)
- Use current_page in template_link_target() to determine the current
  site, because the generator may yield pages from multiple sites
- Create the Page object directly in template_link_target() and
  handle_commonsmedia() instead of creating a Link object first
- Use image_repository() in handle_commonsmedia() instead of the
  hardcoded 'commons' site

This does not change or expand the current behaviour but prepares
for the tasks found here:
https://phabricator.wikimedia.org/search/query/XILoHVKp2l5I/#R
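For illustration, the new dispatch in treat_field() derives the handler
name from the claim datatype. The following is a minimal standalone
sketch of that pattern; only the name mangling mirrors the committed
code, while the class, the plain print() and the sample handlers are
illustrative stand-ins (the script itself uses pywikibot.info() and the
bound handle_* methods of HarvestRobot):

class HandlerDispatchSketch:

    """Minimal sketch of the datatype-based handler dispatch."""

    def dispatch(self, claim_type: str, value: str):
        # Name mangling as in treat_field(), e.g.
        # 'wikibase-item' -> handle_wikibase_item,
        # 'commonsMedia'  -> handle_commonsmedia
        handler = getattr(
            self, 'handle_' + claim_type.lower().replace('-', '_'), None)
        if handler is None:
            print('{} is not a supported datatype.'.format(claim_type))
            return None
        return handler(value)

    def handle_string(self, value: str) -> str:
        """Strip the raw template value, as the real handler does."""
        return value.strip()

    # 'external-id' values are handled exactly like plain strings
    handle_external_id = handle_string


# HandlerDispatchSketch().dispatch('external-id', ' 12345 ')  -> '12345'
# HandlerDispatchSketch().dispatch('time', '1970')  -> prints the
# "not a supported datatype" message and returns None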

Change-Id: I58ca30929a3ac9e7edf214b663b63fac3596c19a
---
M scripts/harvest_template.py
D tests/harvest_templates_tests.py
2 files changed, 119 insertions(+), 123 deletions(-)

diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 7bf679c..3d9e6f6 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -207,8 +207,8 @@
titles.append(temp.title(with_ns=False))
return titles

- @staticmethod
- def template_link_target(item: pywikibot.ItemPage,
+ def template_link_target(self,
+ item: pywikibot.ItemPage,
link_text: str) -> Optional[pywikibot.ItemPage]:
"""Find the ItemPage target for a given link text.

@@ -216,8 +216,7 @@
Only follow the redirect target if redirect page has no
wikibase item.
"""
- link = pywikibot.Link(link_text)
- linked_page = pywikibot.Page(link)
+ linked_page = pywikibot.Page(self.current_page.site, link_text)
try:
exists = linked_page.exists()
except (InvalidTitleError, InvalidPageError):
@@ -265,12 +264,17 @@
return local or default

def treat_page_and_item(self,
- page: pywikibot.page.BasePage,
- item: pywikibot.page.ItemPage) -> None:
+ page: Optional[pywikibot.page.BasePage],
+ item: Optional[pywikibot.page.ItemPage]) -> None:
"""Process a single page/item."""
if willstop:
raise KeyboardInterrupt

+ if page is None:
+ return
+
+ assert page is self.current_page
+
templates = page.raw_extracted_templates
for template, fielddict in templates:
# Clean up template
@@ -287,102 +291,135 @@

# We found the template we were looking for
for field_item in fielddict.items():
- self.treat_field(page, item, field_item)
+ self.treat_field(item, field_item)

def treat_field(self,
- page: pywikibot.page.BasePage,
item: pywikibot.page.ItemPage,
field_item: Tuple[str, str]) -> None:
- """Process a single field of template fileddict."""
+ """Process a single field of template fileddict.
+
+ .. versionadded:: 7.4
+ """
field, value = field_item
field = field.strip()
+ site = self.current_page.site
+
# todo: extend the list of tags to ignore
value = textlib.removeDisabledParts(
# todo: eventually we may want to import the references
- value, tags=['ref'], site=page.site).strip()
+ value, tags=['ref'], site=site).strip()

if not field or not value or field not in self.fields:
return

# This field contains something useful for us
prop, options = self.fields[field]
+ exists_arg = list(self._get_option_with_fallback(options, 'exists'))
claim = pywikibot.Claim(self.repo, prop)
- exists_arg = self._get_option_with_fallback(options, 'exists')
-
- if claim.type == 'wikibase-item':
- do_multi = self._get_option_with_fallback(options, 'multi')
- matched = False
-
- # Try to extract a valid page
- for match in pywikibot.link_regex.finditer(value):
- matched = True
- link_text = match.group(1)
- linked_item = self.template_link_target(item, link_text)
- added = False
-
- if linked_item:
- claim.setTarget(linked_item)
- added = self.user_add_claim_unless_exists(
- item, claim, exists_arg, page.site, pywikibot.info)
- claim = pywikibot.Claim(self.repo, prop)
-
- # stop after the first match if not supposed to add
- # multiple values
- if not do_multi:
- break
-
- # update exists_arg, so we can add more values
- if 'p' not in exists_arg and added:
- exists_arg += 'p'
-
- if matched:
- return
-
- if not self._get_option_with_fallback(options, 'islink'):
- pywikibot.info(
- '{} field {} value {} is not a wikilink. Skipping.'
- .format(claim.getID(), field, value))
- return
-
- linked_item = self.template_link_target(item, value)
- if not linked_item:
- return
-
- claim.setTarget(linked_item)
-
- elif claim.type in ('string', 'external-id'):
- claim.setTarget(value.strip())
-
- elif claim.type == 'url':
- match = self.linkR.search(value)
- if not match:
- return
-
- claim.setTarget(match.group('url'))
-
- elif claim.type == 'commonsMedia':
- commonssite = pywikibot.Site('commons')
- imagelink = pywikibot.Link(
- value, source=commonssite, default_namespace=6)
- image = pywikibot.FilePage(imagelink)
- if image.isRedirectPage():
- image = pywikibot.FilePage(image.getRedirectTarget())
-
- if not image.exists():
- pywikibot.info("{} doesn't exist. I can't link to it"
- .format(image.title(as_link=True)))
- return
-
- claim.setTarget(image)
-
- else:
+ handler = getattr(self, 'handle_'
+ + claim.type.lower().replace('-', '_'), None)
+ if not handler:
pywikibot.info('{} is not a supported datatype.'
.format(claim.type))
return

- # A generator might yield pages from multiple sites
- self.user_add_claim_unless_exists(
- item, claim, exists_arg, page.site, pywikibot.info)
+ if handler(claim, value, item, field, exists_arg):
+ # A generator might yield pages from multiple sites
+ self.user_add_claim_unless_exists(
+ item, claim, ''.join(exists_arg), site, pywikibot.info)
+
+ def handle_wikibase_item(self, claim, value: str,
+ item: pywikibot.page.ItemPage,
+ field: str,
+ exists_arg: List[str]) -> bool:
+ """Handle 'wikibase-item' claim type.
+
+ .. note:: `exists_arg` may be modified in place; the modified list
+ is reused by the caller method
+ .. versionadded:: 7.4
+ """
+ prop, options = self.fields[field]
+ do_multi = self._get_option_with_fallback(options, 'multi')
+ matched = False
+
+ # Try to extract a valid page
+ for match in pywikibot.link_regex.finditer(value):
+ matched = True
+ link_text = match.group(1)
+ linked_item = self.template_link_target(item, link_text)
+ added = False
+
+ if linked_item:
+ claim.setTarget(linked_item)
+ added = self.user_add_claim_unless_exists(
+ item, claim, exists_arg, self.current_page.site,
+ pywikibot.info)
+ claim = pywikibot.Claim(self.repo, prop)
+
+ # stop after the first match if not supposed to add
+ # multiple values
+ if not do_multi:
+ break
+
+ # update exists_arg, so we can add more values
+ if 'p' not in exists_arg and added:
+ exists_arg += 'p'
+
+ if matched:
+ return False
+
+ if not self._get_option_with_fallback(options, 'islink'):
+ pywikibot.info(
+ '{} field {} value {} is not a wikilink. Skipping.'
+ .format(claim.getID(), field, value))
+ return False
+
+ linked_item = self.template_link_target(item, value)
+ if not linked_item:
+ return False
+
+ claim.setTarget(linked_item)
+ return True
+
+ def handle_string(self, claim, value, *args) -> bool:
+ """Handle 'string' and 'external-id' claim type.
+
+ .. versionadded:: 7.4
+ """
+ claim.setTarget(value.strip())
+ return True
+
+ handle_external_id = handle_string
+
+ def handle_url(self, claim, value, *args) -> bool:
+ """Handle 'url' claim type.
+
+ .. versionadded:: 7.4
+ """
+ match = self.linkR.search(value)
+ if not match:
+ return False
+
+ claim.setTarget(match.group('url'))
+ return True
+
+ def handle_commonsmedia(self, claim, value, *args) -> bool:
+ """Handle 'commonsMedia' claim type.
+
+ .. versionadded:: 7.4
+ """
+ repo = self.current_page.site.image_repository()
+ image = pywikibot.FilePage(repo, value)
+ if image.isRedirectPage():
+ image = pywikibot.FilePage(image.getRedirectTarget())
+
+ if not image.exists():
+ pywikibot.info("{} doesn't exist. I can't link to it"
+ .format(image.title(as_link=True)))
+ return False
+
+ claim.setTarget(image)
+ return True


def main(*args: str) -> None:
diff --git a/tests/harvest_templates_tests.py b/tests/harvest_templates_tests.py
deleted file mode 100644
index 4ea7a6e..0000000
--- a/tests/harvest_templates_tests.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/python3
-"""Tests for scripts/harvest_template.py."""
-#
-# (C) Pywikibot team, 2022
-#
-# Distributed under the terms of the MIT license.
-#
-import unittest
-from contextlib import suppress
-
-from pywikibot import ItemPage
-from scripts.harvest_template import HarvestRobot
-
-from tests.aspects import ScriptMainTestCase
-
-
-class TestHarvestRobot(ScriptMainTestCase):
-
- """Test HarvestRobot."""
-
- family = 'wikipedia'
- code = 'cs'
-
- def test_template_link_target(self):
- """Test template_link_target static method."""
- tests = [
- ('Pes', 'Q144'),
- ('Imaginární číslo', 'Q9165172'),
- ('Sequana', 'Q472766'),
- ]
- for link, item in tests:
- with self.subTest(link=link, item=item):
- dummy_item = ItemPage(self.site.data_repository(), 'Q1')
- target = HarvestRobot.template_link_target(dummy_item, link)
- self.assertIsInstance(target, ItemPage)
- self.assertEqual(target.title(), item)
-
-
-if __name__ == '__main__': # pragma: no cover
- with suppress(SystemExit):
- unittest.main()

To view, visit change 810539.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I58ca30929a3ac9e7edf214b663b63fac3596c19a
Gerrit-Change-Number: 810539
Gerrit-PatchSet: 10
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: JAn Dudík <jan.dudik@gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Multichill <maarten@mdammers.nl>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged