Xqt has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/810539 )
Change subject: [IMPR] Add handlers for various claim types ......................................................................
[IMPR] Add handlers for various claim types
- Add handlers for wikibase-item, string, external-id, url and commonsMedia - dispatch handlers in HarvestRobot.treat_field() - use current_page in template_link_target() to determine the current site because the generator may yield pages from multiple sites - create the Page object directly in template_link_target() and handle_commonsmedia() instead of creating a Link object first - use image_repository() in handle_commonsmedia() instead of the hardcoded 'commons' site
This does not change or expand the current behaviour, but it prepares for addressing the tasks found here: https://phabricator.wikimedia.org/search/query/XILoHVKp2l5I/#R
Change-Id: I58ca30929a3ac9e7edf214b663b63fac3596c19a --- M scripts/harvest_template.py D tests/harvest_templates_tests.py 2 files changed, 119 insertions(+), 123 deletions(-)
Approvals: Xqt: Verified; Looks good to me, approved
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index 7bf679c..3d9e6f6 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -207,8 +207,8 @@ titles.append(temp.title(with_ns=False)) return titles
- @staticmethod - def template_link_target(item: pywikibot.ItemPage, + def template_link_target(self, + item: pywikibot.ItemPage, link_text: str) -> Optional[pywikibot.ItemPage]: """Find the ItemPage target for a given link text.
@@ -216,8 +216,7 @@ Only follow the redirect target if redirect page has no wikibase item. """ - link = pywikibot.Link(link_text) - linked_page = pywikibot.Page(link) + linked_page = pywikibot.Page(self.current_page.site, link_text) try: exists = linked_page.exists() except (InvalidTitleError, InvalidPageError): @@ -265,12 +264,17 @@ return local or default
def treat_page_and_item(self, - page: pywikibot.page.BasePage, - item: pywikibot.page.ItemPage) -> None: + page: Optional[pywikibot.page.BasePage], + item: Optional[pywikibot.page.ItemPage]) -> None: """Process a single page/item.""" if willstop: raise KeyboardInterrupt
+ if page is None: + return + + assert page is self.current_page + templates = page.raw_extracted_templates for template, fielddict in templates: # Clean up template @@ -287,102 +291,135 @@
# We found the template we were looking for for field_item in fielddict.items(): - self.treat_field(page, item, field_item) + self.treat_field(item, field_item)
def treat_field(self, - page: pywikibot.page.BasePage, item: pywikibot.page.ItemPage, field_item: Tuple[str, str]) -> None: - """Process a single field of template fileddict.""" + """Process a single field of template fileddict. + + .. versionadded:: 7.4 + """ field, value = field_item field = field.strip() + site = self.current_page.site + # todo: extend the list of tags to ignore value = textlib.removeDisabledParts( # todo: eventually we may want to import the references - value, tags=['ref'], site=page.site).strip() + value, tags=['ref'], site=site).strip()
if not field or not value or field not in self.fields: return
# This field contains something useful for us prop, options = self.fields[field] + exists_arg = list(self._get_option_with_fallback(options, 'exists')) claim = pywikibot.Claim(self.repo, prop) - exists_arg = self._get_option_with_fallback(options, 'exists') - - if claim.type == 'wikibase-item': - do_multi = self._get_option_with_fallback(options, 'multi') - matched = False - - # Try to extract a valid page - for match in pywikibot.link_regex.finditer(value): - matched = True - link_text = match.group(1) - linked_item = self.template_link_target(item, link_text) - added = False - - if linked_item: - claim.setTarget(linked_item) - added = self.user_add_claim_unless_exists( - item, claim, exists_arg, page.site, pywikibot.info) - claim = pywikibot.Claim(self.repo, prop) - - # stop after the first match if not supposed to add - # multiple values - if not do_multi: - break - - # update exists_arg, so we can add more values - if 'p' not in exists_arg and added: - exists_arg += 'p' - - if matched: - return - - if not self._get_option_with_fallback(options, 'islink'): - pywikibot.info( - '{} field {} value {} is not a wikilink. Skipping.' - .format(claim.getID(), field, value)) - return - - linked_item = self.template_link_target(item, value) - if not linked_item: - return - - claim.setTarget(linked_item) - - elif claim.type in ('string', 'external-id'): - claim.setTarget(value.strip()) - - elif claim.type == 'url': - match = self.linkR.search(value) - if not match: - return - - claim.setTarget(match.group('url')) - - elif claim.type == 'commonsMedia': - commonssite = pywikibot.Site('commons') - imagelink = pywikibot.Link( - value, source=commonssite, default_namespace=6) - image = pywikibot.FilePage(imagelink) - if image.isRedirectPage(): - image = pywikibot.FilePage(image.getRedirectTarget()) - - if not image.exists(): - pywikibot.info("{} doesn't exist. 
I can't link to it" - .format(image.title(as_link=True))) - return - - claim.setTarget(image) - - else: + handler = getattr(self, 'handle_' + + claim.type.lower().replace('-', '_'), None) + if not handler: pywikibot.info('{} is not a supported datatype.' .format(claim.type)) return
- # A generator might yield pages from multiple sites - self.user_add_claim_unless_exists( - item, claim, exists_arg, page.site, pywikibot.info) + if handler(claim, value, item, field, exists_arg): + # A generator might yield pages from multiple sites + self.user_add_claim_unless_exists( + item, claim, ''.join(exists_arg), site, pywikibot.info) + + def handle_wikibase_item(self, claim, value: str, + item: pywikibot.page.ItemPage, + field: str, + exists_arg: List[str]) -> bool: + """Handle 'wikibase-item' claim type. + + .. note:: `exists_arg` may be modified in place which is reused + by the caller method + .. versionadded:: 7.4 + """ + prop, options = self.fields[field] + do_multi = self._get_option_with_fallback(options, 'multi') + matched = False + + # Try to extract a valid page + for match in pywikibot.link_regex.finditer(value): + matched = True + link_text = match.group(1) + linked_item = self.template_link_target(item, link_text) + added = False + + if linked_item: + claim.setTarget(linked_item) + added = self.user_add_claim_unless_exists( + item, claim, exists_arg, self.current_page.site, + pywikibot.info) + claim = pywikibot.Claim(self.repo, prop) + + # stop after the first match if not supposed to add + # multiple values + if not do_multi: + break + + # update exists_arg, so we can add more values + if 'p' not in exists_arg and added: + exists_arg += 'p' + + if matched: + return False + + if not self._get_option_with_fallback(options, 'islink'): + pywikibot.info( + '{} field {} value {} is not a wikilink. Skipping.' + .format(claim.getID(), field, value)) + return False + + linked_item = self.template_link_target(item, value) + if not linked_item: + return False + + claim.setTarget(linked_item) + return True + + def handle_string(self, claim, value, *args) -> bool: + """Handle 'string' and 'external-id' claim type. + + .. 
versionadded:: 7.4 + """ + claim.setTarget(value.strip()) + return True + + handle_external_id = handle_string + + def handle_url(self, claim, value, *args) -> bool: + """Handle 'url' claim type. + + .. versionadded:: 7.4 + """ + match = self.linkR.search(value) + if not match: + return False + + claim.setTarget(match.group('url')) + return True + + def handle_commonsmedia(self, claim, value, *args) -> bool: + """Handle 'commonsMedia' claim type. + + .. versionadded:: 7.4 + """ + repo = self.current_page.site.image_repository() + image = pywikibot.FilePage(repo, value) + if image.isRedirectPage(): + image = pywikibot.FilePage(image.getRedirectTarget()) + + if not image.exists(): + pywikibot.info("{} doesn't exist. I can't link to it" + .format(image.title(as_link=True))) + return False + + claim.setTarget(image) + return True
def main(*args: str) -> None: diff --git a/tests/harvest_templates_tests.py b/tests/harvest_templates_tests.py deleted file mode 100644 index 4ea7a6e..0000000 --- a/tests/harvest_templates_tests.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/python3 -"""Tests for scripts/harvest_template.py.""" -# -# (C) Pywikibot team, 2022 -# -# Distributed under the terms of the MIT license. -# -import unittest -from contextlib import suppress - -from pywikibot import ItemPage -from scripts.harvest_template import HarvestRobot - -from tests.aspects import ScriptMainTestCase - - -class TestHarvestRobot(ScriptMainTestCase): - - """Test HarvestRobot.""" - - family = 'wikipedia' - code = 'cs' - - def test_template_link_target(self): - """Test template_link_target static method.""" - tests = [ - ('Pes', 'Q144'), - ('Imaginární číslo', 'Q9165172'), - ('Sequana', 'Q472766'), - ] - for link, item in tests: - with self.subTest(link=link, item=item): - dummy_item = ItemPage(self.site.data_repository(), 'Q1') - target = HarvestRobot.template_link_target(dummy_item, link) - self.assertIsInstance(target, ItemPage) - self.assertEqual(target.title(), item) - - -if __name__ == '__main__': # pragma: no cover - with suppress(SystemExit): - unittest.main()