jenkins-bot merged this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
Harvest multiple values from one parameter

The new modifier is '-multi'.

The modifier only works with claims of type wikibase-item; other claim types
are not supported.

Bug: T87689
Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0
---
M scripts/harvest_template.py
1 file changed, 50 insertions(+), 17 deletions(-)

diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 6c6fd07..80718d0 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -40,6 +40,8 @@
has the imported property with the imported value and
some qualifiers.

+-multi If set, try to match multiple values from parameter.
+
Examples:

python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
@@ -75,10 +77,18 @@
page won't be skipped if the item already has that property but there is
not the new value.

+ python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
+ -template:"Infobox musical artist" current_members P527 -exists:p \
+ -multi
+
+ will import band members from the "current_members" parameter of "Infobox
+ musical artist" on English Wikipedia as Wikidata property "P527" (has
+ part). This will only extract multiple band members if each is linked, and
+ will not add duplicate claims for the same member.
"""
#
# (C) Multichill, Amir, 2013
-# (C) Pywikibot team, 2013-2018
+# (C) Pywikibot team, 2013-2019
#
# Distributed under the terms of MIT License.
#
@@ -114,8 +124,9 @@
"""Class holding options for a param-property pair."""

availableOptions = {
- 'islink': False,
'exists': '',
+ 'islink': False,
+ 'multi': False,
}


@@ -140,12 +151,16 @@
@keyword exists: pattern for merging existing claims with harvested
values
@type exists: str
+ @keyword multi: Whether multiple values should be extracted from a
+ single parameter
+ @type multi: bool
"""
self.availableOptions.update({
'always': True,
'create': False,
'exists': '',
'islink': False,
+ 'multi': False,
})
super(HarvestRobot, self).__init__(**kwargs)
self.generator = generator
@@ -261,22 +276,42 @@
# This field contains something useful for us
prop, options = self.fields[field]
claim = pywikibot.Claim(self.repo, prop)
+ exists_arg = self._get_option_with_fallback(options, 'exists')
if claim.type == 'wikibase-item':
+ do_multi = self._get_option_with_fallback(
+ options, 'multi')
+ matched = False
# Try to extract a valid page
- match = pywikibot.link_regex.search(value)
- if match:
+ for match in pywikibot.link_regex.finditer(value):
+ matched = True
link_text = match.group(1)
- else:
- if self._get_option_with_fallback(options, 'islink'):
- link_text = value
- else:
- pywikibot.output(
- '{} field {} value {} is not a wikilink. '
- 'Skipping.'
- .format(claim.getID(), field, value))
- continue
+ linked_item = self._template_link_target(
+ item, link_text)
+ added = False
+ if linked_item:
+ claim.setTarget(linked_item)
+ added = self.user_add_claim_unless_exists(
+ item, claim, exists_arg, page.site,
+ pywikibot.output)
+ claim = pywikibot.Claim(self.repo, prop)
+ # stop after the first match if not supposed to add
+ # multiple values
+ if not do_multi:
+ break
+ # update exists_arg, so we can add more values
+ if 'p' not in exists_arg and added:
+ exists_arg += 'p'

- linked_item = self._template_link_target(item, link_text)
+ if matched:
+ continue
+
+ if not self._get_option_with_fallback(options, 'islink'):
+ pywikibot.output(
+ '{} field {} value {} is not a wikilink. Skipping.'
+ .format(claim.getID(), field, value))
+ continue
+
+ linked_item = self._template_link_target(item, value)
if not linked_item:
continue

@@ -308,9 +343,7 @@

# A generator might yield pages from multiple sites
self.user_add_claim_unless_exists(
- item, claim, self._get_option_with_fallback(
- options, 'exists'),
- page.site, pywikibot.output)
+ item, claim, exists_arg, page.site, pywikibot.output)


def main(*args):

To view, visit change 371586. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0
Gerrit-Change-Number: 371586
Gerrit-PatchSet: 8
Gerrit-Owner: Ejegg <ejegg@ejegg.com>
Gerrit-Reviewer: Ejegg <ejegg@ejegg.com>
Gerrit-Reviewer: Jgleeson <jgleeson@wikimedia.org>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski@gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot (75)