Xqt has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/810536 )
Change subject: [IMPR] Detach field handling out of treat_page_and_item method ......................................................................
[IMPR] Detach field handling out of treat_page_and_item method
Change-Id: If767efaa2ccc0cd379137a2e7ce1441df7cc3e9b --- M scripts/harvest_template.py 1 file changed, 91 insertions(+), 80 deletions(-)
Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index a225b8d..137cfd7 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -270,103 +270,114 @@ raise KeyboardInterrupt
templates = page.raw_extracted_templates - for (template, fielddict) in templates: + for template, fielddict in templates: # Clean up template try: - template = pywikibot.Page(page.site, template, - ns=10).title(with_ns=False) + template = pywikibot.Page(page.site, template, ns=10) except InvalidTitleError: pywikibot.error( - "Failed parsing template; '{}' should be " + 'Failed parsing template; {!r} should be ' 'the template name.'.format(template)) continue
- if template not in self.templateTitles: + if template.title(with_ns=False) not in self.templateTitles: continue + # We found the template we were looking for - for field, value in fielddict.items(): - field = field.strip() - # todo: extend the list of tags to ignore - value = textlib.removeDisabledParts( - # todo: eventually we may want to import the references - value, tags=['ref'], site=page.site).strip() - if not field or not value: - continue + for field_item in fielddict.items(): + self.treat_field(page, item, field_item)
-            if field not in self.fields: -                continue +    def treat_field(self, page, item, field_item): +        """Process a single field of template fielddict.""" +        field, value = field_item +        field = field.strip() +        # todo: extend the list of tags to ignore +        value = textlib.removeDisabledParts( +            # todo: eventually we may want to import the references +            value, tags=['ref'], site=page.site).strip()
- # This field contains something useful for us - prop, options = self.fields[field] - claim = pywikibot.Claim(self.repo, prop) - exists_arg = self._get_option_with_fallback(options, 'exists') - if claim.type == 'wikibase-item': - do_multi = self._get_option_with_fallback( - options, 'multi') - matched = False - # Try to extract a valid page - for match in pywikibot.link_regex.finditer(value): - matched = True - link_text = match.group(1) - linked_item = self.template_link_target( - item, link_text) - added = False - if linked_item: - claim.setTarget(linked_item) - added = self.user_add_claim_unless_exists( - item, claim, exists_arg, page.site, - pywikibot.output) - claim = pywikibot.Claim(self.repo, prop) - # stop after the first match if not supposed to add - # multiple values - if not do_multi: - break - # update exists_arg, so we can add more values - if 'p' not in exists_arg and added: - exists_arg += 'p' + if not field or not value or field not in self.fields: + return
- if matched: - continue + # This field contains something useful for us + prop, options = self.fields[field] + claim = pywikibot.Claim(self.repo, prop) + exists_arg = self._get_option_with_fallback(options, 'exists')
- if not self._get_option_with_fallback(options, 'islink'): - pywikibot.output( - '{} field {} value {} is not a wikilink. Skipping.' - .format(claim.getID(), field, value)) - continue + if claim.type == 'wikibase-item': + do_multi = self._get_option_with_fallback(options, 'multi') + matched = False
- linked_item = self.template_link_target(item, value) - if not linked_item: - continue + # Try to extract a valid page + for match in pywikibot.link_regex.finditer(value): + matched = True + link_text = match.group(1) + linked_item = self.template_link_target(item, link_text) + added = False
+ if linked_item: claim.setTarget(linked_item) - elif claim.type in ('string', 'external-id'): - claim.setTarget(value.strip()) - elif claim.type == 'url': - match = self.linkR.search(value) - if not match: - continue - claim.setTarget(match.group('url')) - elif claim.type == 'commonsMedia': - commonssite = pywikibot.Site('commons') - imagelink = pywikibot.Link( - value, source=commonssite, default_namespace=6) - image = pywikibot.FilePage(imagelink) - if image.isRedirectPage(): - image = pywikibot.FilePage(image.getRedirectTarget()) - if not image.exists(): - pywikibot.output( - "{} doesn't exist. I can't link to it" - .format(image.title(as_link=True))) - continue - claim.setTarget(image) - else: - pywikibot.output('{} is not a supported datatype.' - .format(claim.type)) - continue + added = self.user_add_claim_unless_exists( + item, claim, exists_arg, page.site, pywikibot.info) + claim = pywikibot.Claim(self.repo, prop)
- # A generator might yield pages from multiple sites - self.user_add_claim_unless_exists( - item, claim, exists_arg, page.site, pywikibot.output) + # stop after the first match if not supposed to add + # multiple values + if not do_multi: + break + + # update exists_arg, so we can add more values + if 'p' not in exists_arg and added: + exists_arg += 'p' + + if matched: + return + + if not self._get_option_with_fallback(options, 'islink'): + pywikibot.info( + '{} field {} value {} is not a wikilink. Skipping.' + .format(claim.getID(), field, value)) + return + + linked_item = self.template_link_target(item, value) + if not linked_item: + return + + claim.setTarget(linked_item) + + elif claim.type in ('string', 'external-id'): + claim.setTarget(value.strip()) + + elif claim.type == 'url': + match = self.linkR.search(value) + if not match: + return + + claim.setTarget(match.group('url')) + + elif claim.type == 'commonsMedia': + commonssite = pywikibot.Site('commons') + imagelink = pywikibot.Link( + value, source=commonssite, default_namespace=6) + image = pywikibot.FilePage(imagelink) + if image.isRedirectPage(): + image = pywikibot.FilePage(image.getRedirectTarget()) + + if not image.exists(): + pywikibot.info("{} doesn't exist. I can't link to it" + .format(image.title(as_link=True))) + return + + claim.setTarget(image) + + else: + pywikibot.info('{} is not a supported datatype.' + .format(claim.type)) + return + + # A generator might yield pages from multiple sites + self.user_add_claim_unless_exists( + item, claim, exists_arg, page.site, pywikibot.info)
def main(*args: str) -> None:
pywikibot-commits@lists.wikimedia.org