jenkins-bot has submitted this change and it was merged.
Change subject: Bug 55124: link harvesting bugs
......................................................................
Bug 55124: link harvesting bugs
- move link handling to a helper function
- dont add claim for value which didnt match wikilink regexp
- fix NoPage/exists check
- add check to prevent linking an item to itself
Change-Id: I049c13d2ca1c54e304d31cf00ae608a07e2235f1
---
M scripts/harvest_template.py
1 file changed, 36 insertions(+), 12 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 65ec3b3..c70c9f2 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -85,6 +85,31 @@
titles.append(temp.title(withNamespace=False))
return titles
+ def _template_link_target(self, item, link_text):
+ linked_page = None
+
+ link = pywikibot.Link(link_text)
+ linked_page = pywikibot.Page(link)
+
+ if not linked_page.exists():
+ pywikibot.output(u'%s doesn\'t exist so it can\'t be linked.
Skipping' % (linked_page))
+ return
+
+ if linked_page.isRedirectPage():
+ linked_page = linked_page.getRedirectTarget()
+
+ linked_item = pywikibot.ItemPage.fromPage(linked_page)
+
+ if not linked_item.exists():
+ pywikibot.output(u'%s doesn\'t have a wikidata item to link with.
Skipping' % (linked_page))
+ return
+
+ if linked_item.title() == item.title():
+ pywikibot.output(u'%s links to itself. Skipping' % (linked_page))
+ return
+
+ return linked_item
+
def processPage(self, page):
"""
Process a single page
@@ -96,7 +121,7 @@
#TODO FIXME: We should provide an option to create the page
item.get()
if set(self.fields.values()) <= set(item.claims.keys()):
- pywikibot.output('%s item %s has claims for all properties. Skipping'
% (page, item.title()))
+ pywikibot.output(u'%s item %s has claims for all properties.
Skipping' % (page, item.title()))
else:
pagetext = page.get()
templates = pywikibot.extract_templates_and_params(pagetext)
@@ -131,17 +156,16 @@
if claim.getType() == 'wikibase-item':
# Try to extract a valid page
match = re.search(pywikibot.link_regex, value)
- if match:
- try:
- link = pywikibot.Link(match.group(1))
- linkedPage = pywikibot.Page(link)
- if linkedPage.isRedirectPage():
- linkedPage =
linkedPage.getRedirectTarget()
- linkedItem =
pywikibot.ItemPage.fromPage(linkedPage)
- claim.setTarget(linkedItem)
- except pywikibot.exceptions.NoPage:
- pywikibot.output('[[%s]] doesn\'t
exist so I can\'t link to it' % (linkedItem.title(),))
- continue
+ if not match:
+ pywikibot.output(u'%s field %s value %s isnt
a wikilink. Skipping' % (claim.getID(), field, value))
+ continue
+
+ link_text = match.group(1)
+ linked_item = self._template_link_target(item,
link_text)
+ if not linked_item:
+ continue
+
+ claim.setTarget(linked_item)
elif claim.getType() == 'string':
claim.setTarget(value.strip())
elif claim.getType() == 'commonsMedia':
--
To view, visit
https://gerrit.wikimedia.org/r/135313
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I049c13d2ca1c54e304d31cf00ae608a07e2235f1
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>