[Gerrit] Bug 55124: link harvesting bugs - change (pywikibot/core) - Pywikibot-commits

27 May 2014

jenkins-bot has submitted this change and it was merged.
Change subject: Bug 55124: link harvesting bugs
......................................................................
Bug 55124: link harvesting bugs
- move link handling to a helper function
- don't add claim for value which didn't match wikilink regexp
- fix NoPage/exists check
- add check to prevent linking an item to itself
Change-Id: I049c13d2ca1c54e304d31cf00ae608a07e2235f1
---
M scripts/harvest_template.py
1 file changed, 36 insertions(+), 12 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 65ec3b3..c70c9f2 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -85,6 +85,31 @@
         titles.append(temp.title(withNamespace=False))
         return titles
+    def _template_link_target(self, item, link_text):
+        linked_page = None
+
+        link = pywikibot.Link(link_text)
+        linked_page = pywikibot.Page(link)
+
+        if not linked_page.exists():
+            pywikibot.output(u'%s doesn\'t exist so it can\'t be linked. Skipping' % (linked_page))
+            return
+
+        if linked_page.isRedirectPage():
+            linked_page = linked_page.getRedirectTarget()
+
+        linked_item = pywikibot.ItemPage.fromPage(linked_page)
+
+        if not linked_item.exists():
+            pywikibot.output(u'%s doesn\'t have a wikidata item to link with. Skipping' % (linked_page))
+            return
+
+        if linked_item.title() == item.title():
+            pywikibot.output(u'%s links to itself. Skipping' % (linked_page))
+            return
+
+        return linked_item
+
     def processPage(self, page):
         """
         Process a single page
@@ -96,7 +121,7 @@
             #TODO FIXME: We should provide an option to create the page
         item.get()
         if set(self.fields.values()) <= set(item.claims.keys()):
-            pywikibot.output('%s item %s has claims for all properties. Skipping' % (page, item.title()))
+            pywikibot.output(u'%s item %s has claims for all properties. Skipping' % (page, item.title()))
         else:
             pagetext = page.get()
             templates = pywikibot.extract_templates_and_params(pagetext)
@@ -131,17 +156,16 @@
                                 if claim.getType() == 'wikibase-item':
                                     # Try to extract a valid page
                                     match = re.search(pywikibot.link_regex, value)
-                                    if match:
-                                        try:
-                                            link = pywikibot.Link(match.group(1))
-                                            linkedPage = pywikibot.Page(link)
-                                            if linkedPage.isRedirectPage():
-                                                linkedPage = linkedPage.getRedirectTarget()
-                                            linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
-                                            claim.setTarget(linkedItem)
-                                        except pywikibot.exceptions.NoPage:
-                                            pywikibot.output('[[%s]] doesn\'t exist so I can\'t link to it' % (linkedItem.title(),))
-                                            continue
+                                    if not match:
+                                        pywikibot.output(u'%s field %s value %s isnt a wikilink. Skipping' % (claim.getID(), field, value))
+                                        continue
+
+                                    link_text = match.group(1)
+                                    linked_item = self._template_link_target(item, link_text)
+                                    if not linked_item:
+                                        continue
+
+                                    claim.setTarget(linked_item)
                                 elif claim.getType() == 'string':
                                     claim.setTarget(value.strip())
                                 elif claim.getType() == 'commonsMedia':
-- 
To view, visit https://gerrit.wikimedia.org/r/135313
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I049c13d2ca1c54e304d31cf00ae608a07e2235f1
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: Ladsgroup ladsgroup@gmail.com
Gerrit-Reviewer: Merlijn van Deen valhallasw@arctus.nl
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: jenkins-bot <>