jenkins-bot has submitted this change and it was merged.
Change subject: Add SPARQL endpoint support for pywikibot
......................................................................
Add SPARQL endpoint support for pywikibot
Change-Id: I9fd08059fe2ba34bb95789e088dd7416a73daaed
---
M dev-requirements.txt
M pywikibot/README.rst
A pywikibot/data/sparql.py
M pywikibot/pagegenerators.py
A tests/sparql_tests.py
5 files changed, 400 insertions(+), 0 deletions(-)
Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/dev-requirements.txt b/dev-requirements.txt index 81f4b4a..734953c 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -11,6 +11,7 @@ flake8 codecov coverage +mock ; python_version < '3'
# pywin32 & pywinauto>=0.4.0 are Win32 UI test dependencies that have been # excluded from this file as they are quite expensive to install, and they diff --git a/pywikibot/README.rst b/pywikibot/README.rst index 1397683..5f77c30 100644 --- a/pywikibot/README.rst +++ b/pywikibot/README.rst @@ -133,6 +133,8 @@ +---------------------------+-------------------------------------------------------+ | wikistats.py | Objects representing WikiStats API | +---------------------------+-------------------------------------------------------+ + | sparql.py | Objects representing SPARQL query API | + +---------------------------+-------------------------------------------------------+
+---------------+------------------------------------------------------------------+ diff --git a/pywikibot/data/sparql.py b/pywikibot/data/sparql.py new file mode 100644 index 0000000..9e16d4d --- /dev/null +++ b/pywikibot/data/sparql.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- +"""SPARQL Query interface.""" +# +# Distributed under the terms of the MIT license. +# +from __future__ import absolute_import, unicode_literals + +import json +import sys +if sys.version_info[0] > 2: + from urllib.parse import quote +else: + from urllib2 import quote + +from pywikibot.comms import http + +WIKIDATA = 'http://query.wikidata.org/sparql' +DEFAULT_HEADERS = {'cache-control': 'no-cache', + 'Accept': 'application/sparql-results+json'} + + +class SparqlQuery(object): + """ + SPARQL Query class. + + This class allows to run SPARQL queries against any SPARQL endpoint. + """ + + def __init__(self, endpoint=WIKIDATA, entity_url='http://www.wikidata.org/entity/'): + """ + Create endpoint. + + @param endpoint: SPARQL endpoint URL, by default Wikidata query endpoint + """ + self.endpoint = endpoint + self.last_response = None + self.entity_url = entity_url + + def get_last_response(self): + """ + Return last received response. + + @return: Response object from last request or None + """ + return self.last_response + + def select(self, query, full_data=False, headers=DEFAULT_HEADERS): + """ + Run SPARQL query and return the result. 
+ + The response is assumed to be in format defined by: + https://www.w3.org/TR/2013/REC-sparql11-results-json-20130321/ + + @param query: Query text + @type query: string + @param full_data: Whether return full data objects or only values + @type full_data: bool + @return: List of query results or None if query failed + """ + data = self.query(query, headers=headers) + if data and 'results' in data: + result = [] + qvars = data['head']['vars'] + for row in data['results']['bindings']: + values = {} + for var in qvars: + if full_data: + if row[var]['type'] not in VALUE_TYPES: + raise ValueError('Unknown type: %s' % row[var]['type']) + valtype = VALUE_TYPES[row[var]['type']] + values[var] = valtype(row[var], entity_url=self.entity_url) + else: + values[var] = row[var]['value'] + result.append(values) + return result + else: + return None + + def query(self, query, headers=DEFAULT_HEADERS): + """ + Run SPARQL query and return parsed JSON result. + + @param query: Query text + @type query: string + """ + url = '%s?query=%s' % (self.endpoint, quote(query)) + self.last_response = http.fetch(url, headers=headers) + if not self.last_response.content: + return None + try: + return json.loads(self.last_response.content) + except ValueError: + return None + + def ask(self, query, headers=DEFAULT_HEADERS): + """ + Run SPARQL ASK query and return boolean result. + + @param query: Query text + @type query: string + @rtype: bool + """ + data = self.query(query, headers=headers) + return data['boolean'] + + def get_items(self, query, item_name='item'): + """ + Retrieve set of items which satisfy given query. + + Items are returned as Wikibase IDs. + + @param query: Query string. Must contain ?{item_name} as one of the projected values. + @param item_name: Name of the value to extract + @return: Set of item ids, e.g. 
Q1234 + @rtype: set + """ + res = self.select(query, full_data=True) + if res: + return set([r[item_name].getID() for r in res]) + return set() + + +class URI(object): + """Representation of URI result type.""" + + def __init__(self, data, entity_url, **kwargs): + """ + Create URI object. + + @type data: dict + """ + self.value = data.get('value') + self.entity_url = entity_url + + def getID(self): + """ + Get ID of Wikibase object identified by the URI. + + @return: ID of Wikibase object, e.g. Q1234 + """ + urllen = len(self.entity_url) + if self.value.startswith(self.entity_url): + return self.value[urllen:] + else: + return None + + def __str__(self): + return self.value + + def __repr__(self): + return '<' + self.value + '>' + + +class Literal(object): + """Representation of RDF literal result type.""" + + def __init__(self, data, **kwargs): + """ + Create Literal object. + + @type data: dict + """ + self.type = data.get('datatype') + self.language = data.get('xml:lang') + self.value = data.get('value') + + def __str__(self): + return self.value + + def __repr__(self): + if self.type: + return self.value + '^^' + self.type + if self.language: + return self.value + '@' + self.language + return self.value + + +class Bnode(object): + """Representation of blank node.""" + + def __init__(self, data, **kwargs): + """ + Create Bnode. + + @type data: dict + """ + self.value = data['value'] + + def __str__(self): + return self.value + + def __repr__(self): + return "_:" + self.value + +VALUE_TYPES = {'uri': URI, 'literal': Literal, 'bnode': Bnode} diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index 2a22663..7efef9c 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -241,6 +241,12 @@ -wikidataquery Takes a WikidataQuery query string like claim[31:12280] and works on the resulting pages.
+-sparql Takes a SPARQL SELECT query string including ?item + and works on the resulting pages. + +-sparqlendpoint Specify SPARQL endpoint URL (optional). + (Example : -sparqlendpoint:http://myserver.com/sparql) + -searchitem Takes a search string and works on Wikibase pages that contain it. Argument can be given as "-searchitem:text", where text @@ -362,6 +368,7 @@ self.subpage_max_depth = None self._site = site self._positional_arg_name = positional_arg_name + self._sparql = None
@property def site(self): @@ -820,6 +827,14 @@ if not value: value = pywikibot.input('WikidataQuery string:') gen = WikidataQueryPageGenerator(value, site=self.site) + elif arg == '-sparqlendpoint': + if not value: + value = pywikibot.input('SPARQL endpoint:') + self._sparql = value + elif arg == '-sparql': + if not value: + value = pywikibot.input('SPARQL query:') + gen = WikidataSPARQLPageGenerator(value, site=self.site, endpoint=self._sparql) elif arg == '-mysqlquery': if not value: value = pywikibot.input('Mysql query string:') @@ -2581,6 +2596,38 @@ yield pywikibot.Page(pywikibot.Link(link, site))
+def WikidataSPARQLPageGenerator(query, site=None, item_name='item', endpoint=None): + """Generate pages that result from the given SPARQL query. + + @param query: the SPARQL query string. + @param site: Site for generator results. + @type site: L{pywikibot.site.BaseSite} + + """ + from pywikibot.data import sparql + + if site is None: + site = pywikibot.Site() + repo = site.data_repository() + if endpoint is None: + endpoint = sparql.WIKIDATA + + query_object = sparql.SparqlQuery(endpoint=endpoint) + data = query_object.get_items(query, item_name=item_name) + + for item in data: + page = pywikibot.ItemPage(repo, item) + if isinstance(site, pywikibot.site.DataSite): + yield page + continue + + try: + link = page.getSitelink(site) + except pywikibot.NoPage: + continue + yield pywikibot.Page(pywikibot.Link(link, site)) + + def WikibaseSearchItemPageGenerator(text, language=None, total=None, site=None): """ Generate pages that contain the provided text. diff --git a/tests/sparql_tests.py b/tests/sparql_tests.py new file mode 100644 index 0000000..9db482c --- /dev/null +++ b/tests/sparql_tests.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +"""Test cases for the SPARQL API.""" +# +# Distributed under the terms of the MIT license. 
+# +from __future__ import absolute_import, unicode_literals + +import sys + +import pywikibot.data.sparql as sparql + +from tests.aspects import unittest, TestCase + +if sys.version_info[0] > 2: + from unittest.mock import patch +else: + from mock import patch + +# See: https://www.w3.org/TR/2013/REC-sparql11-results-json-20130321/ + +SQL_RESPONSE = """ +{ + "head" : { + "vars" : [ "cat", "d", "catLabel" ] + }, + "results" : { + "bindings" : [ { + "cat" : { + "type" : "uri", + "value" : "http://www.wikidata.org/entity/Q498787" + }, + "d" : { + "datatype" : "http://www.w3.org/2001/XMLSchema#dateTime", + "type" : "literal", + "value" : "1955-01-01T00:00:00Z" + }, + "catLabel" : { + "xml:lang" : "en", + "type" : "literal", + "value" : "Muezza" + } + }, { + "cat" : { + "type" : "uri", + "value" : "http://www.wikidata.org/entity/Q677525" + }, + "d" : { + "datatype" : "http://www.w3.org/2001/XMLSchema#dateTime", + "type" : "literal", + "value" : "2015-06-22T00:00:00Z" + }, + "catLabel" : { + "xml:lang" : "en", + "type" : "literal", + "value" : "Orangey" + } + } ] + } +} +""" + +RESPONSE_TRUE = """ +{ + "head" : { }, + "boolean" : true +} +""" + +RESPONSE_FALSE = """ +{ + "head" : { }, + "boolean" : false +} +""" + + +class TestContainer(object): + """Simple test container for return values.""" + + def __init__(self, value): + """Create container.""" + self.content = value + + +class TestSparql(TestCase): + """Test SPARQL queries.""" + + net = False + + @patch.object(sparql.http, 'fetch') + def testQuerySelect(self, mock_method): + """Test SELECT query.""" + mock_method.return_value = TestContainer(SQL_RESPONSE) + q = sparql.SparqlQuery() + res = q.select('SELECT * WHERE { ?x ?y ?z }') + self.assertIsInstance(res, list, 'Result is not a list') + self.assertEqual(len(res), 2) + + self.assertDictEqual(res[0], + {'cat': 'http://www.wikidata.org/entity/Q498787', + 'catLabel': 'Muezza', 'd': '1955-01-01T00:00:00Z'}, + 'Bad result') + self.assertDictEqual(res[1], + {'cat': 
'http://www.wikidata.org/entity/Q677525', + 'catLabel': 'Orangey', 'd': '2015-06-22T00:00:00Z'}, + 'Bad result') + + @patch.object(sparql.http, 'fetch') + def testQuerySelectFull(self, mock_method): + """Test SELECT query with full data.""" + mock_method.return_value = TestContainer(SQL_RESPONSE) + q = sparql.SparqlQuery() + res = q.select('SELECT * WHERE { ?x ?y ?z }', full_data=True) + self.assertIsInstance(res, list, 'Result is not a list') + self.assertEqual(len(res), 2) + + self.assertIsInstance(res[0]['cat'], sparql.URI, 'Wrong type for URI') + self.assertEqual(repr(res[0]['cat']), '<http://www.wikidata.org/entity/Q498787>', + 'Wrong URI representation') + self.assertEqual(res[0]['cat'].getID(), 'Q498787', 'Wrong URI ID') + + self.assertIsInstance(res[0]['catLabel'], sparql.Literal, 'Wrong type for Literal') + self.assertEqual(repr(res[0]['catLabel']), 'Muezza@en', 'Wrong literal representation') + + self.assertIsInstance(res[0]['d'], sparql.Literal, 'Wrong type for Literal') + self.assertEqual(repr(res[0]['d']), + '1955-01-01T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime', + 'Wrong URI representation') + + @patch.object(sparql.http, 'fetch') + def testGetItems(self, mock_method): + """Test item list retrieval via SPARQL.""" + mock_method.return_value = TestContainer(SQL_RESPONSE) + q = sparql.SparqlQuery() + res = q.get_items('SELECT * WHERE { ?x ?y ?z }', 'cat') + self.assertSetEqual(res, set(['Q498787', 'Q677525'])) + + @patch.object(sparql.http, 'fetch') + def testQueryAsk(self, mock_method): + """Test ASK query.""" + mock_method.return_value = TestContainer(RESPONSE_TRUE) + q = sparql.SparqlQuery() + + res = q.ask('ASK { ?x ?y ?z }') + self.assertTrue(res) + + mock_method.return_value = TestContainer(RESPONSE_FALSE) + res = q.ask('ASK { ?x ?y ?z }') + self.assertFalse(res) + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass