jenkins-bot has submitted this change and it was merged.
Change subject: WikidataSPARQLPageGenerator should not always remove duplicates
......................................................................
WikidataSPARQLPageGenerator should not always remove duplicates
get_items builds a set from the results in order to avoid duplicates. This has the unfortunate side effect of discarding the ordering of the results of every query, including those with an ORDER BY clause.
The result_type argument is added so that the caller can replace the default set with a list, which preserves both the order of the results and any duplicates.
Bug: T141348
Change-Id: I19873439e82f1a74797d1d2b2ac253086b38721e
---
M pywikibot/data/sparql.py
M pywikibot/pagegenerators.py
M tests/sparql_tests.py
3 files changed, 45 insertions(+), 17 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/data/sparql.py b/pywikibot/data/sparql.py index 9e16d4d..dd09b21 100644 --- a/pywikibot/data/sparql.py +++ b/pywikibot/data/sparql.py @@ -103,21 +103,24 @@ data = self.query(query, headers=headers) return data['boolean']
- def get_items(self, query, item_name='item'): + def get_items(self, query, item_name='item', result_type=set): """ - Retrieve set of items which satisfy given query. + Retrieve items which satisfy given query.
Items are returned as Wikibase IDs.
@param query: Query string. Must contain ?{item_name} as one of the projected values. @param item_name: Name of the value to extract - @return: Set of item ids, e.g. Q1234 - @rtype: set + @param result_type: type of the iterable in which + SPARQL results are stored (default set) + @type result_type: iterable + @return: item ids, e.g. Q1234 + @rtype: same as result_type """ res = self.select(query, full_data=True) if res: - return set([r[item_name].getID() for r in res]) - return set() + return result_type(r[item_name].getID() for r in res) + return result_type()
class URI(object): diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index a79a6bc..6ce7417 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -2665,12 +2665,17 @@ return WikidataPageFromItemGenerator(items_pages, site)
-def WikidataSPARQLPageGenerator(query, site=None, item_name='item', endpoint=None): +def WikidataSPARQLPageGenerator(query, site=None, + item_name='item', endpoint=None, + result_type=set): """Generate pages that result from the given SPARQL query.
@param query: the SPARQL query string. @param site: Site for generator results. @type site: L{pywikibot.site.BaseSite} + @param result_type: type of the iterable in which + SPARQL results are stored (default set) + @type result_type: iterable
""" from pywikibot.data import sparql @@ -2682,7 +2687,9 @@ endpoint = sparql.WIKIDATA
query_object = sparql.SparqlQuery(endpoint=endpoint) - data = query_object.get_items(query, item_name=item_name) + data = query_object.get_items(query, + item_name=item_name, + result_type=result_type) items_pages = (pywikibot.ItemPage(repo, item) for item in data) if isinstance(site, pywikibot.site.DataSite): return items_pages diff --git a/tests/sparql_tests.py b/tests/sparql_tests.py index d86368e..26a0b7b 100644 --- a/tests/sparql_tests.py +++ b/tests/sparql_tests.py @@ -18,13 +18,21 @@
# See: https://www.w3.org/TR/2013/REC-sparql11-results-json-20130321/
-SQL_RESPONSE = """ +SQL_RESPONSE_CONTAINER = """ { "head" : { "vars" : [ "cat", "d", "catLabel" ] }, "results" : { - "bindings" : [ { + "bindings" : [ + %s + ] + } +} +""" + +ITEM_Q498787 = """ + { "cat" : { "type" : "uri", "value" : "http://www.wikidata.org/entity/Q498787" @@ -39,7 +47,11 @@ "type" : "literal", "value" : "Muezza" } - }, { + } +""" + +ITEM_Q677525 = """ + { "cat" : { "type" : "uri", "value" : "http://www.wikidata.org/entity/Q677525" @@ -54,9 +66,7 @@ "type" : "literal", "value" : "Orangey" } - } ] - } -} + } """
RESPONSE_TRUE = """ @@ -90,7 +100,8 @@ @patch.object(sparql.http, 'fetch') def testQuerySelect(self, mock_method): """Test SELECT query.""" - mock_method.return_value = Container(SQL_RESPONSE) + mock_method.return_value = Container( + SQL_RESPONSE_CONTAINER % ("%s, %s" % (ITEM_Q498787, ITEM_Q677525))) q = sparql.SparqlQuery() res = q.select('SELECT * WHERE { ?x ?y ?z }') self.assertIsInstance(res, list, 'Result is not a list') @@ -108,7 +119,8 @@ @patch.object(sparql.http, 'fetch') def testQuerySelectFull(self, mock_method): """Test SELECT query with full data.""" - mock_method.return_value = Container(SQL_RESPONSE) + mock_method.return_value = Container( + SQL_RESPONSE_CONTAINER % ("%s, %s" % (ITEM_Q498787, ITEM_Q677525))) q = sparql.SparqlQuery() res = q.select('SELECT * WHERE { ?x ?y ?z }', full_data=True) self.assertIsInstance(res, list, 'Result is not a list') @@ -130,10 +142,16 @@ @patch.object(sparql.http, 'fetch') def testGetItems(self, mock_method): """Test item list retrieval via SPARQL.""" - mock_method.return_value = Container(SQL_RESPONSE) + mock_method.return_value = Container( + SQL_RESPONSE_CONTAINER % ("%s, %s, %s" % (ITEM_Q498787, + ITEM_Q677525, + ITEM_Q677525))) q = sparql.SparqlQuery() res = q.get_items('SELECT * WHERE { ?x ?y ?z }', 'cat') self.assertSetEqual(res, set(['Q498787', 'Q677525'])) + res = q.get_items('SELECT * WHERE { ?x ?y ?z }', 'cat', + result_type=list) + self.assertEqual(res, ['Q498787', 'Q677525', 'Q677525'])
@patch.object(sparql.http, 'fetch') def testQueryAsk(self, mock_method):