jenkins-bot has submitted this change and it was merged.
Change subject: Add an interface module to the WikidataQuery API.
......................................................................
Add an interface module to the WikidataQuery API.
Methods to generate the queries programmatically from either integer IDs or Pywikibot page classes like ItemPage are provided, along with basic caching of results. Tests included.
Change-Id: Id1dd2c48c65b9bfb877ec10ad1b8ea69aa00a39c
---
M pywikibot/__init__.py
A pywikibot/data/wikidataquery.py
M tests/wikibase_tests.py
A tests/wikidataquery_tests.py
4 files changed, 823 insertions(+), 2 deletions(-)
Approvals:
  Merlijn van Deen: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py index bbdab74..9778a33 100644 --- a/pywikibot/__init__.py +++ b/pywikibot/__init__.py @@ -270,13 +270,19 @@ return WbTime(long(t[0]), int(t[1]), int(t[2]), int(t[3]), int(t[4]), int(t[5]), precision, before, after, timezone, calendarmodel)
+ def toTimestr(self): + """ + Function which converts the the data to a UTC date/time string + """ + return WbTime.FORMATSTR.format(self.year, self.month, self.day, + self.hour, self.minute, self.second) + def toWikibase(self): """ Function which converts the data to a JSON object for the Wikibase API. """ - json = {'time': WbTime.FORMATSTR.format(self.year, self.month, self.day, - self.hour, self.minute, self.second), + json = {'time': self.toTimestr(), 'precision': self.precision, 'after': self.after, 'before': self.before, diff --git a/pywikibot/data/wikidataquery.py b/pywikibot/data/wikidataquery.py new file mode 100644 index 0000000..f7ea1a6 --- /dev/null +++ b/pywikibot/data/wikidataquery.py @@ -0,0 +1,540 @@ +# -*- coding: utf-8 -*- +""" +Objects representing WikidataQuery query syntax and API +""" +# +# (C) Pywikipedia bot team, 2013 +# +# Distributed under the terms of the MIT license. + +import json +import urllib2 +from pywikibot.comms import http +import pickle +import os +import hashlib +import time +import tempfile + +from pywikibot.page import ItemPage, PropertyPage, Claim +import pywikibot + + +def listify(x): + """ + If given a non-list , encapsulate in a single-element list + """ + return x if isinstance(x, list) else [x] + + +class QuerySet(): + """ + A QuerySet represents a set of queries or other query sets, joined + by operators (AND and OR). + + A QuerySet stores this information as a list of Query(Sets) and + a joiner operator to join them all together + """ + + def __init__(self, q): + """ + Initialise a query set from a Query or another QuerySet + """ + self.qs = [q] + + def addJoiner(self, args, joiner): + """ + Add to this QuerySet using the given joiner. + + If the given joiner is not the same as we used before in + this QuerySet, nest the current one in parens before joining + - this makes the implicit grouping of the API explicit. 
+ + @return a new query set representing the joining of this one and + the arguments + """ + + if len(self.qs) > 1 and joiner != self.joiner: + left = QuerySet(self) + else: + left = self + + left.joiner = joiner + + for a in listify(args): + left.qs.append(a) + + return left + + def AND(self, args): + """ + Add the given args (Queries or QuerySets) to the Query set as a + logical conjuction (AND) + """ + return self.addJoiner(args, "AND") + + def OR(self, args): + """ + Add the given args (Queries or QuerySets) to the Query set as a + logical disjunction (AND) + """ + return self.addJoiner(args, "OR") + + def __str__(self): + """ + Output as an API-ready string + """ + + def bracketIfQuerySet(q): + if isinstance(q, QuerySet) and q.joiner != self.joiner: + return "(%s)" % q + else: + return str(q) + + s = bracketIfQuerySet(self.qs[0]) + + for q in self.qs[1:]: + s += " %s %s" % (self.joiner, bracketIfQuerySet(q)) + + return s + + def __repr__(self): + return u"QuerySet(%s)" % self + + +class Query(): + """ + A query is a single query for the WikidataQuery API, for example + claim[100:60] or link[enwiki] + + Construction of a Query can throw a TypeError if you feed it bad + parameters. 
Exactly what these need to be depends on the Query + """ + + def AND(self, ands): + """ + Produce a query set ANDing this query and all the given query/sets + """ + return QuerySet(self).addJoiner(ands, "AND") + + def OR(self, ors): + """ + Produce a query set ORing this query and all the given query/sets + """ + return QuerySet(self).addJoiner(ors, "OR") + + def formatItem(self, item): + """ + Default item formatting is string, which will work for queries, + querysets, ints and strings + """ + return str(item) + + def formatList(self, l): + """ + Format and comma-join a list + """ + return ",".join([self.formatItem(x) for x in l]) + + @staticmethod + def isOrContainsOnlyTypes(items, types): + """ + Either this item is one of the given types, or it is a list of + only those types + """ + if isinstance(items, list): + for x in items: + found = False + for typ in listify(types): + if isinstance(x, typ): + found = True + break + + if not found: + return False + else: + for typ in listify(types): + found = False + if isinstance(items, typ): + found = True + break + + if not found: + return False + + return True + + def validate(self): + """ + Default validate result is a pass - subclasses need to implement + this if they want to check their parameters + """ + return True + + def validateOrRaise(self, msg=None): + if not self.validate(): + raise(TypeError, msg) + + def convertWDType(self, item): + """ + Convert WD items like ItemPage or PropertyPage into integer IDs + for use in query strings. + + @param item A single item. 
One of ItemPages, PropertyPages, int + or anything that can be fed to int() + + @return the int ID of the item + """ + if isinstance(item, ItemPage) or isinstance(item, PropertyPage): + return item.getID(numeric=True) + else: + return int(item) + + def convertWDTypes(self, items): + return [self.convertWDType(x) for x in listify(items)] + + def __str__(self): + """ + The __str__ method is critical, as this is what generates + the string to be passed to the API + """ + raise NotImplemented + + def __repr__(self): + return u"Query(%s)" % self + + +class HasClaim(Query): + """ + This is a Query of the form "claim[prop:val]". It is subclassed by + the other similar forms like noclaim and string + """ + + queryType = "claim" + + def __init__(self, prop, items=[]): + self.prop = self.convertWDType(prop) + + if isinstance(items, Tree): + self.items = items + elif isinstance(self, StringClaim): + self.items = listify(items) + else: + self.items = self.convertWDTypes(items) + + self.validateOrRaise() + + def formatItems(self): + res = '' + if self.items: + res += ":" + ",".join([self.formatItem(x) for x in self.items]) + + return res + + def validate(self): + return self.isOrContainsOnlyTypes(self.items, [int, Tree]) + + def __str__(self): + if isinstance(self.items, list): + return "%s[%s%s]" % (self.queryType, self.prop, self.formatItems()) + elif isinstance(self.items, Tree): # maybe Query? + return "%s[%s:(%s)]" % (self.queryType, self.prop, self.items) + + +class NoClaim(HasClaim): + queryType = "noclaim" + + +class StringClaim(HasClaim): + """ + Query of the form string[PROPERTY:"STRING",...] 
+ """ + queryType = "string" + + def formatItem(self, x): + """ + Strings need quote-wrapping + """ + return '"%s"' % x + + def validate(self): + return self.isOrContainsOnlyTypes(self.items, str) + + +class Tree(Query): + """ + Query of the form tree[ITEM,...][PROPERTY,...]<PROPERTY,...> + """ + queryType = "tree" + + def __init__(self, item, forward=[], reverse=[]): + """ + @param item The root item + @param forward List of forward properties, can be empty + @param reverse List of reverse properties, can be empty + """ + + # check sensible things coming in, as we lose info once we do + # type conversion + if not self.isOrContainsOnlyTypes(item, [int, ItemPage]): + raise(TypeError, "The item paramter must contain or be integer IDs or page.ItemPages") + elif (not self.isOrContainsOnlyTypes(forward, [int, PropertyPage]) + or not self.isOrContainsOnlyTypes(reverse, [int, PropertyPage])): + raise(TypeError, "The forward and reverse parameters must contain or be integer IDs or page.PropertyPages") + + self.item = self.convertWDTypes(item) + self.forward = self.convertWDTypes(forward) + self.reverse = self.convertWDTypes(reverse) + + self.validateOrRaise() + + def validate(self): + return (self.isOrContainsOnlyTypes(self.item, int) and + self.isOrContainsOnlyTypes(self.forward, int) and + self.isOrContainsOnlyTypes(self.reverse, int)) + + def __str__(self): + return "%s[%s][%s][%s]" % (self.queryType, self.formatList(self.item), + self.formatList(self.forward), + self.formatList(self.reverse)) + + +class Around(Query): + """ + A query in the form around[PROPERTY,LATITUDE,LONGITUDE,RADIUS] + """ + queryType = "around" + + def __init__(self, prop, coord, rad): + self.prop = self.convertWDType(prop) + self.lt = coord.lat + self.lg = coord.lon + self.rad = rad + + def validate(self): + return isinstance(self.prop, int) + + def __str__(self): + return "%s[%s,%s,%s,%s]" % (self.queryType, self.prop, + self.lt, self.lg, self.rad) + + +class Between(Query): + """ + A query in 
the form between[PROP, BEGIN, END] + + You have to give prop and one of begin or end. Note that times have + to be in UTC, timezones are not supported by the API + + @param prop the property + @param begin WbTime object representign the beginning of the period + @param end WbTime object representing the end of the period + """ + queryType = "between" + + def __init__(self, prop, begin=None, end=None): + self.prop = self.convertWDType(prop) + self.begin = begin + self.end = end + + def validate(self): + return (self.begin or self.end) and isinstance(self.prop, int) + + def __str__(self): + begin = self.begin.toTimestr() if self.begin else '' + + # if you don't have an end, you don't put in the comma + end = ',' + self.end.toTimestr() if self.end else '' + + return "%s[%s,%s%s]" % (self.queryType, self.prop, begin, end) + + +class Link(Query): + """ + A query in the form link[LINK,...], which also includes nolink + + All link elements have to be strings, or validation will throw + """ + + queryType = "link" + + def __init__(self, link): + self.link = listify(link) + self.validateOrRaise() + + def validate(self): + return self.isOrContainsOnlyTypes(self.link, str) + + def __str__(self): + return "%s[%s]" % (self.queryType, self.formatList(self.link)) + + +class NoLink(Link): + queryType = "nolink" + + +def fromClaim(claim): + """ + Construct from a pywikibot.page Claim object + """ + + if not isinstance(claim, Claim): + raise(TypeError, "claim must be a page.Claim") + + if claim.getType() == 'wikibase-item': + return HasClaim(claim.getID(numeric=True), claim.getTarget().getID(numeric=True)) + if claim.getType() == 'string': + return StringClaim(claim.getID(numeric=True), claim.getTarget()) + else: + raise(TypeError, "Cannot construct a query from a claim of type %s" + % claim.getType()) + + +class WikidataQuery(): + """ + An interface to the WikidatQuery API. Default host is + wikidataquery.eu (http://208.80.153.172), but you can substitute + a different one. 
+ + Caching defaults to a subdir of the system temp directory with a + 1 hour max cache age. + + Set a zero or negative maxCacheAge to disable caching + """ + + def __init__(self, host="http://208.80.153.172", cacheDir=None, + cacheMaxAge=60): + self.host = host + self.cacheMaxAge = cacheMaxAge + + if cacheDir: + self.cacheDir = cacheDir + else: + self.cacheDir = os.path.join(tempfile.gettempdir(), + "wikidataquery_cache") + + def getUrl(self, queryStr): + return "%s/api?%s" % (self.host, queryStr) + + def getQueryString(self, q, labels=[], props=[]): + """ + Get the query string for a given query or queryset + @return query string including lables and props + """ + qStr = "q=%s" % urllib2.quote(str(q)) + + if labels: + qStr += "&labels=%s" % ','.join(labels) + + if props: + qStr += "&props=%s" % ','.join(props) + + return qStr + + def getCacheFilename(self, queryStr): + """ + Encode a query into a unique and universally safe format + """ + encQuery = hashlib.sha1(queryStr).hexdigest() + ".wdq_cache" + return os.path.join(self.cacheDir, encQuery) + + def readFromCache(self, queryStr): + """ + Check if we have cached this data recently enough, read it + if we have. Returns None if the data is not there or if it is + too old + """ + + if self.cacheMaxAge <= 0: + return None + + cacheFile = self.getCacheFilename(queryStr) + + if os.path.isfile(cacheFile): + mtime = os.path.getmtime(cacheFile) + now = time.time() + + if ((now - mtime) / 60) < self.cacheMaxAge: + + try: + data = pickle.load(open(cacheFile, 'r')) + except pickle.UnpicklingError: + pywikibot.warning(u"Couldn't read cached data from %s" + % cacheFile) + data = None + + return data + + return None + + def saveToCache(self, q, data): + """ + Save data from a query to a cache file, if enabled + @ returns nothing + """ + + if self.cacheMaxAge <= 0: + return + + # we have to use our own query string, as otherwise we may + # be able to find the cache file again if there are e.g. 
+ # whitespace differences + cacheFile = self.getCacheFilename(q) + + if os.path.exists(cacheFile) and not os.path.isfile(cacheFile): + return + + if not os.path.exists(self.cacheDir): + os.makedirs(self.cacheDir) + + try: + pickle.dump(data, open(cacheFile, 'w')) + except IOError: + pywikibot.warning(u"Failed to write cache file %s" % cacheFile) + + def getDataFromHost(self, queryStr): + """ + Go and fetch a query from the host's API + """ + url = self.getUrl(queryStr) + + try: + resp = http.request(None, url) + except: + pywikibot.warning(u"Failed to retrieve %s" % url) + raise + + try: + data = json.loads(resp) + except ValueError: + pywikibot.warning(u"Data received from host but no JSON could be decoded") + raise pywikibot.ServerError + + return data + + def query(self, q, labels=[], props=[]): + """ + Actually run a query over the API + @return Python dict of the interpreted JSON or None on failure + """ + + fullQueryString = self.getQueryString(q, labels, props) + + #try to get cached data first + data = self.readFromCache(fullQueryString) + + if data: + return data + + # the cached data must not be OK, go and get real data from the + # host's API + data = self.getDataFromHost(fullQueryString) + + # no JSON found + if not data: + return None + + #cache data for next time + self.saveToCache(fullQueryString, data) + + return data diff --git a/tests/wikibase_tests.py b/tests/wikibase_tests.py index eb653d8..43116fa 100644 --- a/tests/wikibase_tests.py +++ b/tests/wikibase_tests.py @@ -45,6 +45,10 @@ claim.setTarget(pywikibot.ItemPage(repo, 'q1')) self.assertEqual(claim._formatDataValue(), {'entity-type': 'item', 'numeric-id': 1})
+ # test WbTime + t = pywikibot.WbTime(year=2010, hour=12, minute=43) + self.assertEqual(t.toTimestr(), '+00000002010-01-01T12:43:00Z') + # test WikibasePage.__cmp__ self.assertEqual(pywikibot.ItemPage.fromPage(mainpage), pywikibot.ItemPage(repo, 'q5296'))
diff --git a/tests/wikidataquery_tests.py b/tests/wikidataquery_tests.py new file mode 100644 index 0000000..d8cfb66 --- /dev/null +++ b/tests/wikidataquery_tests.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- +""" +Test cases for the WikidataQuery query syntax and API +""" +# +# (C) Pywikipedia bot team, 2013 +# +# Distributed under the terms of the MIT license. + + +import pywikibot.data.wikidataquery as query +from utils import unittest + +import pywikibot +from pywikibot.page import ItemPage, PropertyPage, Claim + +import os +import time + + +class TestApiFunctions(unittest.TestCase): + + def testQueries(self): + """ + Test that we produce the expected query strings and that + invalid inputs are rejected correctly + """ + + q = query.HasClaim(99) + self.assertEqual(str(q), "claim[99]") + + q = query.HasClaim(99, 100) + self.assertEqual(str(q), "claim[99:100]") + + q = query.HasClaim(99, [100]) + self.assertEqual(str(q), "claim[99:100]") + + q = query.HasClaim(99, [100, 101]) + self.assertEqual(str(q), "claim[99:100,101]") + + q = query.NoClaim(99, [100, 101]) + self.assertEqual(str(q), "noclaim[99:100,101]") + + q = query.StringClaim(99, "Hello") + self.assertEqual(str(q), 'string[99:"Hello"]') + + q = query.StringClaim(99, ["Hello"]) + self.assertEqual(str(q), 'string[99:"Hello"]') + + q = query.StringClaim(99, ["Hello", "world"]) + self.assertEqual(str(q), 'string[99:"Hello","world"]') + + self.assertRaises(TypeError, lambda: query.StringClaim(99, 2)) + + q = query.Tree(92, [1], 2) + self.assertEqual(str(q), 'tree[92][1][2]') + + #missing third arg + q = query.Tree(92, 1) + self.assertEqual(str(q), 'tree[92][1][]') + + #missing second arg + q = query.Tree(92, reverse=3) + self.assertEqual(str(q), 'tree[92][][3]') + + q = query.Tree([92, 93], 1, [2, 7]) + self.assertEqual(str(q), 'tree[92,93][1][2,7]') + + #bad tree arg types + self.assertRaises(TypeError, lambda: query.Tree(99, "hello")) + + q = query.Link("enwiki") + self.assertEqual(str(q), 'link[enwiki]') + 
+ q = query.NoLink(["enwiki", "frwiki"]) + self.assertEqual(str(q), 'nolink[enwiki,frwiki]') + + #bad link arg types + self.assertRaises(TypeError, lambda: query.Link(99)) + self.assertRaises(TypeError, lambda: query.Link([99])) + + #HasClaim with tree as arg + q = query.HasClaim(99, query.Tree(1, 2, 3)) + self.assertEqual(str(q), "claim[99:(tree[1][2][3])]") + + q = query.HasClaim(99, query.Tree(1, [2, 5], [3, 90])) + self.assertEqual(str(q), "claim[99:(tree[1][2,5][3,90])]") + + def testQueriesWDStructures(self): + """ + Queries using Wikibase page structures like ItemPage + """ + repo = pywikibot.Site('wikidata', 'wikidata').data_repository() + + q = query.HasClaim(PropertyPage(repo, "P99")) + self.assertEqual(str(q), "claim[99]") + + q = query.HasClaim(PropertyPage(repo, "P99"), + ItemPage(repo, "Q100")) + self.assertEqual(str(q), "claim[99:100]") + + q = query.HasClaim(99, [100, PropertyPage(repo, "P101")]) + self.assertEqual(str(q), "claim[99:100,101]") + + q = query.StringClaim(PropertyPage(repo, "P99"), + "Hello") + self.assertEqual(str(q), 'string[99:"Hello"]') + + q = query.Tree(ItemPage(repo, "Q92"), [1], 2) + self.assertEqual(str(q), 'tree[92][1][2]') + + q = query.Tree(ItemPage(repo, "Q92"), [PropertyPage(repo, "P101")], 2) + self.assertEqual(str(q), 'tree[92][101][2]') + + self.assertRaises(TypeError, lambda: query.Tree(PropertyPage(repo, "P92"), + [PropertyPage(repo, "P101")], + 2)) + + c = pywikibot.Coordinate(50, 60) + q = query.Around(PropertyPage(repo, "P625"), c, 23.4) + self.assertEqual(str(q), 'around[625,50,60,23.4]') + + begin = pywikibot.WbTime(year=1999) + end = pywikibot.WbTime(year=2010, hour=1) + + #note no second comma + q = query.Between(PropertyPage(repo, "P569"), begin) + self.assertEqual(str(q), 'between[569,+00000001999-01-01T00:00:00Z]') + + q = query.Between(PropertyPage(repo, "P569"), end=end) + self.assertEqual(str(q), 'between[569,,+00000002010-01-01T01:00:00Z]') + + q = query.Between(569, begin, end) + 
self.assertEqual(str(q), 'between[569,+00000001999-01-01T00:00:00Z,+00000002010-01-01T01:00:00Z]') + + # try negative year + begin = pywikibot.WbTime(year=-44) + q = query.Between(569, begin, end) + self.assertEqual(str(q), 'between[569,-00000000044-01-01T00:00:00Z,+00000002010-01-01T01:00:00Z]') + + def testQueriesDirectFromClaim(self): + """ + Test construction of the the right Query from a page.Claim + """ + + repo = pywikibot.Site('wikidata', 'wikidata').data_repository() + + claim = Claim(repo, 'P17') + claim.setTarget(pywikibot.ItemPage(repo, 'Q35')) + + q = query.fromClaim(claim) + self.assertEqual(str(q), 'claim[17:35]') + + claim = Claim(repo, 'P268') + claim.setTarget('somestring') + + q = query.fromClaim(claim) + self.assertEqual(str(q), 'string[268:"somestring"]') + + def testQuerySets(self): + """ + Test that we can join queries together correctly + """ + + # construct via queries + qs = query.HasClaim(99, 100).AND(query.HasClaim(99, 101)) + + self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101]') + + self.assertEqual(repr(qs), 'QuerySet(claim[99:100] AND claim[99:101])') + + qs = query.HasClaim(99, 100).AND(query.HasClaim(99, 101)).AND(query.HasClaim(95)) + + self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101] AND claim[95]') + + # construct via queries + qs = query.HasClaim(99, 100).AND([query.HasClaim(99, 101), query.HasClaim(95)]) + + self.assertEqual(str(qs), 'claim[99:100] AND claim[99:101] AND claim[95]') + + qs = query.HasClaim(99, 100).OR([query.HasClaim(99, 101), query.HasClaim(95)]) + + self.assertEqual(str(qs), 'claim[99:100] OR claim[99:101] OR claim[95]') + + q1 = query.HasClaim(99, 100) + q2 = query.HasClaim(99, 101) + + #different joiners get explicit grouping parens (the api also allows implicit, but we don't do that) + qs1 = q1.AND(q2) + qs2 = q1.OR(qs1).AND(query.HasClaim(98)) + + self.assertEqual(str(qs2), '(claim[99:100] OR (claim[99:100] AND claim[99:101])) AND claim[98]') + + #if the joiners are the same, no need to 
group + qs1 = q1.AND(q2) + qs2 = q1.AND(qs1).AND(query.HasClaim(98)) + + self.assertEqual(str(qs2), 'claim[99:100] AND claim[99:100] AND claim[99:101] AND claim[98]') + + qs1 = query.HasClaim(100).AND(query.HasClaim(101)) + qs2 = qs1.OR(query.HasClaim(102)) + + self.assertEqual(str(qs2), '(claim[100] AND claim[101]) OR claim[102]') + + qs = query.Link("enwiki").AND(query.NoLink("dewiki")) + + self.assertEqual(str(qs), 'link[enwiki] AND nolink[dewiki]') + + def testQueryApiSyntax(self): + """ + Test that we can generate the API query correctly + """ + + w = query.WikidataQuery("http://example.com") + + qs = w.getQueryString(query.Link("enwiki")) + self.assertEqual(qs, "q=link%5Benwiki%5D") + + self.assertEqual(w.getUrl(qs), "http://example.com/api?q=link%5Benwiki%5D") + + #check labels and props work OK + qs = w.getQueryString(query.Link("enwiki"), ['en', 'fr'], ['prop']) + self.assertEqual(qs, "q=link%5Benwiki%5D&labels=en,fr&props=prop") + + +class TestApiSlowFunctions(unittest.TestCase): + def testQueryApiGetter(self): + """ + Test that we can actually retreive data and that caching works + """ + + w = query.WikidataQuery(cacheMaxAge=0) + + #this query doesn't return any items, save a bit of bandwidth! + q = query.HasClaim(105).AND([query.NoClaim(225), query.HasClaim(100)]) + + #check that the cache file is created + cacheFile = w.getCacheFilename(w.getQueryString(q, [], [])) + + # remove existing cache file + try: + os.remove(cacheFile) + except OSError: + pass + + data = w.query(q) + + self.assertFalse(os.path.exists(cacheFile)) + + w = query.WikidataQuery(cacheMaxAge=0.1) + + data = w.query(q) + + self.assertTrue(os.path.exists(cacheFile)) + + self.assertTrue('status' in data) + self.assertTrue('items' in data) + + t1 = time.time() + data = w.query(q) + t2 = time.time() + + # check that the cache access is fast + self.assertTrue(t2 - t1 < 0.2) + + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass