Revision: 5345 Author: russblau Date: 2008-05-09 19:59:52 +0000 (Fri, 09 May 2008)
Log Message: ----------- major refactoring of api generator classes.
Modified Paths: -------------- branches/rewrite/pywikibot/data/api.py branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-05-09 17:08:13 UTC (rev 5344) +++ branches/rewrite/pywikibot/data/api.py 2008-05-09 19:59:52 UTC (rev 5345) @@ -24,7 +24,9 @@
lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
+_modules = {} # cache for retrieved API parameter information
+ class APIError(pywikibot.Error): """The wiki site returned an error message.""" def __init__(self, code, info, **kwargs): @@ -233,7 +235,136 @@
#TODO - refactor all these generator classes into a parent/subclass hierarchy
-class PageGenerator(object): +class QueryGenerator(object): + """Base class for iterators that handle responses to API action=query. + + By default, the iterator will iterate each item in the query response, + and use the query-continue element, if present, to continue iterating as + long as the wiki returns additional values. However, if the iterator's + limit attribute is set to a positive int, the iterator will stop after + iterating that many values. + + """ + def __init__(self, **kwargs): + """ + Constructor: kwargs are used to create a Request object; + see that object's documentation for values. 'action'='query' is + assumed. + + """ + global data + if "action" in kwargs and "action" != "query": + raise Error("%s: 'action' must be 'query', not %s" + % (self.__class__.__name__, kwargs["query"])) + else: + kwargs["action"] = "query" + self.site = kwargs.get("site", pywikibot.Site()) + # make sure request type is valid, and get limit key if any + if "generator" in kwargs: + self.module = kwargs["generator"] + elif "list" in kwargs: + self.module = kwargs["list"] + elif "prop" in kwargs: + self.module = kwargs["prop"] + else: + raise Error("%s: No query module name found in arguments." + % self.__class__.__name__) + for name in self.module.split("|"): + if name not in _modules: + self.get_module() + break + self.set_limit() + if self.query_limit is not None and "generator" in kwargs: + self.prefix = "g" + self.prefix + self.request = Request(**kwargs) + self.limit = None + self.resultkey = self.module # this is the name of the "query" + # subelement to look for when iterating + + def get_module(self): + """Query api on self.site for paraminfo on querymodule=self.module""" + + paramreq = Request(site=self.site, action="paraminfo", + querymodules=self.module) + data = paramreq.submit() + assert "paraminfo" in data + assert "querymodules" in data["paraminfo"] + assert len(data["paraminfo"]["querymodules"]) == 1+self.module.count("|") + for paraminfo in data["paraminfo"]["querymodules"]: + assert paraminfo["name"] in self.module + if "missing" in paraminfo: + raise Error("Invalid query module name '%s'." % self.module) + _modules[paraminfo["name"]] = paraminfo + + def set_limit(self): + """Set query_limit for self.module based on api response""" + + self.query_limit = None + for mod in self.module.split('|'): + for param in _modules[mod].get("parameters", []): + if param["name"] == "limit": + if (self.site.logged_in() + and "apihighlimits" in + self.site.getuserinfo()["rights"]): + self.query_limit = int(param["highmax"]) + else: + self.query_limit = int(param["max"]) + self.prefix = _modules[mod]["prefix"] + logging.debug("%s: Set query_limit to %i." + % (self.__class__.__name__, self.query_limit)) + return + + def __iter__(self): + """Submit request and iterate the response based on self.resultkey + + Continues response as needed until limit (if any) is reached. + + """ + count = 0 + while True: + if self.query_limit is not None and "revisions" not in self.module: + if self.limit is not None: + new_limit = min(self.query_limit, self.limit - count) + else: + new_limit = self.query_limit + self.request[self.prefix+"limit"] = str(new_limit) + self.data = self.request.submit() + if not self.data or not isinstance(self.data, dict): + logging.debug( + "%s: stopped iteration because no dict retrieved from api." + % self.__class__.__name__) + return + if not ("query" in self.data + and self.resultkey in self.data["query"]): + logging.debug( + "%s: stopped iteration because 'query' and result keys not found in api response." + % self.__class__.__name__) + logging.debug(self.data) + return + pagedata = self.data["query"][self.resultkey] + if isinstance(pagedata, dict): + pagedata = pagedata.values() + # for generators, this yields the pages in order of + # their pageids, not their titles.... FIXME? + + for item in pagedata: + yield self.result(item) + count += 1 + if self.limit is not None and count >= self.limit: + return + if not "query-continue" in self.data: + return + if not self.module in self.data["query-continue"]: + raise Error("Missing '%s' key in ['query-continue'] value." + % self.module) + self.request.update(self.data["query-continue"][self.module]) + + def result(self, data): + """Process result data as needed for particular subclass.""" + return data + + +class PageGenerator(QueryGenerator): """Iterator for response to a request of type action=query&generator=foo.""" def __init__(self, generator, **kwargs): """ @@ -244,15 +375,8 @@ @type generator: str
""" - if generator not in self.limits: - raise ValueError("Unrecognized generator '%s'" % generator) - self.request = Request(action="query", generator=generator, **kwargs) - # set limit to max, if applicable - # FIXME: need to distinguish between the "limit" per API request and an - # overall limit on the number of pages to be iterated - if self.limits[generator]: - limitkey = 'g' + self.limits[generator] - self.request.setdefault(limitkey, "max") + QueryGenerator.__init__(self, generator=generator, **kwargs) + # get some basic information about every page generated if 'prop' in self.request: self.request['prop'] += "|info|imageinfo" else: @@ -266,61 +390,8 @@ self.request["iiprop"] += 'timestamp|user|comment|url|size|sha1|metadata' else: self.request['iiprop'] = 'timestamp|user|comment|url|size|sha1|metadata' - self.generator = generator - self.site = self.request.site self.resultkey = "pages" # element to look for in result
- # dict mapping generator types to their limit parameter names - limits = {'links': None, - 'images': None, - 'templates': None, - 'categories': None, - 'allpages': 'aplimit', - 'alllinks': 'allimit', - 'allcategories': 'aclimit', - 'allimages': 'ailimit', - 'backlinks': 'bllimit', - 'categorymembers': 'cmlimit', - 'embeddedin': 'eilimit', - 'imageusage': 'iulimit', - 'search': 'srlimit', - 'watchlist': 'wllimit', - 'exturlusage': 'eulimit', - 'random': 'rnlimit', - } - - def __iter__(self): - """Iterate objects for elements found in response.""" - # FIXME: this won't handle generators with <redirlinks> subelements - # correctly yet - while True: - self.data = self.request.submit() - if not self.data or not isinstance(self.data, dict): - raise StopIteration - if not "query" in self.data: - raise StopIteration - query = self.data["query"] - if not self.resultkey in query: - raise StopIteration - if isinstance(query[self.resultkey], dict): - for v in query[self.resultkey].itervalues(): - yield self.result(v) - elif isinstance(query[self.resultkey], list): - for v in query[self.resultkey]: - yield self.result(v) - else: - raise APIError("Unknown", - "Unknown format in ['query']['%s'] value." - % self.resultkey, - data=query[self.resultkey]) - if not "query-continue" in self.data: - return - if not self.generator in self.data["query-continue"]: - raise APIError("Unknown", - "Missing '%s' key in ['query-continue'] value.", - data=self.data["query-continue"]) - self.request.update(self.data["query-continue"][self.generator]) - def result(self, pagedata): """Convert page dict entry from api to Page object.
@@ -352,7 +423,7 @@ return image
-class PropertyGenerator(object): +class PropertyGenerator(QueryGenerator): """Generator for queries of type action=query&property=...
Note that this generator yields one or more dict object(s) corresponding @@ -369,55 +440,11 @@ @type prop: str
""" - if isinstance(prop, basestring): - prop = prop.split("|") - for p in prop: - if p not in self.limits: - raise ValueError("Unrecognized property '%s'" % p) - self.request = Request(action="query", prop="|".join(prop)) - # set limit to max, if applicable - for p in prop: - if self.limits[p] and kwargs.pop("getAll", False): - self.request['g'+self.limits[generator]] = "max" - self.request.params.update(kwargs) - self.site = self.request.site - self.resultkey = prop + QueryGenerator.__init__(self, prop=prop, **kwargs) + self.resultkey = "pages"
- # dict mapping property types to their limit parameter names - limits = {'revisions': 'rvlimit', - 'imageinfo': 'iilimit', - 'info': None, - 'links': None, - 'langlinks': None, - 'images': None, - 'imageinfo': None, - 'templates': None, - 'categories': None, - 'extlinks': None, - }
- def __iter__(self): - """Iterate objects for elements found in response.""" - # this looks for the resultkey ''inside'' a <page> entry - while True: - self.data = self.request.submit() - if not self.data or not isinstance(self.data, dict): - raise StopIteration - if not ("query" in self.data and "pages" in self.data["query"]): - raise StopIteration - pagedata = self.data["query"]["pages"].values() - for item in pagedata: - yield item - if not "query-continue" in self.data: - return - if not self.resultkey in self.data["query-continue"]: - raise APIError("Unknown", - "Missing '%s' key in ['query-continue'] value.", - data=self.data["query-continue"]) - self.request.update(self.data["query-continue"][self.resultkey]) - - -class ListGenerator(object): +class ListGenerator(QueryGenerator): """Iterator for queries with action=query&list=... parameters"""
def __init__(self, listaction, **kwargs): @@ -429,64 +456,9 @@ @type listaction: str
""" - if listaction not in self.limits: - raise ValueError("Unrecognized list type '%s'" % listaction) - self.request = Request(action="query", list=listaction, **kwargs) - # set limit to max, if applicable - # FIXME: need to distinguish between the "limit" per API request and an - # overall limit on the number of pages to be iterated - if self.limits[listaction]: - limitkey = self.limits[listaction] - self.request.setdefault(limitkey, "max") - self.resultkey = listaction - self.site = self.request.site + QueryGenerator.__init__(self, list=listaction, **kwargs)
- # dict mapping generator types to their limit parameter names - - limits = {'allpages': 'aplimit', - 'alllinks': 'allimit', - 'allcategories': 'aclimit', - 'allusers': 'aulimit', - 'allimages': 'ailimit', - 'backlinks': 'bllimit', - 'blocks': 'bklimit', - 'categorymembers': 'cmlimit', - 'embeddedin': 'eilimit', - 'exturlusage': 'eulimit', - 'imageusage': 'iulimit', - 'logevents': 'lelimit', - 'recentchanges': 'rclimit', - 'search': 'srlimit', - 'usercontribs': 'uclimit', - 'watchlist': 'wllimit', - 'deletedrevs': 'drlimit', - 'users': None, - 'random': 'rnlimit', - }
- def __iter__(self): - """Iterate objects for elements found in response.""" - # this looks for the resultkey in the 'query' element - while True: - self.data = self.request.submit() - if not self.data or not isinstance(self.data, dict): - raise StopIteration - if not ("query" in self.data - and self.resultkey in self.data["query"]): - raise StopIteration - resultdata = self.data["query"][self.resultkey] - assert isinstance(resultdata, list) - for item in resultdata: - yield item - if not "query-continue" in self.data: - return - if not self.resultkey in self.data["query-continue"]: - raise APIError("Unknown", - "Missing '%s' key in ['query-continue'] value.", - data=self.data["query-continue"]) - self.request.update(self.data["query-continue"][self.resultkey]) - - class LoginManager(login.LoginManager): """Supplies getCookie() method to use API interface.""" def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-05-09 17:08:13 UTC (rev 5344) +++ branches/rewrite/pywikibot/site.py 2008-05-09 19:59:52 UTC (rev 5345) @@ -615,6 +615,7 @@ % (len(cache), self) ) for pagedata in rvgen: +# logging.debug("Preloading %s" % pagedata) try: if pagedata['title'] not in cache: raise Error( @@ -754,25 +755,22 @@ however, that the iterated values are always Page objects, even if in the Category or Image namespace. @type namespaces: list of ints - @param batch: the number of pages to fetch each time. - @type batch: int + @param limit: maximum number of pages to iterate (default: all) + @type limit: int
""" if category.namespace() != 14: - raise ValueError( - u"Cannot get category members of non-Category page '%s'" + raise Error( + u"categorymembers: non-Category page '%s' specified" % category.title()) cmtitle = category.title(withSection=False).encode(self.encoding()) - cmgen = api.PageGenerator(u"categorymembers", gcmtitle=cmtitle, + cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle, gcmprop="ids|title|sortkey", site=self) if namespaces is not None: - cmgen.request[u"gcmnamespace"] = u"|".join(unicode(ns) + cmgen.request["gcmnamespace"] = u"|".join(str(ns) for ns in namespaces) - if batch is not None: - if batch > 5000: - logging.debug("No more than 5000 rows can be fetched at once.") - batch=5000 - cmgen.request[u'cmlimit'] = str(batch) + if isinstance(limit, int): + cmgen.limit = limit return cmgen
def loadrevisions(self, page=None, getText=False, revids=None, @@ -863,8 +861,8 @@ u"ids|flags|timestamp|user|comment|content" if section is not None: rvgen.request[u"rvsection"] = unicode(section) - if limit: - rvgen.request[u"rvlimit"] = unicode(limit) + if isinstance(limit, int): + rvgen.limit = limit if rvdir: rvgen.request[u"rvdir"] = u"newer" elif rvdir is not None: @@ -1017,7 +1015,7 @@ if isinstance(protect_level, basestring): apgen.request["gapprlevel"] = protect_level if isinstance(limit, int): - apgen.request["gaplimit"] = str(limit) + apgen.limit = limit if reverse: apgen.request["gapdir"] = "descending" return apgen @@ -1052,7 +1050,7 @@ if prefix: algen.request["alprefix"] = prefix if isinstance(limit, int): - algen.request["allimit"] = str(limit) + algen.limit = limit if unique: algen.request["alunique"] = "" if fromids: @@ -1081,7 +1079,7 @@ if prefix: acgen.request["gacprefix"] = prefix if isinstance(limit, int): - acgen.request["gaclimit"] = str(limit) + acgen.limit = limit if reverse: acgen.request["gacdir"] = "descending" return acgen @@ -1110,7 +1108,7 @@ if group: augen.request["augroup"] = group if isinstance(limit, int): - augen.request["aulimit"] = str(limit) + augen.limit = limit return augen
def allimages(self, start="!", prefix="", minsize=None, maxsize=None, @@ -1135,7 +1133,7 @@ if prefix: aigen.request["gaiprefix"] = prefix if isinstance(limit, int): - aigen.request["gailimit"] = str(limit) + aigen.limit = limit if isinstance(minsize, int): aigen.request["gaiminsize"] = str(minsize) if isinstance(maxsize, int): @@ -1190,7 +1188,7 @@ if users: bkgen.request["bkusers"] = users if isinstance(limit, int): - bkgen.request["bklimit"] = str(limit) + bkgen.limit = limit return bkgen