[Pywikipedia-l] SVN: [5345] branches/rewrite/pywikibot
russblau at svn.wikimedia.org
russblau at svn.wikimedia.org
Fri May 9 19:59:52 UTC 2008
Revision: 5345
Author: russblau
Date: 2008-05-09 19:59:52 +0000 (Fri, 09 May 2008)
Log Message:
-----------
Major refactoring of API generator classes.
Modified Paths:
--------------
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-05-09 17:08:13 UTC (rev 5344)
+++ branches/rewrite/pywikibot/data/api.py 2008-05-09 19:59:52 UTC (rev 5345)
@@ -24,7 +24,9 @@
lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
+_modules = {} # cache for retrieved API parameter information
+
class APIError(pywikibot.Error):
"""The wiki site returned an error message."""
def __init__(self, code, info, **kwargs):
@@ -233,7 +235,136 @@
#TODO - refactor all these generator classes into a parent/subclass hierarchy
-class PageGenerator(object):
+class QueryGenerator(object):
+ """Base class for iterators that handle responses to API action=query.
+
+ By default, the iterator will iterate each item in the query response,
+ and use the query-continue element, if present, to continue iterating as
+ long as the wiki returns additional values. However, if the iterator's
+ limit attribute is set to a positive int, the iterator will stop after
+ iterating that many values.
+
+ """
+ def __init__(self, **kwargs):
+ """
+ Constructor: kwargs are used to create a Request object;
+ see that object's documentation for values. 'action'='query' is
+ assumed.
+
+ """
+ global data
+ if "action" in kwargs and "action" != "query":
+ raise Error("%s: 'action' must be 'query', not %s"
+ % (self.__class__.__name__, kwargs["query"]))
+ else:
+ kwargs["action"] = "query"
+ self.site = kwargs.get("site", pywikibot.Site())
+ # make sure request type is valid, and get limit key if any
+ if "generator" in kwargs:
+ self.module = kwargs["generator"]
+ elif "list" in kwargs:
+ self.module = kwargs["list"]
+ elif "prop" in kwargs:
+ self.module = kwargs["prop"]
+ else:
+ raise Error("%s: No query module name found in arguments."
+ % self.__class__.__name__)
+ for name in self.module.split("|"):
+ if name not in _modules:
+ self.get_module()
+ break
+ self.set_limit()
+ if self.query_limit is not None and "generator" in kwargs:
+ self.prefix = "g" + self.prefix
+ self.request = Request(**kwargs)
+ self.limit = None
+ self.resultkey = self.module # this is the name of the "query"
+ # subelement to look for when iterating
+
+ def get_module(self):
+ """Query api on self.site for paraminfo on querymodule=self.module"""
+
+ paramreq = Request(site=self.site, action="paraminfo",
+ querymodules=self.module)
+ data = paramreq.submit()
+ assert "paraminfo" in data
+ assert "querymodules" in data["paraminfo"]
+ assert len(data["paraminfo"]["querymodules"]) == 1+self.module.count("|")
+ for paraminfo in data["paraminfo"]["querymodules"]:
+ assert paraminfo["name"] in self.module
+ if "missing" in paraminfo:
+ raise Error("Invalid query module name '%s'." % self.module)
+ _modules[paraminfo["name"]] = paraminfo
+
+ def set_limit(self):
+ """Set query_limit for self.module based on api response"""
+
+ self.query_limit = None
+ for mod in self.module.split('|'):
+ for param in _modules[mod].get("parameters", []):
+ if param["name"] == "limit":
+ if (self.site.logged_in()
+ and "apihighlimits" in
+ self.site.getuserinfo()["rights"]):
+ self.query_limit = int(param["highmax"])
+ else:
+ self.query_limit = int(param["max"])
+ self.prefix = _modules[mod]["prefix"]
+ logging.debug("%s: Set query_limit to %i."
+ % (self.__class__.__name__, self.query_limit))
+ return
+
+ def __iter__(self):
+ """Submit request and iterate the response based on self.resultkey
+
+ Continues response as needed until limit (if any) is reached.
+
+ """
+ count = 0
+ while True:
+ if self.query_limit is not None and "revisions" not in self.module:
+ if self.limit is not None:
+ new_limit = min(self.query_limit, self.limit - count)
+ else:
+ new_limit = self.query_limit
+ self.request[self.prefix+"limit"] = str(new_limit)
+ self.data = self.request.submit()
+ if not self.data or not isinstance(self.data, dict):
+ logging.debug(
+ "%s: stopped iteration because no dict retrieved from api."
+ % self.__class__.__name__)
+ return
+ if not ("query" in self.data
+ and self.resultkey in self.data["query"]):
+ logging.debug(
+ "%s: stopped iteration because 'query' and result keys not found in api response."
+ % self.__class__.__name__)
+ logging.debug(self.data)
+ return
+ pagedata = self.data["query"][self.resultkey]
+ if isinstance(pagedata, dict):
+ pagedata = pagedata.values()
+ # for generators, this yields the pages in order of
+ # their pageids, not their titles.... FIXME?
+
+ for item in pagedata:
+ yield self.result(item)
+ count += 1
+ if self.limit is not None and count >= self.limit:
+ return
+ if not "query-continue" in self.data:
+ return
+ if not self.module in self.data["query-continue"]:
+ raise Error("Missing '%s' key in ['query-continue'] value."
+ % self.module)
+ self.request.update(self.data["query-continue"][self.module])
+
+ def result(self, data):
+ """Process result data as needed for particular subclass."""
+ return data
+
+
+class PageGenerator(QueryGenerator):
"""Iterator for response to a request of type action=query&generator=foo."""
def __init__(self, generator, **kwargs):
"""
@@ -244,15 +375,8 @@
@type generator: str
"""
- if generator not in self.limits:
- raise ValueError("Unrecognized generator '%s'" % generator)
- self.request = Request(action="query", generator=generator, **kwargs)
- # set limit to max, if applicable
- # FIXME: need to distinguish between the "limit" per API request and an
- # overall limit on the number of pages to be iterated
- if self.limits[generator]:
- limitkey = 'g' + self.limits[generator]
- self.request.setdefault(limitkey, "max")
+ QueryGenerator.__init__(self, generator=generator, **kwargs)
+ # get some basic information about every page generated
if 'prop' in self.request:
self.request['prop'] += "|info|imageinfo"
else:
@@ -266,61 +390,8 @@
self.request["iiprop"] += 'timestamp|user|comment|url|size|sha1|metadata'
else:
self.request['iiprop'] = 'timestamp|user|comment|url|size|sha1|metadata'
- self.generator = generator
- self.site = self.request.site
self.resultkey = "pages" # element to look for in result
- # dict mapping generator types to their limit parameter names
- limits = {'links': None,
- 'images': None,
- 'templates': None,
- 'categories': None,
- 'allpages': 'aplimit',
- 'alllinks': 'allimit',
- 'allcategories': 'aclimit',
- 'allimages': 'ailimit',
- 'backlinks': 'bllimit',
- 'categorymembers': 'cmlimit',
- 'embeddedin': 'eilimit',
- 'imageusage': 'iulimit',
- 'search': 'srlimit',
- 'watchlist': 'wllimit',
- 'exturlusage': 'eulimit',
- 'random': 'rnlimit',
- }
-
- def __iter__(self):
- """Iterate objects for elements found in response."""
- # FIXME: this won't handle generators with <redirlinks> subelements
- # correctly yet
- while True:
- self.data = self.request.submit()
- if not self.data or not isinstance(self.data, dict):
- raise StopIteration
- if not "query" in self.data:
- raise StopIteration
- query = self.data["query"]
- if not self.resultkey in query:
- raise StopIteration
- if isinstance(query[self.resultkey], dict):
- for v in query[self.resultkey].itervalues():
- yield self.result(v)
- elif isinstance(query[self.resultkey], list):
- for v in query[self.resultkey]:
- yield self.result(v)
- else:
- raise APIError("Unknown",
- "Unknown format in ['query']['%s'] value."
- % self.resultkey,
- data=query[self.resultkey])
- if not "query-continue" in self.data:
- return
- if not self.generator in self.data["query-continue"]:
- raise APIError("Unknown",
- "Missing '%s' key in ['query-continue'] value.",
- data=self.data["query-continue"])
- self.request.update(self.data["query-continue"][self.generator])
-
def result(self, pagedata):
"""Convert page dict entry from api to Page object.
@@ -352,7 +423,7 @@
return image
-class PropertyGenerator(object):
+class PropertyGenerator(QueryGenerator):
"""Generator for queries of type action=query&property=...
Note that this generator yields one or more dict object(s) corresponding
@@ -369,55 +440,11 @@
@type prop: str
"""
- if isinstance(prop, basestring):
- prop = prop.split("|")
- for p in prop:
- if p not in self.limits:
- raise ValueError("Unrecognized property '%s'" % p)
- self.request = Request(action="query", prop="|".join(prop))
- # set limit to max, if applicable
- for p in prop:
- if self.limits[p] and kwargs.pop("getAll", False):
- self.request['g'+self.limits[generator]] = "max"
- self.request.params.update(kwargs)
- self.site = self.request.site
- self.resultkey = prop
+ QueryGenerator.__init__(self, prop=prop, **kwargs)
+ self.resultkey = "pages"
- # dict mapping property types to their limit parameter names
- limits = {'revisions': 'rvlimit',
- 'imageinfo': 'iilimit',
- 'info': None,
- 'links': None,
- 'langlinks': None,
- 'images': None,
- 'imageinfo': None,
- 'templates': None,
- 'categories': None,
- 'extlinks': None,
- }
- def __iter__(self):
- """Iterate objects for elements found in response."""
- # this looks for the resultkey ''inside'' a <page> entry
- while True:
- self.data = self.request.submit()
- if not self.data or not isinstance(self.data, dict):
- raise StopIteration
- if not ("query" in self.data and "pages" in self.data["query"]):
- raise StopIteration
- pagedata = self.data["query"]["pages"].values()
- for item in pagedata:
- yield item
- if not "query-continue" in self.data:
- return
- if not self.resultkey in self.data["query-continue"]:
- raise APIError("Unknown",
- "Missing '%s' key in ['query-continue'] value.",
- data=self.data["query-continue"])
- self.request.update(self.data["query-continue"][self.resultkey])
-
-
-class ListGenerator(object):
+class ListGenerator(QueryGenerator):
"""Iterator for queries with action=query&list=... parameters"""
def __init__(self, listaction, **kwargs):
@@ -429,64 +456,9 @@
@type listaction: str
"""
- if listaction not in self.limits:
- raise ValueError("Unrecognized list type '%s'" % listaction)
- self.request = Request(action="query", list=listaction, **kwargs)
- # set limit to max, if applicable
- # FIXME: need to distinguish between the "limit" per API request and an
- # overall limit on the number of pages to be iterated
- if self.limits[listaction]:
- limitkey = self.limits[listaction]
- self.request.setdefault(limitkey, "max")
- self.resultkey = listaction
- self.site = self.request.site
+ QueryGenerator.__init__(self, list=listaction, **kwargs)
- # dict mapping generator types to their limit parameter names
-
- limits = {'allpages': 'aplimit',
- 'alllinks': 'allimit',
- 'allcategories': 'aclimit',
- 'allusers': 'aulimit',
- 'allimages': 'ailimit',
- 'backlinks': 'bllimit',
- 'blocks': 'bklimit',
- 'categorymembers': 'cmlimit',
- 'embeddedin': 'eilimit',
- 'exturlusage': 'eulimit',
- 'imageusage': 'iulimit',
- 'logevents': 'lelimit',
- 'recentchanges': 'rclimit',
- 'search': 'srlimit',
- 'usercontribs': 'uclimit',
- 'watchlist': 'wllimit',
- 'deletedrevs': 'drlimit',
- 'users': None,
- 'random': 'rnlimit',
- }
- def __iter__(self):
- """Iterate objects for elements found in response."""
- # this looks for the resultkey in the 'query' element
- while True:
- self.data = self.request.submit()
- if not self.data or not isinstance(self.data, dict):
- raise StopIteration
- if not ("query" in self.data
- and self.resultkey in self.data["query"]):
- raise StopIteration
- resultdata = self.data["query"][self.resultkey]
- assert isinstance(resultdata, list)
- for item in resultdata:
- yield item
- if not "query-continue" in self.data:
- return
- if not self.resultkey in self.data["query-continue"]:
- raise APIError("Unknown",
- "Missing '%s' key in ['query-continue'] value.",
- data=self.data["query-continue"])
- self.request.update(self.data["query-continue"][self.resultkey])
-
-
class LoginManager(login.LoginManager):
"""Supplies getCookie() method to use API interface."""
def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-05-09 17:08:13 UTC (rev 5344)
+++ branches/rewrite/pywikibot/site.py 2008-05-09 19:59:52 UTC (rev 5345)
@@ -615,6 +615,7 @@
% (len(cache), self)
)
for pagedata in rvgen:
+# logging.debug("Preloading %s" % pagedata)
try:
if pagedata['title'] not in cache:
raise Error(
@@ -754,25 +755,22 @@
however, that the iterated values are always Page objects, even
if in the Category or Image namespace.
@type namespaces: list of ints
- @param batch: the number of pages to fetch each time.
- @type batch: int
+ @param limit: maximum number of pages to iterate (default: all)
+ @type limit: int
"""
if category.namespace() != 14:
- raise ValueError(
- u"Cannot get category members of non-Category page '%s'"
+ raise Error(
+ u"categorymembers: non-Category page '%s' specified"
% category.title())
cmtitle = category.title(withSection=False).encode(self.encoding())
- cmgen = api.PageGenerator(u"categorymembers", gcmtitle=cmtitle,
+ cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle,
gcmprop="ids|title|sortkey", site=self)
if namespaces is not None:
- cmgen.request[u"gcmnamespace"] = u"|".join(unicode(ns)
+ cmgen.request["gcmnamespace"] = u"|".join(str(ns)
for ns in namespaces)
- if batch is not None:
- if batch > 5000:
- logging.debug("No more than 5000 rows can be fetched at once.")
- batch=5000
- cmgen.request[u'cmlimit'] = str(batch)
+ if isinstance(limit, int):
+ cmgen.limit = limit
return cmgen
def loadrevisions(self, page=None, getText=False, revids=None,
@@ -863,8 +861,8 @@
u"ids|flags|timestamp|user|comment|content"
if section is not None:
rvgen.request[u"rvsection"] = unicode(section)
- if limit:
- rvgen.request[u"rvlimit"] = unicode(limit)
+ if isinstance(limit, int):
+ rvgen.limit = limit
if rvdir:
rvgen.request[u"rvdir"] = u"newer"
elif rvdir is not None:
@@ -1017,7 +1015,7 @@
if isinstance(protect_level, basestring):
apgen.request["gapprlevel"] = protect_level
if isinstance(limit, int):
- apgen.request["gaplimit"] = str(limit)
+ apgen.limit = limit
if reverse:
apgen.request["gapdir"] = "descending"
return apgen
@@ -1052,7 +1050,7 @@
if prefix:
algen.request["alprefix"] = prefix
if isinstance(limit, int):
- algen.request["allimit"] = str(limit)
+ algen.limit = limit
if unique:
algen.request["alunique"] = ""
if fromids:
@@ -1081,7 +1079,7 @@
if prefix:
acgen.request["gacprefix"] = prefix
if isinstance(limit, int):
- acgen.request["gaclimit"] = str(limit)
+ acgen.limit = limit
if reverse:
acgen.request["gacdir"] = "descending"
return acgen
@@ -1110,7 +1108,7 @@
if group:
augen.request["augroup"] = group
if isinstance(limit, int):
- augen.request["aulimit"] = str(limit)
+ augen.limit = limit
return augen
def allimages(self, start="!", prefix="", minsize=None, maxsize=None,
@@ -1135,7 +1133,7 @@
if prefix:
aigen.request["gaiprefix"] = prefix
if isinstance(limit, int):
- aigen.request["gailimit"] = str(limit)
+ aigen.limit = limit
if isinstance(minsize, int):
aigen.request["gaiminsize"] = str(minsize)
if isinstance(maxsize, int):
@@ -1190,7 +1188,7 @@
if users:
bkgen.request["bkusers"] = users
if isinstance(limit, int):
- bkgen.request["bklimit"] = str(limit)
+ bkgen.limit = limit
return bkgen
More information about the Pywikipedia-l
mailing list