Revision: 5283
Author:   russblau
Date:     2008-04-29 16:18:53 +0000 (Tue, 29 Apr 2008)
Log Message:
-----------
Added preloadpages method for Site; fixed bugs.
Modified Paths:
--------------
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/site.py
    branches/rewrite/pywikibot/throttle.py
    branches/rewrite/pywikibot/tools.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/data/api.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -93,20 +93,6 @@
         self.params = {}
         if "action" not in kwargs:
             raise ValueError("'action' specification missing from Request.")
-        if kwargs["action"] == 'query':
-            if "meta" in kwargs:
-                if "userinfo" not in kwargs["meta"]:
-                    kwargs["meta"] += "|userinfo"
-            else:
-                kwargs["meta"] = "userinfo"
-            if "uiprop" in kwargs:
-                kwargs["uiprop"] += "|blockinfo|hasmsg"
-            else:
-                kwargs["uiprop"] = "blockinfo|hasmsg"
-        if "format" not in kwargs:
-            self.params["format"] = "json"
-        if "maxlag" not in kwargs:
-            self.params["maxlag"] = str(config.maxlag)
         self.update(**kwargs)

     # implement dict interface
@@ -138,10 +124,32 @@
""" from pywikibot.comms import http - if self.params['format'] != 'json': + + for key in self.params: + if isinstance(self.params[key], basestring): + self.params[key] = self.params[key].split("|") + if self.params["action"] == ['query']: + meta = self.params.get("meta", []) + if "userinfo" not in meta: + meta.append("userinfo") + self.params["meta"] = meta + uiprop = self.params.get("uiprop", []) + uiprop = set(uiprop + ["blockinfo", "hasmsg"]) + self.params["uiprop"] = list(uiprop) + if "properties" in self.params: + if "info" in self.params["properties"]: + inprop = self.params.get("inprop", []) + info = set(info + ["protection", "talkid", "subjectid"]) + self.params["info"] = list(info) + if "maxlag" not in self.params: + self.params["maxlag"] = [str(config.maxlag)] + if "format" not in self.params: + self.params["format"] = ["json"] + if self.params['format'] != ["json"]: raise TypeError("Query format '%s' cannot be parsed." % self.params['format']) for key in self.params: + self.params[key] = "|".join(self.params[key]) if isinstance(self.params[key], unicode): self.params[key] = self.params[key].encode(self.site.encoding()) params = urllib.urlencode(self.params) @@ -353,12 +361,17 @@ @type prop: str
""" - self.request = Request(action="query", prop=prop, **kwargs) - if prop not in self.limits: - raise ValueError("Unrecognized property '%s'" % prop) + if isinstance(prop, basestring): + prop = prop.split("|") + for p in prop: + if p not in self.limits: + raise ValueError("Unrecognized property '%s'" % p) + self.request = Request(action="query", prop="|".join(prop)) # set limit to max, if applicable - if self.limits[prop] and kwargs.pop("getAll", False): - self.request['g'+self.limits[generator]] = "max" + for p in prop: + if self.limits[p] and kwargs.pop("getAll", False): + self.request['g'+self.limits[generator]] = "max" + self.request.params.update(kwargs) self.site = self.request.site self.resultkey = prop
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/site.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -500,10 +500,8 @@
     def getpageinfo(self, page):
         """Load page info from api and save in page attributes"""
         title = page.title(withSection=False)
-        query = api.PropertyGenerator(
-            "info",
-            inprop="protection|talkid|subjectid",
-            titles=title.encode(self.encoding()))
+        query = api.PropertyGenerator("info",
+                                      titles=title.encode(self.encoding()))
         for pageitem in query:
             if pageitem['title'] != title:
                 raise Error(
@@ -580,6 +578,66 @@
             api.update_page(target, pagedata)
             page._redir = target
+    def preloadpages(self, pagelist, size=60, lookahead=0):
+        """Return a generator to a list of preloaded pages.
+
+        @param pagelist: an iterable that returns Page objects
+        @param size: how many Pages to query at a time
+        @type size: int
+        @param lookahead: if greater than zero, preload pages in a
+            separate thread for greater responsiveness; higher values
+            result in more aggressive preloading
+        @type lookahead: int
+
+        """
+        from pywikibot.tools import itergroup, ThreadedGenerator
+        gen = ThreadedGenerator(target=itergroup,
+                                args=(pagelist, size),
+                                qsize=lookahead)
+        try:
+            for sublist in gen:
+                pageids = []
+                cache = {}
+                for p in sublist:
+                    if pageids is not None:
+                        if hasattr(p, "_pageid"):
+                            pageids.append(str(p._pageid))
+                        else:
+                            # only use pageids if all pages have them
+                            pageids = None
+                    cache[p.title(withSection=False)] = p
+                rvgen = api.PropertyGenerator("revisions|info")
+                if pageids is not None:
+                    rvgen.request["pageids"] = "|".join(pageids)
+                else:
+                    rvgen.request["titles"] = "|".join(cache.keys())
+                rvgen.request[u"rvprop"] = \
+                    u"ids|flags|timestamp|user|comment|content"
+                for pagedata in rvgen:
+                    if pagedata['title'] not in cache:
+                        raise Error(
+                            u"preloadpages: Query returned unexpected title '%s'"
+                            % pagedata['title']
+                        )
+                    page = cache[pagedata['title']]
+                    api.update_page(page, pagedata)
+                    if 'revisions' in pagedata:  # true if page exists
+                        for rev in pagedata['revisions']:
+                            revision = pywikibot.page.Revision(
+                                revid=rev['revid'],
+                                timestamp=rev['timestamp'],
+                                user=rev['user'],
+                                anon=rev.has_key('anon'),
+                                comment=rev.get('comment', u''),
+                                minor=rev.has_key('minor'),
+                                text=rev.get('*', None)
+                            )
+                            page._revisions[revision.revid] = revision
+                            page._revid = revision.revid
+                    yield page
+        finally:
+            gen.stop()
+
     # following group of methods map more-or-less directly to API queries

     def getbacklinks(self, page, followRedirects=False, filterRedirects=None,
@@ -819,7 +877,8 @@
             else:
                 page = Page(self, pagedata['title'])
             api.update_page(page, pagedata)
-
+            if 'revisions' not in pagedata:
+                continue
             for rev in pagedata['revisions']:
                 revision = pywikibot.page.Revision(
                     revid=rev['revid'],
@@ -849,6 +908,8 @@
                 raise Error(
                     u"getlanglinks: Query on %s returned data on '%s'"
                     % (page, pageitem['title']))
+            if 'langlinks' not in pageitem:
+                continue
             for linkdata in pageitem['langlinks']:
                 yield pywikibot.Link(linkdata['*'],
                                      source=pywikibot.Site(linkdata['lang']))
@@ -864,6 +925,8 @@
                 raise RuntimeError(
                     "getlanglinks: Query on %s returned data on '%s'"
                     % (page, pageitem['title']))
+            if 'extlinks' not in pageitem:
+                continue
             for linkdata in pageitem['extlinks']:
                 yield linkdata['*']
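To show how the new preloading is meant to be used, here is a hypothetical bot fragment. It is a sketch only: it assumes a working rewrite-branch user-config, that pywikibot.Site() and pywikibot.Page() resolve to the default site as elsewhere in the branch, and the titles are placeholders.

    import pywikibot

    site = pywikibot.Site()
    pages = [pywikibot.Page(site, title)
             for title in [u"Main Page", u"Project:Sandbox"]]
    # Pages are fetched in batches of up to 'size' titles; with lookahead > 0
    # the next batch is requested in a background thread while the current
    # batch is still being processed.
    for page in site.preloadpages(pages, size=60, lookahead=10):
        print page.title()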
Modified: branches/rewrite/pywikibot/throttle.py
===================================================================
--- branches/rewrite/pywikibot/throttle.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/throttle.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -107,7 +107,7 @@
                 f.close()
                 self.process_multiplicity = count
                 if self.verbosedelay:
-                    pywikibot.output(
+                    logging.info(
                         u"Found %s processes running, including the current process."
                         % count)
         finally:
@@ -216,10 +216,10 @@
            self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
            # Announce the delay if it exceeds a preset limit
            if waittime > config.noisysleep:
-               pywikibot.output(u"Sleeping for %.1f seconds, %s"
-                                % (waittime,
-                                   time.strftime("%Y-%m-%d %H:%M:%S",
-                                                 time.localtime()))
+               logging.warn(u"Sleeping for %.1f seconds, %s"
+                            % (waittime,
+                               time.strftime("%Y-%m-%d %H:%M:%S",
+                                             time.localtime()))
                                 )
            time.sleep(waittime)
        if write:
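Because the throttle now reports through the standard logging module rather than pywikibot.output(), a script only sees these messages once logging is configured. A minimal sketch using only the standard library (the format string is just an example):

    import logging

    # Show INFO-level and higher messages on the console, including the
    # throttle's "Found N processes..." notices and "Sleeping for ..." warnings.
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")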
Modified: branches/rewrite/pywikibot/tools.py
===================================================================
--- branches/rewrite/pywikibot/tools.py	2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/tools.py	2008-04-29 16:18:53 UTC (rev 5283)
@@ -26,13 +26,14 @@
     all the generated values, it must call the generator's stop() method to
     stop the background thread. Example usage:

-    >>> gen = ThreadedGenerator(target=foo)
+    >>> gen = ThreadedGenerator(target=xrange, args=(20,))
     >>> try:
     ...     for data in gen:
-    ...         do_work(data)
+    ...         print data,
     ... finally:
     ...     gen.stop()
-
+    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
     """

     def __init__(self, group=None, target=None, name="GeneratorThread",
@@ -95,3 +96,36 @@
             self.stop()

+def itergroup(iterable, size):
+    """Make an iterator that returns lists of (up to) size items from iterable.
+
+    Example:
+
+    >>> i = itergroup(xrange(25), 10)
+    >>> print i.next()
+    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    >>> print i.next()
+    [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+    >>> print i.next()
+    [20, 21, 22, 23, 24]
+    >>> print i.next()
+    Traceback (most recent call last):
+        ...
+    StopIteration
+
+    """
+    chunk = []
+    for item in iter(iterable):
+        chunk.append(item)
+        if len(chunk) == size:
+            yield chunk
+            chunk = []
+    if chunk:
+        yield chunk
+
+
+if __name__ == "__main__":
+    def _test():
+        import doctest
+        doctest.testmod()
+    _test()
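The combination used by Site.preloadpages() can also be exercised on its own: itergroup() batches any iterable into fixed-size chunks, and ThreadedGenerator produces those chunks from a background thread so the consumer can stay a few batches ahead. A small self-contained sketch, needing no wiki access:

    from pywikibot.tools import itergroup, ThreadedGenerator

    # Chunk 0..99 into lists of ten, produced by a worker thread that keeps
    # at most two chunks queued ahead of the consumer.
    gen = ThreadedGenerator(target=itergroup, args=(xrange(100), 10), qsize=2)
    try:
        for chunk in gen:
            print chunk[0], "...", chunk[-1]
    finally:
        gen.stop()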