Revision: 5283
Author: russblau
Date: 2008-04-29 16:18:53 +0000 (Tue, 29 Apr 2008)
Log Message:
-----------
Added preloadpages method for Site; added itergroup helper to tools; moved query parameter defaults from Request.__init__ into Request.submit; allowed PropertyGenerator to take multiple properties; guarded site.py generators against missing result keys; switched throttle messages to the logging module.
Modified Paths:
--------------
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/site.py
branches/rewrite/pywikibot/throttle.py
branches/rewrite/pywikibot/tools.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/data/api.py 2008-04-29 16:18:53 UTC (rev 5283)
@@ -93,20 +93,6 @@
self.params = {}
if "action" not in kwargs:
raise ValueError("'action' specification missing from Request.")
- if kwargs["action"] == 'query':
- if "meta" in kwargs:
- if "userinfo" not in kwargs["meta"]:
- kwargs["meta"] += "|userinfo"
- else:
- kwargs["meta"] = "userinfo"
- if "uiprop" in kwargs:
- kwargs["uiprop"] += "|blockinfo|hasmsg"
- else:
- kwargs["uiprop"] = "blockinfo|hasmsg"
- if "format" not in kwargs:
- self.params["format"] = "json"
- if "maxlag" not in kwargs:
- self.params["maxlag"] = str(config.maxlag)
self.update(**kwargs)
# implement dict interface
@@ -138,10 +124,32 @@
"""
from pywikibot.comms import http
- if self.params['format'] != 'json':
+
+ for key in self.params:
+ if isinstance(self.params[key], basestring):
+ self.params[key] = self.params[key].split("|")
+ if self.params["action"] == ['query']:
+ meta = self.params.get("meta", [])
+ if "userinfo" not in meta:
+ meta.append("userinfo")
+ self.params["meta"] = meta
+ uiprop = self.params.get("uiprop", [])
+ uiprop = set(uiprop + ["blockinfo", "hasmsg"])
+ self.params["uiprop"] = list(uiprop)
+            if "prop" in self.params:
+                if "info" in self.params["prop"]:
+                    inprop = self.params.get("inprop", [])
+                    inprop = set(inprop + ["protection", "talkid", "subjectid"])
+                    self.params["inprop"] = list(inprop)
+ if "maxlag" not in self.params:
+ self.params["maxlag"] = [str(config.maxlag)]
+ if "format" not in self.params:
+ self.params["format"] = ["json"]
+ if self.params['format'] != ["json"]:
raise TypeError("Query format '%s' cannot be parsed."
% self.params['format'])
for key in self.params:
+ self.params[key] = "|".join(self.params[key])
if isinstance(self.params[key], unicode):
self.params[key] = self.params[key].encode(self.site.encoding())
params = urllib.urlencode(self.params)
@@ -353,12 +361,17 @@
@type prop: str
"""
- self.request = Request(action="query", prop=prop, **kwargs)
- if prop not in self.limits:
- raise ValueError("Unrecognized property '%s'" % prop)
+ if isinstance(prop, basestring):
+ prop = prop.split("|")
+ for p in prop:
+ if p not in self.limits:
+ raise ValueError("Unrecognized property '%s'" % p)
+ self.request = Request(action="query", prop="|".join(prop))
# set limit to max, if applicable
- if self.limits[prop] and kwargs.pop("getAll", False):
- self.request['g'+self.limits[generator]] = "max"
+        if kwargs.pop("getAll", False):
+            for p in prop:
+                if self.limits[p]:
+                    self.request['g'+self.limits[p]] = "max"
+ self.request.params.update(kwargs)
self.site = self.request.site
self.resultkey = prop
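
With this change, Request.submit() splits every parameter on "|" into a list, merges in the query defaults (userinfo, blockinfo/hasmsg, inprop for info queries, maxlag, format), and joins the lists back before encoding; PropertyGenerator in turn accepts several properties in one request. A rough sketch of how a caller might use this, assuming the Request falls back to the default site from the user's configuration (the title and rvprop values are illustrative only):

    from pywikibot.data import api

    # one generator covering two properties; submit() adds maxlag,
    # format=json and the userinfo defaults automatically
    rvgen = api.PropertyGenerator("revisions|info")
    rvgen.request["titles"] = "Main Page"
    rvgen.request["rvprop"] = "ids|timestamp|user|content"
    for pagedata in rvgen:
        print pagedata["title"]
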
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/site.py 2008-04-29 16:18:53 UTC (rev 5283)
@@ -500,10 +500,8 @@
def getpageinfo(self, page):
"""Load page info from api and save in page attributes"""
title = page.title(withSection=False)
- query = api.PropertyGenerator(
- "info",
- inprop="protection|talkid|subjectid",
- titles=title.encode(self.encoding()))
+ query = api.PropertyGenerator("info",
+ titles=title.encode(self.encoding()))
for pageitem in query:
if pageitem['title'] != title:
raise Error(
@@ -580,6 +578,66 @@
api.update_page(target, pagedata)
page._redir = target
+ def preloadpages(self, pagelist, size=60, lookahead=0):
+        """Yield Pages from pagelist with their text and info preloaded in bulk.
+
+ @param pagelist: an iterable that returns Page objects
+ @param size: how many Pages to query at a time
+ @type size: int
+ @param lookahead: if greater than zero, preload pages in a
+ separate thread for greater responsiveness; higher values
+ result in more aggressive preloading
+ @type lookahead: int
+
+ """
+ from pywikibot.tools import itergroup, ThreadedGenerator
+ gen = ThreadedGenerator(target=itergroup,
+ args=(pagelist, size),
+ qsize=lookahead)
+ try:
+ for sublist in gen:
+ pageids = []
+ cache = {}
+ for p in sublist:
+ if pageids is not None:
+ if hasattr(p, "_pageid"):
+ pageids.append(str(p._pageid))
+ else:
+ # only use pageids if all pages have them
+ pageids = None
+ cache[p.title(withSection=False)] = p
+ rvgen = api.PropertyGenerator("revisions|info")
+ if pageids is not None:
+ rvgen.request["pageids"] = "|".join(pageids)
+ else:
+ rvgen.request["titles"] = "|".join(cache.keys())
+ rvgen.request[u"rvprop"] = \
+ u"ids|flags|timestamp|user|comment|content"
+ for pagedata in rvgen:
+ if pagedata['title'] not in cache:
+ raise Error(
+ u"preloadpages: Query returned unexpected title '%s'"
+ % pagedata['title']
+ )
+ page = cache[pagedata['title']]
+ api.update_page(page, pagedata)
+ if 'revisions' in pagedata: # true if page exists
+ for rev in pagedata['revisions']:
+ revision = pywikibot.page.Revision(
+ revid=rev['revid'],
+ timestamp=rev['timestamp'],
+ user=rev['user'],
+ anon=rev.has_key('anon'),
+ comment=rev.get('comment', u''),
+ minor=rev.has_key('minor'),
+ text=rev.get('*', None)
+ )
+ page._revisions[revision.revid] = revision
+ page._revid = revision.revid
+ yield page
+ finally:
+ gen.stop()
+
# following group of methods map more-or-less directly to API queries
def getbacklinks(self, page, followRedirects=False, filterRedirects=None,
@@ -819,7 +877,8 @@
else:
page = Page(self, pagedata['title'])
api.update_page(page, pagedata)
-
+ if 'revisions' not in pagedata:
+ continue
for rev in pagedata['revisions']:
revision = pywikibot.page.Revision(
revid=rev['revid'],
@@ -849,6 +908,8 @@
raise Error(
u"getlanglinks: Query on %s returned data on '%s'"
% (page, pageitem['title']))
+ if 'langlinks' not in pageitem:
+ continue
for linkdata in pageitem['langlinks']:
yield pywikibot.Link(linkdata['*'],
source=pywikibot.Site(linkdata['lang']))
@@ -864,6 +925,8 @@
raise RuntimeError(
"getlanglinks: Query on %s returned data on '%s'"
% (page, pageitem['title']))
+ if 'extlinks' not in pageitem:
+ continue
for linkdata in pageitem['extlinks']:
yield linkdata['*']
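
A minimal sketch of driving the new Site.preloadpages() from a bot script; the page titles are placeholders, and pywikibot.Site() is assumed to fall back to the configured default site:

    import pywikibot

    site = pywikibot.Site()
    pages = [pywikibot.Page(site, t) for t in (u"Foo", u"Bar", u"Baz")]
    # fetch text and info for up to 60 pages per API request, with one
    # batch assembled ahead of time in a background thread
    for page in site.preloadpages(pages, size=60, lookahead=1):
        print page.title()
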
Modified: branches/rewrite/pywikibot/throttle.py
===================================================================
--- branches/rewrite/pywikibot/throttle.py 2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/throttle.py 2008-04-29 16:18:53 UTC (rev 5283)
@@ -107,7 +107,7 @@
f.close()
self.process_multiplicity = count
if self.verbosedelay:
- pywikibot.output(
+ logging.info(
u"Found %s processes running, including the current process."
% count)
finally:
@@ -216,10 +216,10 @@
self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
# Announce the delay if it exceeds a preset limit
if waittime > config.noisysleep:
- pywikibot.output(u"Sleeping for %.1f seconds, %s"
- % (waittime,
- time.strftime("%Y-%m-%d %H:%M:%S",
- time.localtime()))
+ logging.warn(u"Sleeping for %.1f seconds, %s"
+ % (waittime,
+ time.strftime("%Y-%m-%d %H:%M:%S",
+ time.localtime()))
)
time.sleep(waittime)
if write:
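
The throttle now reports through the standard logging module instead of pywikibot.output(), so its messages only appear if the running script has a logging handler configured; a minimal, assumed setup:

    import logging

    # surface the "Found N processes ..." info messages and the
    # "Sleeping for ..." warnings from throttle.py on stderr
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
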
Modified: branches/rewrite/pywikibot/tools.py
===================================================================
--- branches/rewrite/pywikibot/tools.py 2008-04-28 14:43:39 UTC (rev 5282)
+++ branches/rewrite/pywikibot/tools.py 2008-04-29 16:18:53 UTC (rev 5283)
@@ -26,13 +26,14 @@
all the generated values, it must call the generator's stop() method to
stop the background thread. Example usage:
- >>> gen = ThreadedGenerator(target=foo)
+ >>> gen = ThreadedGenerator(target=xrange, args=(20,))
>>> try:
... for data in gen:
- ... do_work(data)
+ ... print data,
... finally:
... gen.stop()
-
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
"""
def __init__(self, group=None, target=None, name="GeneratorThread",
@@ -95,3 +96,36 @@
self.stop()
+def itergroup(iterable, size):
+ """Make an iterator that returns lists of (up to) size items from iterable.
+
+ Example:
+
+ >>> i = itergroup(xrange(25), 10)
+ >>> print i.next()
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+ >>> print i.next()
+ [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ >>> print i.next()
+ [20, 21, 22, 23, 24]
+ >>> print i.next()
+ Traceback (most recent call last):
+ ...
+ StopIteration
+
+ """
+ chunk = []
+ for item in iter(iterable):
+ chunk.append(item)
+ if len(chunk) == size:
+ yield chunk
+ chunk = []
+ if chunk:
+ yield chunk
+
+
+if __name__ == "__main__":
+ def _test():
+ import doctest
+ doctest.testmod()
+ _test()
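
itergroup() and ThreadedGenerator compose the same way preloadpages() uses them: the background thread assembles the next batch while the caller is still working on the current one. A standalone sketch of that pattern, with a placeholder process() function:

    from pywikibot.tools import itergroup, ThreadedGenerator

    def process(batch):
        print len(batch), "items"   # stand-in for real per-batch work

    gen = ThreadedGenerator(target=itergroup, args=(xrange(100), 10),
                            qsize=2)
    try:
        for batch in gen:
            process(batch)
    finally:
        gen.stop()
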