Revision: 5204 Author: russblau Date: 2008-04-11 20:29:11 +0000 (Fri, 11 Apr 2008)
Log Message: ----------- implemented getrevisions() [incomplete], and made modest changes elsewhere
Modified Paths: -------------- branches/rewrite/pywikibot/data/api.py branches/rewrite/pywikibot/date.py branches/rewrite/pywikibot/page.py branches/rewrite/pywikibot/site.py branches/rewrite/pywikibot/throttle.py
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-04-11 13:36:23 UTC (rev 5203) +++ branches/rewrite/pywikibot/data/api.py 2008-04-11 20:29:11 UTC (rev 5204) @@ -145,19 +145,23 @@ while True: # TODO catch http errors try: - if self.params.get("action", "") in ("login",): - rawdata = http.request(self.site, uri, method="POST", - headers={'Content-Type': - 'application/x-www-form-urlencoded'}, - body=params) - else: - uri = uri + "?" + params - rawdata = http.request(self.site, uri) - except Exception, e: #TODO: what exceptions can occur here? - logging.warning(traceback.format_exc()) - print uri, params - self.wait() - continue + self.site.sitelock.acquire() + try: + if self.params.get("action", "") in ("login",): + rawdata = http.request(self.site, uri, method="POST", + headers={'Content-Type': + 'application/x-www-form-urlencoded'}, + body=params) + else: + uri = uri + "?" + params + rawdata = http.request(self.site, uri) + except Exception, e: #TODO: what exceptions can occur here? + logging.warning(traceback.format_exc()) + print uri, params + self.wait() + continue + finally: + self.site.sitelock.release() if rawdata.startswith(u"unknown_action"): raise APIError(rawdata[:14], rawdata[16:]) try: @@ -197,7 +201,7 @@ if lag: logging.info( "Pausing due to database lag: " + info) - self.wait(int(lag.group("lag"))) + self.lag_wait(int(lag.group("lag"))) continue if code in (u'internal_api_error_DBConnectionError', ): self.wait() @@ -208,25 +212,32 @@ except TypeError: raise RuntimeError(result)
- def wait(self, lag=None): + def wait(self): """Determine how long to wait after a failed request.""" self.max_retries -= 1 if self.max_retries < 0: raise TimeoutError("Maximum retries attempted without success.") - wait = self.retry_wait - if lag is not None: - # in case of database lag, wait half the lag time, - # but not less than 5 or more than 120 seconds - wait = max(5, min(lag // 2, 120)) logging.warn("Waiting %s seconds before retrying." % wait) - time.sleep(wait) - if lag is None: - self.retry_wait = min(120, self.retry_wait * 2) + time.sleep(self.retry_wait) + # double the next wait, but do not exceed 120 seconds + self.retry_wait = min(120, self.retry_wait * 2)
+ def lag_wait(self, lag): + """Wait due to server lag.""" + # unlike regular wait, this shuts down all access to site + self.site.sitelock.acquire() + try: + # wait at least 5 seconds, no more than 120 + wait = max(5, min(120, lag//2)) + logging.warn("Pausing %s seconds due to server lag." % wait) + time.sleep(wait) + finally: + self.site.sitelock.release()
+ class PageGenerator(object): """Iterator for response to a request of type action=query&generator=foo.""" - def __init__(self, generator="", **kwargs): + def __init__(self, generator, **kwargs): """ Required and optional parameters are as for C{Request}, except that action=query is assumed and generator is required. @@ -235,8 +246,6 @@ @type generator: str
""" - if not generator: - raise ValueError("generator argument is required.") if generator not in self.limits: raise ValueError("Unrecognized generator '%s'" % generator) self.request = Request(action="query", generator=generator, **kwargs) @@ -261,7 +270,6 @@ self.resultkey = "pages" # element to look for in result
# dict mapping generator types to their limit parameter names - limits = {'links': None, 'images': None, 'templates': None, @@ -348,6 +356,75 @@ return image
+class PropertyGenerator(object): + """Generator for queries of type action=query&property=...""" + + def __init__(self, prop, **kwargs): + """ + Required and optional parameters are as for C{Request}, except that + action=query is assumed and prop is required. + + @param prop: the "property=" type from api.php + @type prop: str + + """ + self.request = Request(action="query", prop=prop, **kwargs) + if prop not in self.limits: + raise ValueError("Unrecognized property '%s'" % prop) + # set limit to max, if applicable + if self.limits[prop] and kwargs.pop("getAll", False): + self.request['g'+self.limits[generator]] = "max" + self.site = self.request.site + self.resultkey = prop # element to look for in result + + # dict mapping property types to their limit parameter names + limits = {'revisions': 'rvlimit', + 'imageinfo': 'iilimit', + 'info': None, + 'links': None, + 'langlinks': None, + 'images': None, + 'imageinfo': None, + 'templates': None, + 'categories': None, + 'extlinks': None, + } + + def __iter__(self): + """Iterate objects for elements found in response.""" + # this looks for the resultkey ''inside'' a <page> entry + while True: + self.site.get_throttle() + self.data = self.request.submit() + if not self.data or not isinstance(self.data, dict): + raise StopIteration + if not ("query" in self.data and "pages" in self.data["query"]): + raise StopIteration + pagedata = self.data["query"]["pages"].values() + assert len(pagedata)==1 + pagedata = pagedata[0] + if not self.resultkey in pagedata: + raise StopIteration + if isinstance(pagedata[self.resultkey], dict): + for v in pagedata[self.resultkey].itervalues(): + yield v + elif isinstance(pagedata[self.resultkey], list): + for v in pagedata[self.resultkey]: + yield v + else: + raise APIError("Unknown", + "Unknown format in ['%s'] value." + % self.resultkey, + data=pagedata[self.resultkey]) + if not "query-continue" in self.data: + return + if not self.resultkey in self.data["query-continue"]: + raise APIError("Unknown", + "Missing '%s' key in ['query-continue'] value.", + data=self.data["query-continue"]) + self.request.update(self.data["query-continue"][self.resultkey]) + + class LoginManager(login.LoginManager): """Supplies getCookie() method to use API interface.""" def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
Modified: branches/rewrite/pywikibot/date.py =================================================================== --- branches/rewrite/pywikibot/date.py 2008-04-11 13:36:23 UTC (rev 5203) +++ branches/rewrite/pywikibot/date.py 2008-04-11 20:29:11 UTC (rev 5204) @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*- """ This file is not runnable, but it only consists of various lists which are required by some other programs. @@ -17,9 +17,7 @@ # used for date recognition import types import re -import wikipedia
- # # Different collections of well known formats # @@ -1523,7 +1521,7 @@ """ """ for s in makeMonthNamedList( lang, pattern, capitalize ): - wikipedia.output( s ) + print( s )
def testMapEntry( formatName, showAll = True, value = None ): @@ -1542,7 +1540,7 @@ if value is not None: start, stop = value, value+1 if showAll: - wikipedia.output(u"Processing %s with limits from %d to %d and step %d" % (formatName, start,stop-1,step)) + print(u"Processing %s with limits from %d to %d and step %d" % (formatName, start,stop-1,step))
for code, convFunc in formats[formatName].iteritems(): # import time @@ -1555,18 +1553,21 @@ if newValue != value: raise AssertionError(" %s != %s: assert failed, values didn't match" % (newValue, value)) if showAll: - wikipedia.output(u"date.formats['%s']['%s'](%d): '%s' -> %d" % (formatName, code, value, convFunc(value), newValue)) + print(u"date.formats['%s']['%s'](%d): '%s' -> %d" % (formatName, code, value, convFunc(value), newValue)) except: - wikipedia.output(u"********** Error in date.formats['%s']['%s'](%d)" % (formatName, code, value)) + print(u"********** Error in date.formats['%s']['%s'](%d)" % (formatName, code, value)) raise -# wikipedia.output( u"%s\t%s\t%f" % (formatName, code, time.clock() - startClock) ) +# print( u"%s\t%s\t%f" % (formatName, code, time.clock() - startClock) )
def test(quick = False, showAll = False): - """This is a test function, to be used interactivelly to test entire format convesion map at once + """This is a test function, to be used interactively to test entire + format conversion map at once + Usage example: run python interpreter >>> import date >>> date.test() + """ for formatName in formats.keys():
@@ -1574,13 +1575,13 @@ testMapEntry( formatName, showAll, formatLimits[formatName][1] ) # Only test the first value in the test range else: testMapEntry( formatName, showAll ) # Extensive test! # Test decade rounding - wikipedia.output(u"'%s' complete." % formatName) + print(u"'%s' complete." % formatName)
if quick: - #wikipedia.output(u'Date module quick consistency test passed') + #print(u'Date module quick consistency test passed') pass else: - wikipedia.output(u'Date module has been fully tested') + print(u'Date module has been fully tested')
#
Modified: branches/rewrite/pywikibot/page.py =================================================================== --- branches/rewrite/pywikibot/page.py 2008-04-11 13:36:23 UTC (rev 5203) +++ branches/rewrite/pywikibot/page.py 2008-04-11 20:29:11 UTC (rev 5204) @@ -283,7 +283,7 @@ raise self._getexception if force or not hasattr(self, "_revid") \ or not self._revid in self._revisions: - self.site().getrevisions(self, getText=True, ids=None, sysop=sysop) + self.site().getrevisions(self, getText=True, sysop=sysop) # TODO: Exception handling for no-page, redirects, etc.
return self._revisions[self._revid].text @@ -307,7 +307,8 @@ "Page.getOldVersion(change_edit_time) option is deprecated.") if force or not oldid in self._revisions: self.site().getrevisions(self, getText=True, ids=oldid, - redirs=get_redirect, sysop=sysop) + sysop=sysop) + # TODO: what about redirects, errors? return self._revisions[oldid].text
def permalink(self): @@ -678,7 +679,7 @@ else: limit = revCount return self.site().getrevisions(self, withText=False, - older=reverseOrder, limit=limit) + older=not reverseOrder, limit=limit)
def getVersionHistoryTable(self, forceReload=False, reverseOrder=False, getAll=False, revCount=500): @@ -701,8 +702,7 @@ @return: A generator that yields tuples consisting of revision ID, edit date/time, user name and content """ - return self.site().getrevisions(self, withText=True, - older=reverseOrder, limit=None) + return self.site().getrevisions(self, withText=True)
def contributingUsers(self): """Return a set of usernames (or IPs) of users who edited this page."""
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-04-11 13:36:23 UTC (rev 5203) +++ branches/rewrite/pywikibot/site.py 2008-04-11 20:29:11 UTC (rev 5204) @@ -100,11 +100,12 @@ self._username = user
# following are for use with lock_page and unlock_page methods - self._mutex = threading.Lock() + self._pagemutex = threading.Lock() self._locked_pages = []
pt_min = min(config.minthrottle, config.put_throttle) - self.put_throttle = Throttle(self, pt_min, config.maxthrottle) + self.put_throttle = Throttle(self, pt_min, config.maxthrottle, + verbosedelay=True) self.put_throttle.setDelay(config.put_throttle)
gt_min = min(config.minthrottle, config.get_throttle) @@ -203,7 +204,6 @@ else: return self.family().redirect.get(self.language(), None)
- def lock_page(self, page, block=True): """Lock page for writing. Must be called before writing any page.
@@ -216,7 +216,7 @@ otherwise, raise an exception if page can't be locked
""" - self._mutex.acquire() + self._pagemutex.acquire() try: while page in self._locked_pages: if not block: @@ -224,7 +224,7 @@ time.sleep(.25) self._locked_pages.append(page.title(withSection=False)) finally: - self._mutex.release() + self._pagemutex.release()
def unlock_page(self, page): """Unlock page. Call as soon as a write operation has completed. @@ -233,11 +233,11 @@ @type page: pywikibot.Page
""" - self._mutex.acquire() + self._pagemutex.acquire() try: self._locked_pages.remove(page.title(withSection=False)) finally: - self._mutex.release() + self._pagemutex.release()
class APISite(BaseSite): @@ -338,6 +338,7 @@ 14: [u"Category"], 15: [u"Category talk"], } + self.sitelock = threading.Lock() return
# ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__() @@ -600,14 +601,71 @@ "Cannot get category members of non-Category page '%s'" % category.title()) cmtitle = category.title(withSection=False) - cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle, + cmgen = api.PageGenerator(u"categorymembers", gcmtitle=cmtitle, gcmprop="ids|title|sortkey") if namespaces is not None: - cmgen.request["gcmnamespace"] = u"|".join(unicode(ns) + cmgen.request[u"gcmnamespace"] = u"|".join(unicode(ns) for ns in namespaces) return cmgen
+ def getrevisions(self, page=None, getText=False, revids=None, + older=True, limit=None, sysop=False, user=None, + excludeuser=None): + """Retrieve and store revision information.
+ @param page: retrieve the history of this Page (required unless ids + is specified) + @param getText: if True, retrieve the wiki-text of each revision as + well + @param revids: retrieve only the specified revision ids (required + unless page is specified) + @param older: if True, retrieve newest revisions first; otherwise, + retrieve oldest revisions first + @param limit: if specified, retrieve no more than this number of + revisions (defaults to latest revision only) + @type limit: int + @param user: retrieve only revisions authored by this user + @param excludeuser: retrieve all revisions not authored by this user + @param sysop: if True, switch to sysop account (if available) to + retrieve this page + + """ + if page is None and revids is None: + raise ValueError( + "getrevisions needs either page or revids argument.") + if page is not None: + rvtitle = page.title(withSection=False) + rvgen = api.PropertyGenerator(u"revisions", titles=rvtitle) + else: + ids = u"|".join(unicode(r) for r in revids) + rvgen = api.PropertyGenerator(u"revisions", revids=ids) + if getText: + rvgen.request[u"rvprop"] = \ + u"ids|flags|timestamp|user|comment|content" + if page.section(): + rvgen.request[u"rvsection"] = unicode(page.section()) + if limit: + rvgen.request[u"rvlimit"] = unicode(limit) + if not older: + rvgen.request[u"rvdir"] = u"newer" + if user: + rvgen.request[u"rvuser"] = user + elif excludeuser: + rvgen.request[u"rvexcludeuser"] = excludeuser + # TODO if sysop: + for rev in rvgen: + revision = pywikibot.page.Revision(revid=rev['revid'], + timestamp=rev['timestamp'], + user=rev['user'], + anon=rev.has_key('anon'), + comment=rev.get('comment', u''), + minor=rev.has_key('minor'), + text=rev.get('*', None)) + page._revisions[revision.revid] = revision + if revids is None and limit is None and user is None and excludeuser is None: + page._revid = revision.revid + + #### METHODS NOT IMPLEMENTED YET (but may be delegated to Family object) #### class NotImplementedYet:
Modified: branches/rewrite/pywikibot/throttle.py =================================================================== --- branches/rewrite/pywikibot/throttle.py 2008-04-11 13:36:23 UTC (rev 5203) +++ branches/rewrite/pywikibot/throttle.py 2008-04-11 20:29:11 UTC (rev 5204) @@ -35,7 +35,7 @@ """ def __init__(self, site, mindelay=config.minthrottle, maxdelay=config.maxthrottle, - multiplydelay=True): + multiplydelay=True, verbosedelay=False): self.lock = threading.RLock() self.mysite = str(site) self.mindelay = mindelay @@ -48,6 +48,7 @@ self.releasepid = 1800 # Free the process id after this many seconds self.lastwait = 0.0 self.delay = 0 + self.verbosedelay = verbosedelay if multiplydelay: self.checkMultiplicity() self.setDelay(mindelay) @@ -106,9 +107,10 @@ f.write("%(pid)s %(time)s %(site)s\n" % p) f.close() self.process_multiplicity = count - pywikibot.output( + if self.verbosedelay: + pywikibot.output( u"Found %s processes running, including the current process." - % count) + % count) finally: self.lock.release()
pywikipedia-l@lists.wikimedia.org