Revision: 5323
Author:   russblau
Date:     2008-05-07 21:13:48 +0000 (Wed, 07 May 2008)

Log Message:
-----------
site methods allpages, alllinks, allcategories added

Modified Paths:
--------------
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/page.py
    branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py	2008-05-07 19:13:52 UTC (rev 5322)
+++ branches/rewrite/pywikibot/data/api.py	2008-05-07 21:13:48 UTC (rev 5323)
@@ -231,6 +231,7 @@
         # double the next wait, but do not exceed 120 seconds
         self.retry_wait = min(120, self.retry_wait * 2)
 
 
+#TODO - refactor all these generator classes into a parent/subclass hierarchy
 class PageGenerator(object):
     """Iterator for response to a request of type action=query&generator=foo."""
@@ -247,8 +248,11 @@
             raise ValueError("Unrecognized generator '%s'" % generator)
         self.request = Request(action="query", generator=generator, **kwargs)
         # set limit to max, if applicable
+        # FIXME: need to distinguish between the "limit" per API request and an
+        # overall limit on the number of pages to be iterated
         if self.limits[generator]:
-            self.request['g'+self.limits[generator]] = "max"
+            limitkey = 'g' + self.limits[generator]
+            self.request.setdefault(limitkey, "max")
         if 'prop' in self.request:
             self.request['prop'] += "|info|imageinfo"
         else:
@@ -330,6 +334,7 @@
 class CategoryPageGenerator(PageGenerator):
     """Generator that yields Category objects instead of Pages."""
+
     def result(self, pagedata):
         p = PageGenerator.result(self, pagedata)
         return pywikibot.Category(p)
@@ -337,6 +342,7 @@
 class ImagePageGenerator(PageGenerator):
     """Generator that yields ImagePage objects instead of Pages."""
+
     def result(self, pagedata):
         p = PageGenerator.result(self, pagedata)
         image = pywikibot.ImagePage(p)
@@ -350,8 +356,9 @@
     Note that this generator yields one or more dict object(s) corresponding
     to each "page" item(s) from the API response; the calling module has to
-    decide what to do with the contents of the dict."""
+    decide what to do with the contents of the dict.
+
+    """
     def __init__(self, prop, **kwargs):
         """
         Required and optional parameters are as for C{Request}, except that
@@ -409,6 +416,76 @@
         self.request.update(self.data["query-continue"][self.resultkey])
+class ListGenerator(object):
+    """Iterator for queries with action=query&list=... parameters"""
+
+    def __init__(self, listaction, **kwargs):
+        """
+        Required and optional parameters are as for C{Request}, except that
+        action=query is assumed and listaction is required.
+
+        @param listaction: the "list=" type from api.php
+        @type listaction: str
+
+        """
+        if listaction not in self.limits:
+            raise ValueError("Unrecognized list type '%s'" % listaction)
+        self.request = Request(action="query", list=listaction, **kwargs)
+        # set limit to max, if applicable
+        # FIXME: need to distinguish between the "limit" per API request and an
+        # overall limit on the number of pages to be iterated
+        if self.limits[listaction]:
+            limitkey = self.limits[listaction]
+            self.request.setdefault(limitkey, "max")
+        self.resultkey = listaction
+        self.site = self.request.site
+
+    # dict mapping list query types to their limit parameter names
+
+    limits = {'allpages': 'aplimit',
+              'alllinks': 'allimit',
+              'allcategories': 'aclimit',
+              'allusers': 'aulimit',
+              'allimages': 'ailimit',
+              'backlinks': 'bllimit',
+              'blocks': 'bklimit',
+              'categorymembers': 'cmlimit',
+              'embeddedin': 'eilimit',
+              'exturlusage': 'eulimit',
+              'imageusage': 'iulimit',
+              'logevents': 'lelimit',
+              'recentchanges': 'rclimit',
+              'search': 'srlimit',
+              'usercontribs': 'uclimit',
+              'watchlist': 'wllimit',
+              'deletedrevs': 'drlimit',
+              'users': None,
+              'random': 'rnlimit',
+              }
+
+    def __iter__(self):
+        """Iterate objects for elements found in response."""
+        # this looks for the resultkey in the 'query' element
+        while True:
+            self.data = self.request.submit()
+            if not self.data or not isinstance(self.data, dict):
+                raise StopIteration
+            if not ("query" in self.data
+                    and self.resultkey in self.data["query"]):
+                raise StopIteration
+            resultdata = self.data["query"][self.resultkey]
+            assert isinstance(resultdata, list)
+            for item in resultdata:
+                yield item
+            if not "query-continue" in self.data:
+                return
+            if not self.resultkey in self.data["query-continue"]:
+                raise APIError("Unknown",
+                               "Missing '%s' key in ['query-continue'] value.",
+                               data=self.data["query-continue"])
+            self.request.update(self.data["query-continue"][self.resultkey])
+
+
 class LoginManager(login.LoginManager):
     """Supplies getCookie() method to use API interface."""
     def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
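For orientation, the new ListGenerator is used roughly as follows (a minimal
sketch, not part of the commit; it assumes Request falls back to the default
Site from the user's configuration when no site= argument is given, and the
query parameters shown are ordinary api.php ones):

    from pywikibot.data import api

    # iterate raw dicts from list=recentchanges; ListGenerator yields the
    # unparsed API items, so the caller decides how to interpret them
    rcgen = api.ListGenerator("recentchanges", rcnamespace="0")
    for item in rcgen:
        print item['title'], item['timestamp']

    # PageGenerator, by contrast, wraps generator= queries and yields Page objects
    apgen = api.PageGenerator("allpages", gapnamespace="0")
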
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py	2008-05-07 19:13:52 UTC (rev 5322)
+++ branches/rewrite/pywikibot/page.py	2008-05-07 21:13:48 UTC (rev 5323)
@@ -284,7 +284,7 @@
                 raise self._getexception
         if force or not hasattr(self, "_revid") \
                 or not self._revid in self._revisions:
-            self.site().getrevisions(self, getText=True, sysop=sysop)
+            self.site().loadrevisions(self, getText=True, sysop=sysop)
             # TODO: Exception handling for no-page, redirects, etc.
         return self._revisions[self._revid].text
@@ -307,7 +307,7 @@
             logging.debug(
                 "Page.getOldVersion(change_edit_time) option is deprecated.")
         if force or not oldid in self._revisions:
-            self.site().getrevisions(self, getText=True, ids=oldid,
+            self.site().loadrevisions(self, getText=True, ids=oldid,
                                     sysop=sysop)
             # TODO: what about redirects, errors?
         return self._revisions[oldid].text
@@ -324,7 +324,7 @@
     def latestRevision(self):
         """Return the current revision id for this page."""
         if not hasattr(self, '_revid'):
-            self.site().getrevisions(self)
+            self.site().loadrevisions(self)
         return self._revid
     def userName(self):
@@ -664,7 +664,7 @@
         if not self.isRedirectPage():
             raise pywikibot.IsNotRedirectPage
         if not isinstance(self._redir, Page):
-            self.site().pageredirtarget(self)
+            self.site().getredirtarget(self)
         return self._redir
     def getVersionHistory(self, forceReload=False, reverseOrder=False,
@@ -682,7 +682,7 @@
             limit = None
         else:
             limit = revCount
-        return self.site().getrevisions(self, getText=False,
+        return self.site().loadrevisions(self, getText=False,
                                      rvdir=not reverseOrder, limit=limit)
     def getVersionHistoryTable(self, forceReload=False, reverseOrder=False,
@@ -706,7 +706,7 @@
         @return: A generator that yields tuples consisting of revision ID,
             edit date/time, user name and content
         """
-        return self.site().getrevisions(self, withText=True)
+        return self.site().loadrevisions(self, withText=True)
     def contributingUsers(self):
         """Return a set of usernames (or IPs) of users who edited this page."""
@@ -1100,7 +1100,7 @@
             recurse = recurse - 1
         if not hasattr(self, "_subcats"):
             self._subcats = []
-            for member in self.site().pagecategorymembers(self, namespaces=[14]):
+            for member in self.site().categorymembers(self, namespaces=[14]):
                 subcat = Category(self.site(), member.title())
                 self._subcats.append(subcat)
                 yield subcat
@@ -1127,7 +1127,7 @@
         """
         namespaces = [x for x in self.site().namespaces().keys()
                       if x>=0 and x!=14]
-        for member in self.site().pagecategorymembers(self,
+        for member in self.site().categorymembers(self,
                                          namespaces=namespaces):
             yield member
             if recurse:
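The page.py changes above are call-site renames only; the public Page
interface is unchanged. A rough sketch of the delegation after this revision
(the Site factory and page title are illustrative assumptions, not part of the
commit):

    import pywikibot

    site = pywikibot.Site()                # assumed: default Site from user-config
    page = pywikibot.Page(site, u"Example")
    text = page.get()                      # now calls site.loadrevisions(page, getText=True)
    revid = page.latestRevision()          # now calls site.loadrevisions(page)
    history = page.getVersionHistory()     # now calls site.loadrevisions(page, getText=False, ...)
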
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py	2008-05-07 19:13:52 UTC (rev 5322)
+++ branches/rewrite/pywikibot/site.py	2008-05-07 21:13:48 UTC (rev 5323)
@@ -494,7 +494,7 @@
             return self.namespaces()[num]
         return self.namespaces()[num][0]
 
-    def getpageinfo(self, page):
+    def loadpageinfo(self, page):
         """Load page info from api and save in page attributes"""
         title = page.title(withSection=False)
         query = api.PropertyGenerator("info",
@@ -502,21 +502,21 @@
         for pageitem in query:
             if pageitem['title'] != title:
                 raise Error(
-                    u"getpageinfo: Query on %s returned data on '%s'"
+                    u"loadpageinfo: Query on %s returned data on '%s'"
                     % (page, pageitem['title']))
             api.update_page(page, pageitem)
     def page_exists(self, page):
         """Return True if and only if page is an existing page on site."""
         if not hasattr(page, "_pageid"):
-            self.getpageinfo(page)
+            self.loadpageinfo(page)
         return page._pageid > 0
     def page_restrictions(self, page):
         """Returns a dictionary reflecting page protections"""
         if not self.page_exists(page):
             raise NoPage(u'No page %s.' % page)
-        # page_exists called getpageinfo which set protection levels
+        # page_exists called loadpageinfo which set protection levels
         return page._protection
     def page_can_be_edited(self, page):
@@ -537,13 +537,13 @@
     def page_isredirect(self, page):
         """Return True if and only if page is a redirect."""
         if not hasattr(page, "_redir"):
-            self.getpageinfo(page)
+            self.loadpageinfo(page)
         return bool(page._redir)
-    def pageredirtarget(self, page):
+    def getredirtarget(self, page):
         """Return Page object for the redirect target of page."""
         if not hasattr(page, "_redir"):
-            self.getpageinfo(page)
+            self.loadpageinfo(page)
         if not page._redir:
             raise pywikibot.IsNotRedirectPage
         title = page.title(withSection=False)
@@ -554,13 +554,13 @@
         result = query.submit()
         if "query" not in result or "redirects" not in result["query"]:
             raise RuntimeError(
-                "pageredirtarget: No 'redirects' found for page %s."
+                "getredirtarget: No 'redirects' found for page %s."
                 % title)
         redirmap = dict((item['from'], item['to'])
                         for item in result['query']['redirects'])
         if title not in redirmap:
             raise RuntimeError(
-                "pageredirtarget: 'redirects' contains no key for page %s."
+                "getredirtarget: 'redirects' contains no key for page %s."
                 % title)
         if "pages" not in result['query']:
             # no "pages" element indicates a circular redirect
@@ -569,7 +569,7 @@
             # there should be only one value in 'pages', and it is the target
             if pagedata['title'] not in redirmap.values():
                 raise RuntimeError(
-                    "pageredirtarget: target page '%s' not found in 'redirects'"
+                    "getredirtarget: target page '%s' not found in 'redirects'"
                     % pagedata['title'])
             target = pywikibot.Page(self, pagedata['title'], pagedata['ns'])
             api.update_page(target, pagedata)
@@ -733,7 +733,7 @@
                                 for ns in namespaces)
         return tlgen
-    def pagecategorymembers(self, category, namespaces=None):
+    def categorymembers(self, category, namespaces=None):
         """Iterate members of specified category.
 
         @param category: The Category to iterate.
@@ -757,7 +757,7 @@
                                 for ns in namespaces)
         return cmgen
-    def getrevisions(self, page=None, getText=False, revids=None,
+    def loadrevisions(self, page=None, getText=False, revids=None,
                      limit=None, startid=None, endid=None, starttime=None,
                      endtime=None, rvdir=None, user=None, excludeuser=None,
                      section=None, sysop=False):
@@ -811,25 +811,25 @@
         # check for invalid argument combinations
         if page is None and revids is None:
             raise ValueError(
-                "getrevisions: either page or revids argument required")
+                "loadrevisions: either page or revids argument required")
         if (startid is not None or endid is not None) and \
                 (starttime is not None or endtime is not None):
             raise ValueError(
-                "getrevisions: startid/endid combined with starttime/endtime")
+                "loadrevisions: startid/endid combined with starttime/endtime")
         if starttime is not None and endtime is not None:
             if rvdir and starttime >= endtime:
                 raise ValueError(
-                    "getrevisions: starttime > endtime with rvdir=True")
+                    "loadrevisions: starttime > endtime with rvdir=True")
             if (not rvdir) and endtime >= starttime:
                 raise ValueError(
-                    "getrevisions: endtime > starttime with rvdir=False")
+                    "loadrevisions: endtime > starttime with rvdir=False")
         if startid is not None and endid is not None:
             if rvdir and startid >= endid:
                 raise ValueError(
-                    "getrevisions: startid > endid with rvdir=True")
+                    "loadrevisions: startid > endid with rvdir=True")
             if (not rvdir) and endid >= startid:
                 raise ValueError(
-                    "getrevisions: endid > startid with rvdir=False")
+                    "loadrevisions: endid > startid with rvdir=False")
         # assemble API request
         if revids is None:
@@ -866,7 +866,7 @@
             if page is not None:
                 if pagedata['title'] != page.title(withSection=False):
                     raise Error(
-                        u"getrevisions: Query on %s returned data on '%s'"
+                        u"loadrevisions: Query on %s returned data on '%s'"
                         % (page, pagedata['title']))
             else:
                 page = Page(self, pagedata['title'])
@@ -924,8 +924,148 @@
             for linkdata in pageitem['extlinks']:
                 yield linkdata['*']
 
+ def allpages(self, start="!", prefix="", namespace=0, + filterredir=None, filterlanglinks=None, + minsize=None, maxsize=None, + protect_type=None, protect_level=None, + limit=None, reverse=False, includeRedirects=None, + throttle=None): + """Iterate pages in a single namespace.
+
+        Note: parameters includeRedirects and throttle are deprecated and
+        included only for backwards compatibility.
+
+        @param start: Start at this title (page need not exist).
+        @param prefix: Only yield pages starting with this string.
+        @param namespace: Iterate pages from this (single) namespace
+            (default: 0)
+        @param filterredir: if True, only yield redirects; if False (and not
+            None), only yield non-redirects (default: yield both)
+        @param filterlanglinks: if True, only yield pages with language links;
+            if False (and not None), only yield pages without language links
+            (default: yield both)
+        @param minsize: if present, only yield pages at least this many
+            bytes in size
+        @param maxsize: if present, only yield pages at most this many bytes
+            in size
+        @param protect_type: only yield pages that have a protection of the
+            specified type
+        @type protect_type: str
+        @param protect_level: only yield pages that have protection at this
+            level; can only be used if protect_type is specified
+        @param limit: maximum number of pages to iterate (default: iterate
+            all pages in namespace)
+        @param reverse: if True, iterate in reverse Unicode lexicographic
+            order (default: iterate in forward order)
+ """ + if not isinstance(namespace, int): + raise Error("allpages: only one namespace permitted.") + if throttle is not None: + logging.debug("allpages: the 'throttle' parameter is deprecated.") + if includeRedirects is not None: + logging.debug( + "allpages: the 'includeRedirect' parameter is deprecated.") + if includeRedirects: + if includeRedirects == "only": + filterredirs = True + else: + filterredirs = None + else: + filterredirs = False + + apgen = api.PageGenerator("allpages", gapnamespace=str(namespace), + gapfrom=start) + if prefix: + apgen.request["gapprefix"] = prefix + if filterredir is not None: + apgen.request["gapfilterredir"] = (filterredir + and "redirects" + or "nonredirects") + if filterlanglinks is not None: + apgen.request["gapfilterlanglinks"] = (filterlanglinks + and "withlanglinks" + or "withoutlanglinks") + if isinstance(minsize, int): + apgen.request["gapminsize"] = str(minsize) + if isinstance(maxsize, int): + apgen.request["gapmaxsize"] = str(maxsize) + if isinstance(protect_type, basestring): + apgen.request["gapprtype"] = protect_type + if isinstance(protect_level, basestring): + apgen.request["gapprlevel"] = protect_level + if isinstance(limit, int): + apgen.request["gaplimit"] = str(limit) + if reverse: + apgen.request["gapdir"] = "descending" + return apgen + + def alllinks(self, start="!", prefix="", namespace=0, unique=False, + limit=None, fromids=False): + """Iterate all links to pages (which need not exist) in one namespace. + + Note that, in practice, links that were found on pages that have + been deleted may not have been removed from the links table, so this + method can return false positives. + + @param start: Start at this title (page need not exist). + @param prefix: Only yield pages starting with this string. + @param namespace: Iterate pages from this (single) namespace + (default: 0) + @param unique: If True, only iterate each link title once (default: + iterate once for each linking page) + @param limit: maximum number of pages to iterate (default: iterate + all pages in namespace) + @param fromids: if True, include the pageid of the page containing + each link (default: False) as the 'fromid' attribute of the Page; + cannot be combined with unique + + """ + if unique and fromids: + raise Error("alllinks: unique and fromids cannot both be True.") + if not isinstance(namespace, int): + raise Error("alllinks: only one namespace permitted.") + algen = api.ListGenerator("alllinks", alnamespace=str(namespace), + alfrom=start) + if prefix: + algen.request["alprefix"] = prefix + if isinstance(limit, int): + algen.request["allimit"] = str(limit) + if unique: + algen.request["alunique"] = "" + if fromids: + algen.request["alprop"] = "title|ids" + for link in algen: + p = pywikibot.Page(self, link['title'], link['ns']) + if fromids: + p.fromid = link['fromid'] + yield p + + + def allcategories(self, start="!", prefix="", limit=None, + reverse=False): + """Iterate categories used (which need not have a Category page). + + Iterator yields Category objects. + + @param start: Start at this category title (category need not exist). + @param prefix: Only yield categories starting with this string. 
+        @param limit: maximum number of categories to iterate (default:
+            iterate all)
+        @param reverse: if True, iterate in reverse Unicode lexicographic
+            order (default: iterate in forward order)
+
+        """
+        acgen = api.CategoryGenerator("allcategories", gacfrom=start)
+        if prefix:
+            acgen.request["gacprefix"] = prefix
+        if isinstance(limit, int):
+            acgen.request["gaclimit"] = str(limit)
+        if reverse:
+            acgen.request["gacdir"] = "descending"
+        return acgen
+
+
 #### METHODS NOT IMPLEMENTED YET (but may be delegated to Family object) ####
 class NotImplementedYet:
@@ -1806,92 +1946,6 @@
                 if not repeat:
                     break
 
-    def allpages(self, start='!', namespace=0, includeredirects=True,
-                 throttle=True):
-        """Yield all Pages from Special:Allpages.
-
-        Parameters:
-        start   Start at this page. By default, it starts at '!', and yields
-                all pages.
-        namespace   Yield all pages in this namespace; defaults to 0.
-                MediaWiki software will only return pages in one namespace
-                at a time.
-
-        If includeredirects is False, redirects will not be found.
-        If includeredirects equals the string 'only', only redirects
-        will be found. Note that this has not been tested on older
-        versions of the MediaWiki code.
-
-        It is advised not to use this directly, but to use the
-        AllpagesPageGenerator from pagegenerators.py instead.
-
-        """
-        while True:
-            # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
-            start = start.encode(self.encoding())
-            start = urllib.quote(start)
-            # load a list which contains a series of article names (always 480)
-            path = self.allpages_address(start, namespace)
-            output(u'Retrieving Allpages special page for %s from %s, namespace %i' % (repr(self), start, namespace))
-            returned_html = self.getUrl(path)
-            # Try to find begin and end markers
-            try:
-                # In 1.4, another table was added above the navigational links
-                if self.versionnumber() >= 4:
-                    begin_s = '</table><hr /><table'
-                    end_s = '</table'
-                else:
-                    begin_s = '<table'
-                    end_s = '</table'
-                ibegin = returned_html.index(begin_s)
-                iend = returned_html.index(end_s,ibegin + 3)
-            except ValueError:
-                raise ServerError(
-"Couldn't extract allpages special page. Make sure you're using MonoBook skin.")
-            # remove the irrelevant sections
-            returned_html = returned_html[ibegin:iend]
-            if self.versionnumber()==2:
-                R = re.compile('/wiki/(.*?)" *class=[\'"]printable')
-            elif self.versionnumber()<5:
-                # Apparently the special code for redirects was added in 1.5
-                R = re.compile('title ?="(.*?)"')
-            elif not includeredirects:
-                R = re.compile('<td(?: width="33%")?><a href="\S*" +title ?="(.*?)"')
-            elif includeredirects == 'only':
-                R = re.compile('<td(?: width="33%")?><[^<>]*allpagesredirect"><a href="\S*" +title ?="(.*?)"')
-            else:
-                R = re.compile('title ?="(.*?)"')
-            # Count the number of useful links on this page
-            n = 0
-            for hit in R.findall(returned_html):
-                # count how many articles we found on the current page
-                n = n + 1
-                if self.versionnumber()==2:
-                    yield Page(self, url2link(hit, site = self, insite = self))
-                else:
-                    yield Page(self, hit)
-                # save the last hit, so that we know where to continue when we
-                # finished all articles on the current page. Append a '!' so that
-                # we don't yield a page twice.
-                start = Page(self,hit).titleWithoutNamespace() + '!'
-            # A small shortcut: if there are less than 100 pages listed on this
-            # page, there is certainly no next. Probably 480 would do as well,
-            # but better be safe than sorry.
-            if n < 100:
-                if (not includeredirects) or includeredirects == 'only':
-                    # Maybe there were only so few because the rest is or is not a redirect
-                    R = re.compile('title ?="(.*?)"')
-                    allLinks = R.findall(returned_html)
-                    if len(allLinks) < 100:
-                        break
-                    elif n == 0:
-                        # In this special case, no pages of the requested type
-                        # were found, and "start" will remain and be double-encoded.
-                        # Use the last page as the start of the next page.
-                        start = Page(self, allLinks[-1]).titleWithoutNamespace() + '!'
-                else:
-                    break
-
     def prefixindex(self, prefix, namespace=0, includeredirects=True):
         """Yield all pages with a given prefix.
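Taken together, the new Site methods replace the old screen-scraping allpages
generator with API-backed iteration. A rough usage sketch (not part of the
commit; the Site factory, titles, and limits below are illustrative
assumptions, and Page/Category objects are assumed to expose title() as in
the rewrite branch):

    import pywikibot

    site = pywikibot.Site()   # assumed: default Site from user-config

    # first 20 article-namespace pages starting at "A", skipping redirects
    for page in site.allpages(start=u"A", namespace=0,
                              filterredir=False, limit=20):
        print page.title()

    # link targets recorded in namespace 0, each title reported once
    for page in site.alllinks(unique=True, limit=50):
        print page.title()

    # categories in use, whether or not a category page exists
    for cat in site.allcategories(prefix=u"Wiki", limit=10):
        print cat.title()
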