[Pywikipedia-l] SVN: [5323] branches/rewrite/pywikibot
russblau at svn.wikimedia.org
Wed May 7 21:13:49 UTC 2008
Revision: 5323
Author: russblau
Date: 2008-05-07 21:13:48 +0000 (Wed, 07 May 2008)
Log Message:
-----------
site methods allpages, alllinks, allcategories added
Modified Paths:
--------------
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-05-07 19:13:52 UTC (rev 5322)
+++ branches/rewrite/pywikibot/data/api.py 2008-05-07 21:13:48 UTC (rev 5323)
@@ -231,6 +231,7 @@
# double the next wait, but do not exceed 120 seconds
self.retry_wait = min(120, self.retry_wait * 2)
+#TODO - refactor all these generator classes into a parent/subclass hierarchy
class PageGenerator(object):
"""Iterator for response to a request of type action=query&generator=foo."""
@@ -247,8 +248,11 @@
raise ValueError("Unrecognized generator '%s'" % generator)
self.request = Request(action="query", generator=generator, **kwargs)
# set limit to max, if applicable
+ # FIXME: need to distinguish between the "limit" per API request and an
+ # overall limit on the number of pages to be iterated
if self.limits[generator]:
- self.request['g'+self.limits[generator]] = "max"
+ limitkey = 'g' + self.limits[generator]
+ self.request.setdefault(limitkey, "max")
if 'prop' in self.request:
self.request['prop'] += "|info|imageinfo"
else:
@@ -330,6 +334,7 @@
class CategoryPageGenerator(PageGenerator):
"""Generator that yields Category objects instead of Pages."""
+
def result(self, pagedata):
p = PageGenerator.result(self, pagedata)
return pywikibot.Category(p)
@@ -337,6 +342,7 @@
class ImagePageGenerator(PageGenerator):
"""Generator that yields ImagePage objects instead of Pages."""
+
def result(self, pagedata):
p = PageGenerator.result(self, pagedata)
image = pywikibot.ImagePage(p)
@@ -350,8 +356,9 @@
Note that this generator yields one or more dict objects, each corresponding
to a "page" item from the API response; the calling module has to
- decide what to do with the contents of the dict."""
+ decide what to do with the contents of the dict.
+ """
def __init__(self, prop, **kwargs):
"""
Required and optional parameters are as for C{Request}, except that
@@ -409,6 +416,76 @@
self.request.update(self.data["query-continue"][self.resultkey])
+class ListGenerator(object):
+ """Iterator for queries with action=query&list=... parameters"""
+
+ def __init__(self, listaction, **kwargs):
+ """
+ Required and optional parameters are as for C{Request}, except that
+ action=query is assumed and listaction is required.
+
+ @param listaction: the "list=" type from api.php
+ @type listaction: str
+
+ """
+ if listaction not in self.limits:
+ raise ValueError("Unrecognized list type '%s'" % listaction)
+ self.request = Request(action="query", list=listaction, **kwargs)
+ # set limit to max, if applicable
+ # FIXME: need to distinguish between the "limit" per API request and an
+ # overall limit on the number of pages to be iterated
+ if self.limits[listaction]:
+ limitkey = self.limits[listaction]
+ self.request.setdefault(limitkey, "max")
+ self.resultkey = listaction
+ self.site = self.request.site
+
+ # dict mapping list types to their limit parameter names
+ limits = {'allpages': 'aplimit',
+ 'alllinks': 'allimit',
+ 'allcategories': 'aclimit',
+ 'allusers': 'aulimit',
+ 'allimages': 'ailimit',
+ 'backlinks': 'bllimit',
+ 'blocks': 'bklimit',
+ 'categorymembers': 'cmlimit',
+ 'embeddedin': 'eilimit',
+ 'exturlusage': 'eulimit',
+ 'imageusage': 'iulimit',
+ 'logevents': 'lelimit',
+ 'recentchanges': 'rclimit',
+ 'search': 'srlimit',
+ 'usercontribs': 'uclimit',
+ 'watchlist': 'wllimit',
+ 'deletedrevs': 'drlimit',
+ 'users': None,
+ 'random': 'rnlimit',
+ }
+
+ def __iter__(self):
+ """Iterate objects for elements found in response."""
+ # this looks for the resultkey in the 'query' element
+ while True:
+ self.data = self.request.submit()
+ if not self.data or not isinstance(self.data, dict):
+ raise StopIteration
+ if not ("query" in self.data
+ and self.resultkey in self.data["query"]):
+ raise StopIteration
+ resultdata = self.data["query"][self.resultkey]
+ assert isinstance(resultdata, list)
+ for item in resultdata:
+ yield item
+ if not "query-continue" in self.data:
+ return
+ if not self.resultkey in self.data["query-continue"]:
+ raise APIError("Unknown",
+ "Missing '%s' key in ['query-continue'] value.",
+ data=self.data["query-continue"])
+ self.request.update(self.data["query-continue"][self.resultkey])
+
+
class LoginManager(login.LoginManager):
"""Supplies getCookie() method to use API interface."""
def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
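
For context, a minimal sketch of how the new ListGenerator might be driven
(the list type and parameter values here are hypothetical examples; the site
is whatever the internally-created Request resolves to):

    from pywikibot.data import api

    # list=allcategories yields raw dicts such as {'*': u'History'};
    # per the docstring, the caller decides what to do with each item
    acgen = api.ListGenerator("allcategories", acprefix="Hist")
    for itemdata in acgen:
        print itemdata['*']
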
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-05-07 19:13:52 UTC (rev 5322)
+++ branches/rewrite/pywikibot/page.py 2008-05-07 21:13:48 UTC (rev 5323)
@@ -284,7 +284,7 @@
raise self._getexception
if force or not hasattr(self, "_revid") \
or not self._revid in self._revisions:
- self.site().getrevisions(self, getText=True, sysop=sysop)
+ self.site().loadrevisions(self, getText=True, sysop=sysop)
# TODO: Exception handling for no-page, redirects, etc.
return self._revisions[self._revid].text
@@ -307,7 +307,7 @@
logging.debug(
"Page.getOldVersion(change_edit_time) option is deprecated.")
if force or not oldid in self._revisions:
- self.site().getrevisions(self, getText=True, ids=oldid,
+ self.site().loadrevisions(self, getText=True, ids=oldid,
sysop=sysop)
# TODO: what about redirects, errors?
return self._revisions[oldid].text
@@ -324,7 +324,7 @@
def latestRevision(self):
"""Return the current revision id for this page."""
if not hasattr(self, '_revid'):
- self.site().getrevisions(self)
+ self.site().loadrevisions(self)
return self._revid
def userName(self):
@@ -664,7 +664,7 @@
if not self.isRedirectPage():
raise pywikibot.IsNotRedirectPage
if not isinstance(self._redir, Page):
- self.site().pageredirtarget(self)
+ self.site().getredirtarget(self)
return self._redir
def getVersionHistory(self, forceReload=False, reverseOrder=False,
@@ -682,7 +682,7 @@
limit = None
else:
limit = revCount
- return self.site().getrevisions(self, getText=False,
+ return self.site().loadrevisions(self, getText=False,
rvdir=not reverseOrder, limit=limit)
def getVersionHistoryTable(self, forceReload=False, reverseOrder=False,
@@ -706,7 +706,7 @@
@return: A generator that yields tuples consisting of revision ID,
edit date/time, user name and content
"""
- return self.site().getrevisions(self, withText=True)
+ return self.site().loadrevisions(self, getText=True)
def contributingUsers(self):
"""Return a set of usernames (or IPs) of users who edited this page."""
@@ -1100,7 +1100,7 @@
recurse = recurse - 1
if not hasattr(self, "_subcats"):
self._subcats = []
- for member in self.site().pagecategorymembers(self, namespaces=[14]):
+ for member in self.site().categorymembers(self, namespaces=[14]):
subcat = Category(self.site(), member.title())
self._subcats.append(subcat)
yield subcat
@@ -1127,7 +1127,7 @@
"""
namespaces = [x for x in self.site().namespaces().keys()
if x>=0 and x!=14]
- for member in self.site().pagecategorymembers(self,
+ for member in self.site().categorymembers(self,
namespaces=namespaces):
yield member
if recurse:
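
The renames are internal; client code still goes through the Page methods.
A small hypothetical sketch (assuming "page" is an existing pywikibot.Page):

    text = page.get()              # now calls site().loadrevisions(page, getText=True)
    revid = page.latestRevision()  # now calls site().loadrevisions(page)
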
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-05-07 19:13:52 UTC (rev 5322)
+++ branches/rewrite/pywikibot/site.py 2008-05-07 21:13:48 UTC (rev 5323)
@@ -494,7 +494,7 @@
return self.namespaces()[num]
return self.namespaces()[num][0]
- def getpageinfo(self, page):
+ def loadpageinfo(self, page):
"""Load page info from api and save in page attributes"""
title = page.title(withSection=False)
query = api.PropertyGenerator("info",
@@ -502,21 +502,21 @@
for pageitem in query:
if pageitem['title'] != title:
raise Error(
- u"getpageinfo: Query on %s returned data on '%s'"
+ u"loadpageinfo: Query on %s returned data on '%s'"
% (page, pageitem['title']))
api.update_page(page, pageitem)
def page_exists(self, page):
"""Return True if and only if page is an existing page on site."""
if not hasattr(page, "_pageid"):
- self.getpageinfo(page)
+ self.loadpageinfo(page)
return page._pageid > 0
def page_restrictions(self, page):
"""Returns a dictionary reflecting page protections"""
if not self.page_exists(page):
raise NoPage(u'No page %s.' % page)
- # page_exists called getpageinfo which set protection levels
+ # page_exists called loadpageinfo which set protection levels
return page._protection
def page_can_be_edited(self, page):
@@ -537,13 +537,13 @@
def page_isredirect(self, page):
"""Return True if and only if page is a redirect."""
if not hasattr(page, "_redir"):
- self.getpageinfo(page)
+ self.loadpageinfo(page)
return bool(page._redir)
- def pageredirtarget(self, page):
+ def getredirtarget(self, page):
"""Return Page object for the redirect target of page."""
if not hasattr(page, "_redir"):
- self.getpageinfo(page)
+ self.loadpageinfo(page)
if not page._redir:
raise pywikibot.IsNotRedirectPage
title = page.title(withSection=False)
@@ -554,13 +554,13 @@
result = query.submit()
if "query" not in result or "redirects" not in result["query"]:
raise RuntimeError(
- "pageredirtarget: No 'redirects' found for page %s."
+ "getredirtarget: No 'redirects' found for page %s."
% title)
redirmap = dict((item['from'], item['to'])
for item in result['query']['redirects'])
if title not in redirmap:
raise RuntimeError(
- "pageredirtarget: 'redirects' contains no key for page %s."
+ "getredirtarget: 'redirects' contains no key for page %s."
% title)
if "pages" not in result['query']:
# no "pages" element indicates a circular redirect
@@ -569,7 +569,7 @@
# there should be only one value in 'pages', and it is the target
if pagedata['title'] not in redirmap.values():
raise RuntimeError(
- "pageredirtarget: target page '%s' not found in 'redirects'"
+ "getredirtarget: target page '%s' not found in 'redirects'"
% pagedata['title'])
target = pywikibot.Page(self, pagedata['title'], pagedata['ns'])
api.update_page(target, pagedata)
@@ -733,7 +733,7 @@
for ns in namespaces)
return tlgen
- def pagecategorymembers(self, category, namespaces=None):
+ def categorymembers(self, category, namespaces=None):
"""Iterate members of specified category.
@param category: The Category to iterate.
@@ -757,7 +757,7 @@
for ns in namespaces)
return cmgen
- def getrevisions(self, page=None, getText=False, revids=None,
+ def loadrevisions(self, page=None, getText=False, revids=None,
limit=None, startid=None, endid=None, starttime=None,
endtime=None, rvdir=None, user=None, excludeuser=None,
section=None, sysop=False):
@@ -811,25 +811,25 @@
# check for invalid argument combinations
if page is None and revids is None:
raise ValueError(
- "getrevisions: either page or revids argument required")
+ "loadrevisions: either page or revids argument required")
if (startid is not None or endid is not None) and \
(starttime is not None or endtime is not None):
raise ValueError(
- "getrevisions: startid/endid combined with starttime/endtime")
+ "loadrevisions: startid/endid combined with starttime/endtime")
if starttime is not None and endtime is not None:
if rvdir and starttime >= endtime:
raise ValueError(
- "getrevisions: starttime > endtime with rvdir=True")
+ "loadrevisions: starttime > endtime with rvdir=True")
if (not rvdir) and endtime >= starttime:
raise ValueError(
- "getrevisions: endtime > starttime with rvdir=False")
+ "loadrevisions: endtime > starttime with rvdir=False")
if startid is not None and endid is not None:
if rvdir and startid >= endid:
raise ValueError(
- "getrevisions: startid > endid with rvdir=True")
+ "loadrevisions: startid > endid with rvdir=True")
if (not rvdir) and endid >= startid:
raise ValueError(
- "getrevisions: endid > startid with rvdir=False")
+ "loadrevisions: endid > startid with rvdir=False")
# assemble API request
if revids is None:
@@ -866,7 +866,7 @@
if page is not None:
if pagedata['title'] != page.title(withSection=False):
raise Error(
- u"getrevisions: Query on %s returned data on '%s'"
+ u"loadrevisions: Query on %s returned data on '%s'"
% (page, pagedata['title']))
else:
page = Page(self, pagedata['title'])
@@ -924,8 +924,148 @@
for linkdata in pageitem['extlinks']:
yield linkdata['*']
+ def allpages(self, start="!", prefix="", namespace=0,
+ filterredir=None, filterlanglinks=None,
+ minsize=None, maxsize=None,
+ protect_type=None, protect_level=None,
+ limit=None, reverse=False, includeRedirects=None,
+ throttle=None):
+ """Iterate pages in a single namespace.
+ Note: parameters includeRedirects and throttle are deprecated and
+ included only for backwards compatibility.
+
+ @param start: Start at this title (page need not exist).
+ @param prefix: Only yield pages starting with this string.
+ @param namespace: Iterate pages from this (single) namespace
+ (default: 0)
+ @param filterredir: if True, only yield redirects; if False (and not
+ None), only yield non-redirects (default: yield both)
+ @param filterlanglinks: if True, only yield pages with language links;
+ if False (and not None), only yield pages without language links
+ (default: yield both)
+ @param minsize: if present, only yield pages at least this many
+ bytes in size
+ @param maxsize: if present, only yield pages at most this many bytes
+ in size
+ @param protect_type: only yield pages that have a protection of the
+ specified type
+ @type protect_type: str
+ @param protect_level: only yield pages that have protection at this
+ level; can only be used if protect_type is specified
+ @param limit: maximum number of pages to iterate (default: iterate
+ all pages in namespace)
+ @param reverse: if True, iterate in reverse Unicode lexicographic
+ order (default: iterate in forward order)
+ """
+ if not isinstance(namespace, int):
+ raise Error("allpages: only one namespace permitted.")
+ if throttle is not None:
+ logging.debug("allpages: the 'throttle' parameter is deprecated.")
+ if includeRedirects is not None:
+ logging.debug(
+ "allpages: the 'includeRedirect' parameter is deprecated.")
+ if includeRedirects:
+ if includeRedirects == "only":
+ filterredirs = True
+ else:
+ filterredirs = None
+ else:
+ filterredirs = False
+
+ apgen = api.PageGenerator("allpages", gapnamespace=str(namespace),
+ gapfrom=start)
+ if prefix:
+ apgen.request["gapprefix"] = prefix
+ if filterredir is not None:
+ apgen.request["gapfilterredir"] = (filterredir
+ and "redirects"
+ or "nonredirects")
+ if filterlanglinks is not None:
+ apgen.request["gapfilterlanglinks"] = (filterlanglinks
+ and "withlanglinks"
+ or "withoutlanglinks")
+ if isinstance(minsize, int):
+ apgen.request["gapminsize"] = str(minsize)
+ if isinstance(maxsize, int):
+ apgen.request["gapmaxsize"] = str(maxsize)
+ if isinstance(protect_type, basestring):
+ apgen.request["gapprtype"] = protect_type
+ if isinstance(protect_level, basestring):
+ apgen.request["gapprlevel"] = protect_level
+ if isinstance(limit, int):
+ apgen.request["gaplimit"] = str(limit)
+ if reverse:
+ apgen.request["gapdir"] = "descending"
+ return apgen
+
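
A hypothetical usage sketch ("mysite" standing in for an APISite instance):

    # iterate up to five non-redirect main-namespace pages starting at "Wiki"
    for page in mysite.allpages(start="Wiki", filterredir=False, limit=5):
        print page.title()
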
+ def alllinks(self, start="!", prefix="", namespace=0, unique=False,
+ limit=None, fromids=False):
+ """Iterate all links to pages (which need not exist) in one namespace.
+
+ Note that, in practice, links that were found on pages that have
+ been deleted may not have been removed from the links table, so this
+ method can return false positives.
+
+ @param start: Start at this title (page need not exist).
+ @param prefix: Only yield pages starting with this string.
+ @param namespace: Iterate pages from this (single) namespace
+ (default: 0)
+ @param unique: If True, only iterate each link title once (default:
+ iterate once for each linking page)
+ @param limit: maximum number of pages to iterate (default: iterate
+ all pages in namespace)
+ @param fromids: if True, include the pageid of the page containing
+ each link (default: False) as the 'fromid' attribute of the Page;
+ cannot be combined with unique
+
+ """
+ if unique and fromids:
+ raise Error("alllinks: unique and fromids cannot both be True.")
+ if not isinstance(namespace, int):
+ raise Error("alllinks: only one namespace permitted.")
+ algen = api.ListGenerator("alllinks", alnamespace=str(namespace),
+ alfrom=start)
+ if prefix:
+ algen.request["alprefix"] = prefix
+ if isinstance(limit, int):
+ algen.request["allimit"] = str(limit)
+ if unique:
+ algen.request["alunique"] = ""
+ if fromids:
+ algen.request["alprop"] = "title|ids"
+ for link in algen:
+ p = pywikibot.Page(self, link['title'], link['ns'])
+ if fromids:
+ p.fromid = link['fromid']
+ yield p
+
+
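
A matching hypothetical sketch for alllinks (again with "mysite" standing in
for an APISite instance):

    # alllinks is itself a generator function, so it is iterated directly;
    # with fromids=True each yielded Page carries a 'fromid' attribute
    for link in mysite.alllinks(prefix="Wiki", fromids=True, limit=10):
        print link.title(), link.fromid
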
+ def allcategories(self, start="!", prefix="", limit=None,
+ reverse=False):
+ """Iterate categories used (which need not have a Category page).
+
+ Iterator yields Category objects.
+
+ @param start: Start at this category title (category need not exist).
+ @param prefix: Only yield categories starting with this string.
+ @param limit: maximum number of categories to iterate (default:
+ iterate all)
+ @param reverse: if True, iterate in reverse Unicode lexicographic
+ order (default: iterate in forward order)
+
+ """
+ acgen = api.CategoryGenerator("allcategories", gacfrom=start)
+ if prefix:
+ acgen.request["gacprefix"] = prefix
+ if isinstance(limit, int):
+ acgen.request["gaclimit"] = str(limit)
+ if reverse:
+ acgen.request["gacdir"] = "descending"
+ return acgen
+
+
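
And one for allcategories (values hypothetical):

    # yields Category objects, including categories whose description
    # page does not exist
    for cat in mysite.allcategories(prefix="People", limit=10):
        print cat.title()
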
#### METHODS NOT IMPLEMENTED YET (but may be delegated to Family object) ####
class NotImplementedYet:
@@ -1806,92 +1946,6 @@
if not repeat:
break
- def allpages(self, start='!', namespace=0, includeredirects=True,
- throttle=True):
- """Yield all Pages from Special:Allpages.
-
- Parameters:
- start Start at this page. By default, it starts at '!', and yields
- all pages.
- namespace Yield all pages in this namespace; defaults to 0.
- MediaWiki software will only return pages in one namespace
- at a time.
-
- If includeredirects is False, redirects will not be found.
- If includeredirects equals the string 'only', only redirects
- will be found. Note that this has not been tested on older
- versions of the MediaWiki code.
-
- It is advised not to use this directly, but to use the
- AllpagesPageGenerator from pagegenerators.py instead.
-
- """
- while True:
- # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
- start = start.encode(self.encoding())
- start = urllib.quote(start)
- # load a list which contains a series of article names (always 480)
- path = self.allpages_address(start, namespace)
- output(u'Retrieving Allpages special page for %s from %s, namespace %i' % (repr(self), start, namespace))
- returned_html = self.getUrl(path)
- # Try to find begin and end markers
- try:
- # In 1.4, another table was added above the navigational links
- if self.versionnumber() >= 4:
- begin_s = '</table><hr /><table'
- end_s = '</table'
- else:
- begin_s = '<table'
- end_s = '</table'
- ibegin = returned_html.index(begin_s)
- iend = returned_html.index(end_s,ibegin + 3)
- except ValueError:
- raise ServerError(
-"Couldn't extract allpages special page. Make sure you're using MonoBook skin.")
- # remove the irrelevant sections
- returned_html = returned_html[ibegin:iend]
- if self.versionnumber()==2:
- R = re.compile('/wiki/(.*?)\" *class=[\'\"]printable')
- elif self.versionnumber()<5:
- # Apparently the special code for redirects was added in 1.5
- R = re.compile('title ?=\"(.*?)\"')
- elif not includeredirects:
- R = re.compile('\<td(?: width="33%")?\>\<a href=\"\S*\" +title ?="(.*?)"')
- elif includeredirects == 'only':
- R = re.compile('\<td(?: width="33%")?>\<[^\<\>]*allpagesredirect\"\>\<a href=\"\S*\" +title ?="(.*?)"')
- else:
- R = re.compile('title ?=\"(.*?)\"')
- # Count the number of useful links on this page
- n = 0
- for hit in R.findall(returned_html):
- # count how many articles we found on the current page
- n = n + 1
- if self.versionnumber()==2:
- yield Page(self, url2link(hit, site = self, insite = self))
- else:
- yield Page(self, hit)
- # save the last hit, so that we know where to continue when we
- # finished all articles on the current page. Append a '!' so that
- # we don't yield a page twice.
- start = Page(self,hit).titleWithoutNamespace() + '!'
- # A small shortcut: if there are less than 100 pages listed on this
- # page, there is certainly no next. Probably 480 would do as well,
- # but better be safe than sorry.
- if n < 100:
- if (not includeredirects) or includeredirects == 'only':
- # Maybe there were only so few because the rest is or is not a redirect
- R = re.compile('title ?=\"(.*?)\"')
- allLinks = R.findall(returned_html)
- if len(allLinks) < 100:
- break
- elif n == 0:
- # In this special case, no pages of the requested type
- # were found, and "start" will remain and be double-encoded.
- # Use the last page as the start of the next page.
- start = Page(self, allLinks[-1]).titleWithoutNamespace() + '!'
- else:
- break
-
def prefixindex(self, prefix, namespace=0, includeredirects=True):
"""Yield all pages with a given prefix.