Revision: 6156
Author:   russblau
Date:     2008-12-16 19:40:20 +0000 (Tue, 16 Dec 2008)

Log Message:
-----------
update properties

Modified Paths:
--------------
    branches/rewrite/pywikibot/catlib.py
    branches/rewrite/pywikibot/exceptions.py
    branches/rewrite/pywikibot/page.py
    branches/rewrite/pywikibot/pagegenerators.py
    branches/rewrite/pywikibot/site.py
    branches/rewrite/pywikibot/textlib.py
    branches/rewrite/pywikibot/throttle.py
    branches/rewrite/pywikibot/tools.py

Property Changed:
----------------
    branches/rewrite/pywikibot/__init__.py
    branches/rewrite/pywikibot/bot.py
    branches/rewrite/pywikibot/catlib.py
    branches/rewrite/pywikibot/exceptions.py
    branches/rewrite/pywikibot/page.py
    branches/rewrite/pywikibot/pagegenerators.py
    branches/rewrite/pywikibot/site.py
    branches/rewrite/pywikibot/textlib.py
    branches/rewrite/pywikibot/throttle.py
    branches/rewrite/pywikibot/tools.py

Property changes on: branches/rewrite/pywikibot/__init__.py
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision

Property changes on: branches/rewrite/pywikibot/bot.py
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision

Modified: branches/rewrite/pywikibot/catlib.py
===================================================================
--- branches/rewrite/pywikibot/catlib.py    2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/catlib.py    2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,22 +1,22 @@
-# -*- coding: utf-8 -*-
-"""
-WARNING: THIS MODULE EXISTS SOLELY TO PROVIDE BACKWARDS-COMPATIBILITY.
-
-Do not use in new scripts; use the source to find the appropriate
-function/method instead.
-
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-
-from pywikibot import Category
-
-
-def change_category(article, oldCat, newCat, comment=None, sortKey=None,
-                    inPlace=True):
-    return article.change_category(oldCat, newCat, comment, sortKey, inPlace)
+# -*- coding: utf-8 -*-
+"""
+WARNING: THIS MODULE EXISTS SOLELY TO PROVIDE BACKWARDS-COMPATIBILITY.
+
+Do not use in new scripts; use the source to find the appropriate
+function/method instead.
+
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+
+from pywikibot import Category
+
+
+def change_category(article, oldCat, newCat, comment=None, sortKey=None,
+                    inPlace=True):
+    return article.change_category(oldCat, newCat, comment, sortKey, inPlace)

Property changes on: branches/rewrite/pywikibot/catlib.py
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision
Added: svn:eol-style
   + native

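The catlib.py module above exists only as a backwards-compatibility shim: it forwards the old function-style call to the new Page.change_category() method. A minimal sketch of both calling styles, assuming a configured user-config.py; the page and category titles are placeholders, not part of this commit::

    import pywikibot
    from pywikibot import catlib

    site = pywikibot.Site()                     # default site from user-config.py
    page = pywikibot.Page(site, u"Example page", ns=0)
    old_cat = pywikibot.Category(site, u"Category:Old name")
    new_cat = pywikibot.Category(site, u"Category:New name")

    # Old-style call, kept only so existing scripts keep working:
    catlib.change_category(page, old_cat, new_cat, comment=u"Recategorizing")
    # ...which simply delegates to the rewritten Page method:
    #   page.change_category(old_cat, new_cat, comment=u"Recategorizing")
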
Modified: branches/rewrite/pywikibot/exceptions.py
===================================================================
--- branches/rewrite/pywikibot/exceptions.py    2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/exceptions.py    2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,87 +1,87 @@
-# -*- coding: utf-8 -*-
-"""
-Exception classes used throughout the framework.
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-
-import sys
-
-# TODO: These are copied from wikipedia.py; not certain that all of them
-# will be needed in the rewrite.
-
-class Error(Exception):
-    """Wikipedia error"""
-    def __init__(self, arg):
-        try:
-            self.string = arg.encode(sys.stderr.encoding, "xmlcharrefreplace")
-        except (AttributeError, TypeError):
-            self.string = arg.encode("ascii", "xmlcharrefreplace")
-    def __str__(self):
-        return self.string
-
-class NoUsername(Error):
-    """Username is not in user-config.py"""
-
-class NoPage(Error):
-    """Page does not exist"""
-
-class NoSuchSite(Error):
-    """Site does not exist"""
-
-class IsRedirectPage(Error):
-    """Page is a redirect page"""
-
-class IsNotRedirectPage(Error):
-    """Page is not a redirect page"""
-
-class CircularRedirect(Error):
-    """Page is a circular redirect
-
-    Exception argument is the redirect target; this may be the same title
-    as this page or a different title (in which case the target page directly
-    or indirectly redirects back to this one)
-
-    """
-
-class LockedPage(Error):
-    """Page is locked"""
-
-class SectionError(Error):
-    """The section specified by # does not exist"""
-
-class PageNotSaved(Error):
-    """Saving the page has failed"""
-
-class EditConflict(PageNotSaved):
-    """There has been an edit conflict while uploading the page"""
-
-class SpamfilterError(PageNotSaved):
-    """Saving the page has failed because the MediaWiki spam filter detected a blacklisted URL."""
-    def __init__(self, arg):
-        self.url = arg
-        self.args = arg,
-
-class ServerError(Error):
-    """Got unexpected server response"""
-
-class BadTitle(Error):
-    """Server responded with BadTitle."""
-
-# UserBlocked exceptions should in general not be caught. If the bot has
-# been blocked, the bot operator should address the reason for the block
-# before continuing.
-class UserBlocked(Error):
-    """Your username or IP has been blocked"""
-
-class PageNotFound(Error):
-    """Page not found in list"""
-
-class CaptchaError(Error):
-    """Captcha is asked and config.solve_captcha == False."""
-
+# -*- coding: utf-8 -*-
+"""
+Exception classes used throughout the framework.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+
+import sys
+
+# TODO: These are copied from wikipedia.py; not certain that all of them
+# will be needed in the rewrite.
+
+class Error(Exception):
+    """Wikipedia error"""
+    def __init__(self, arg):
+        try:
+            self.string = arg.encode(sys.stderr.encoding, "xmlcharrefreplace")
+        except (AttributeError, TypeError):
+            self.string = arg.encode("ascii", "xmlcharrefreplace")
+    def __str__(self):
+        return self.string
+
+class NoUsername(Error):
+    """Username is not in user-config.py"""
+
+class NoPage(Error):
+    """Page does not exist"""
+
+class NoSuchSite(Error):
+    """Site does not exist"""
+
+class IsRedirectPage(Error):
+    """Page is a redirect page"""
+
+class IsNotRedirectPage(Error):
+    """Page is not a redirect page"""
+
+class CircularRedirect(Error):
+    """Page is a circular redirect
+
+    Exception argument is the redirect target; this may be the same title
+    as this page or a different title (in which case the target page directly
+    or indirectly redirects back to this one)
+
+    """
+
+class LockedPage(Error):
+    """Page is locked"""
+
+class SectionError(Error):
+    """The section specified by # does not exist"""
+
+class PageNotSaved(Error):
+    """Saving the page has failed"""
+
+class EditConflict(PageNotSaved):
+    """There has been an edit conflict while uploading the page"""
+
+class SpamfilterError(PageNotSaved):
+    """Saving the page has failed because the MediaWiki spam filter detected a blacklisted URL."""
+    def __init__(self, arg):
+        self.url = arg
+        self.args = arg,
+
+class ServerError(Error):
+    """Got unexpected server response"""
+
+class BadTitle(Error):
+    """Server responded with BadTitle."""
+
+# UserBlocked exceptions should in general not be caught. If the bot has
+# been blocked, the bot operator should address the reason for the block
+# before continuing.
+class UserBlocked(Error):
+    """Your username or IP has been blocked"""
+
+class PageNotFound(Error):
+    """Page not found in list"""
+
+class CaptchaError(Error):
+    """Captcha is asked and config.solve_captcha == False."""
+

Property changes on: branches/rewrite/pywikibot/exceptions.py
___________________________________________________________________
Added: svn:keywords
   + Author Date Id Revision
Added: svn:eol-style
   + native

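These exception classes are what bot code is expected to catch; for example, Page.get() in the page.py diff below documents NoPage, IsRedirectPage and SectionError. A minimal sketch (not part of this commit; the page title is a placeholder)::

    import pywikibot

    site = pywikibot.Site()
    page = pywikibot.Page(site, u"Some title", ns=0)

    try:
        text = page.get()
    except pywikibot.NoPage:
        print "Page does not exist"
    except pywikibot.IsRedirectPage, err:
        # the exception argument is the title the page redirects to
        print "Page is a redirect:", err
    except pywikibot.SectionError:
        print "The #section given in the title does not exist"
    else:
        print "Fetched %d characters" % len(text)
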
Modified: branches/rewrite/pywikibot/page.py =================================================================== --- branches/rewrite/pywikibot/page.py 2008-12-16 19:34:48 UTC (rev 6155) +++ branches/rewrite/pywikibot/page.py 2008-12-16 19:40:20 UTC (rev 6156) @@ -1,1886 +1,1886 @@ -# -*- coding: utf-8 -*- -""" -Objects representing various types of MediaWiki pages. -""" -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id: $' - -import pywikibot -from pywikibot import deprecate_arg -from pywikibot import config -import pywikibot.site -import pywikibot.textlib - -import htmlentitydefs -import logging -import re -import sys -import threading -import unicodedata -import urllib - -logger = logging.getLogger("wiki") - -reNamespace = re.compile("^(.+?) *: *(.*)$") - - -class Page(object): - """Page: A MediaWiki page - - This object only implements internally methods that do not require - reading from or writing to the wiki. All other methods are delegated - to the Site object. - - """ - - @deprecate_arg("insite", None) - @deprecate_arg("defaultNamespace", None) - def __init__(self, source, title=u"", ns=0): - """Instantiate a Page object. - - Three calling formats are supported: - - - If the first argument is a Page, create a copy of that object. - This can be used to convert an existing Page into a subclass - object, such as Category or ImagePage. (If the title is also - given as the second argument, creates a copy with that title; - this is used when pages are moved.) - - If the first argument is a Site, create a Page on that Site - using the second argument as the title (may include a section), - and the third as the namespace number. The namespace number is - mandatory, even if the title includes the namespace prefix. This - is the preferred syntax when using an already-normalized title - obtained from api.php or a database dump. WARNING: may produce - invalid objects if page title isn't in normal form! - - If the first argument is a Link, create a Page from that link. - This is the preferred syntax when using a title scraped from - wikitext, URLs, or another non-normalized source. - - @param source: the source of the page - @type source: Link, Page (or subclass), or Site - @param title: normalized title of the page; required if source is a - Site, ignored otherwise - @type title: unicode - @param ns: namespace number; required if source is a Site, ignored - otherwise - @type ns: int - - """ - if isinstance(source, pywikibot.site.BaseSite): - self._site = source - if ns not in source.namespaces(): - raise pywikibot.Error( - "Invalid namespace '%i' for site %s." 
- % (ns, source.sitename())) - self._ns = ns - if ns and not title.startswith(source.namespace(ns)+u":"): - title = source.namespace(ns) + u":" + title - elif not ns and u":" in title: - pos = title.index(u':') - nsindex = source.ns_index(title[ :pos]) - if nsindex: - self._ns = nsindex - if u"#" in title: - title, self._section = title.split(u"#", 1) - else: - self._section = None - if not title: - raise pywikibot.Error( - "Page object cannot be created from Site without title.") - self._title = title - elif isinstance(source, Page): - # copy all of source's attributes to this object - self.__dict__ = source.__dict__ - if title: - # overwrite title - if ":" in title: - prefix = title[ :title.index(":")] - self._ns = site.ns_index(prefix) - if self._ns is None: - self._ns = 0 - else: - title = title[title.index(":")+1 : ].strip(" _") - self._title = "%s:%s" % ( - self.site().namespace(self._ns), - self._title) - else: - self._ns = 0 - if "#" in title: - self._section = title[title.index("#") + 1 : ].strip(" _") - title = title[ : title.index("#")].strip(" _") - self._title = title - elif isinstance(source, Link): - self._site = source.site - self._section = source.section - self._ns = source.namespace - self._title = source.title - # reassemble the canonical title from components - if self._ns: - self._title = "%s:%s" % (self.site().namespace(self._ns), - self._title) - else: - raise pywikibot.Error( - "Invalid argument type '%s' in Page constructor: %s" - % (type(source), source)) - if self._section is not None: - self._title = self._title + "#" + self._section - self._revisions = {} - - def site(self): - """Return the Site object for the wiki on which this Page resides.""" - return self._site - - def namespace(self): - """Return the number of the namespace of the page.""" - return self._ns - - @deprecate_arg("decode", None) - @deprecate_arg("savetitle", "asUrl") - def title(self, underscore=False, savetitle=False, withNamespace=True, - withSection=True, asUrl=False, asLink=False, - allowInterwiki=True, forceInterwiki=False, textlink=False, - as_filename=False): - """Return the title of this Page, as a Unicode string. 
- - @param underscore: if true, replace all ' ' characters with '_' - @param withNamespace: if false, omit the namespace prefix - @param withSection: if false, omit the section - @param asUrl: if true, quote title as if in an URL - @param asLink: if true, return the title in the form of a wikilink - @param allowInterwiki: (only used if asLink is true) if true, format - the link as an interwiki link if necessary - @param forceInterwiki: (only used if asLink is true) if true, always - format the link as an interwiki link - @param textlink: (only used if asLink is true) if true, place a ':' - before Category: and Image: links - @param as_filename: if true, replace any characters that are unsafe - in filenames - - """ - title = self._title - if not withNamespace and self._ns != 0: - title = title.split(u':', 1)[1] - if not withSection and self._section: - title = title.split(u'#', 1)[0] - if underscore or asUrl: - title = title.replace(u' ', u'_') - if asUrl: - encodedTitle = title.encode(self.site().encoding()) - title = urllib.quote(encodedTitle) - if asLink: - if forceInterwiki or (allowInterwiki and - (self.site().family.name != config.family - or self.site().code != config.mylang)): - if self.site().family.name != config.family \ - and self.site().family.name != self.site().code: - return u'[[%s:%s:%s]]' % (self.site().family.name, - self.site().code, - self._title) - else: - # use this form for sites like commons, where the - # code is the same as the family name - return u'[[%s:%s]]' % (self.site().code, - self._title) - elif textlink and (self.isImage() or self.isCategory()): - return u'[[:%s]]' % title - else: - return u'[[%s]]' % title - if as_filename: - # Replace characters that are not possible in file names on some - # systems. - # Spaces are possible on most systems, but are bad for URLs. - for forbidden in ':*?/\ ': - title = title.replace(forbidden, '_') - return title - - @deprecate_arg("decode", None) - @deprecate_arg("underscore", None) - def section(self): - """Return the name of the section this Page refers to. - - The section is the part of the title following a '#' character, if - any. If no section is present, return None. - - """ - if self._section: - return self._section - else: - return None - - def __str__(self): - """Return a console representation of the pagelink.""" - return self.title(asLink=True, forceInterwiki=True - ).encode(sys.stderr.encoding) - - def __unicode__(self): - return self.title(asLink=True, forceInterwiki=True) - - def __repr__(self): - """Return a more complete string representation.""" - return u"%s(%s)" % (self.__class__.__name__, - self.title().encode(sys.stderr.encoding)) - - def __cmp__(self, other): - """Test for equality and inequality of Page objects. - - Page objects are "equal" if and only if they are on the same site - and have the same normalized title, including section if any. - - Page objects are sortable by namespace first, then by title. - - """ - if not isinstance(other, Page): - # especially, return -1 if other is None - return -1 - if not self.site() == other.site(): - return cmp(self.site(), other.site()) - if self.namespace() != other.namespace(): - return cmp(self.namespace(), other.namespace()) - owntitle = self.title(withNamespace=False) - othertitle = other.title(withNamespace=False) - return cmp(owntitle, othertitle) - - def __hash__(self): - # Pseudo method that makes it possible to store Page objects as keys - # in hash-tables. 
This relies on the fact that the string - # representation of an instance can not change after the construction. - return hash(unicode(self)) - - def autoFormat(self): - """Return L{date.autoFormat} dictName and value, if any. - - Value can be a year, date, etc., and dictName is 'YearBC', - 'Year_December', or another dictionary name. Please note that two - entries may have exactly the same autoFormat, but be in two - different namespaces, as some sites have categories with the - same names. Regular titles return (None, None). - - """ - if not hasattr(self, '_autoFormat'): - from pywikibot import date - self._autoFormat = date.getAutoFormat( - self.site().code, - self.title(withNamespace=False) - ) - return self._autoFormat - - def isAutoTitle(self): - """Return True if title of this Page is in the autoFormat dictionary.""" - return self.autoFormat()[0] is not None - - @deprecate_arg("throttle", None) - @deprecate_arg("nofollow_redirects", None) - @deprecate_arg("change_edit_time", None) - def get(self, force=False, get_redirect=False, sysop=False): - """Return the wiki-text of the page. - - This will retrieve the page from the server if it has not been - retrieved yet, or if force is True. This can raise the following - exceptions that should be caught by the calling code: - - - NoPage: The page does not exist - - IsRedirectPage: The page is a redirect. The argument of the - exception is the title of the page it redirects to. - - SectionError: The section does not exist on a page with a # - link - - @param force: reload all page attributes, including errors. - @param get_redirect: return the redirect text, do not follow the - redirect, do not raise an exception. - @param sysop: if the user has a sysop account, use it to retrieve - this page - - """ - if force: - # When forcing, we retry the page no matter what. Old exceptions - # do not apply any more. - for attr in ['_redirarg', '_getexception']: - if hasattr(self, attr): - delattr(self,attr) - else: - # Make sure we re-raise an exception we got on an earlier attempt - if hasattr(self, '_redirarg') and not get_redirect: - raise pywikibot.IsRedirectPage, self._redirarg - elif hasattr(self, '_getexception'): - raise self._getexception - if force or not hasattr(self, "_revid") \ - or not self._revid in self._revisions \ - or self._revisions[self._revid].text is None: - self.site().loadrevisions(self, getText=True, sysop=sysop) - # TODO: Exception handling for no-page, redirects, etc. - - return self._revisions[self._revid].text - - @deprecate_arg("throttle", None) - @deprecate_arg("nofollow_redirects", None) - @deprecate_arg("change_edit_time", None) - def getOldVersion(self, oldid, force=False, get_redirect=False, - sysop=False): - """Return text of an old revision of this page; same options as get(). - - @param oldid: The revid of the revision desired. - - """ - if force or not oldid in self._revisions \ - or self._revisions[oldid].text is None: - self.site().loadrevisions(self, getText=True, revids=oldid, - sysop=sysop) - # TODO: what about redirects, errors? 
- return self._revisions[oldid].text - - def permalink(self): - """Return the permalink URL for current revision of this page.""" - return "%s://%s/%sindex.php?title=%s&oldid=%s" \ - % (self.site().protocol(), - self.site().hostname(), - self.site().scriptpath(), - self.title(asUrl=True), - self.latestRevision()) - - def latestRevision(self): - """Return the current revision id for this page.""" - if not hasattr(self, '_revid'): - self.site().loadrevisions(self) - return self._revid - - def _textgetter(self): - """Return the current (edited) wikitext, loading it if necessary.""" - if not hasattr(self, '_text') or self._text is None: - try: - self._text = self.get() - except pywikibot.NoPage: - # TODO: what other exceptions might be returned? - self._text = u"" - return self._text - - def _textsetter(self, value): - """Update the edited wikitext""" - self._text = unicode(value) - - def _cleartext(self): - """Delete the edited wikitext""" - if hasattr(self, "_text"): - del self._text - - text = property(_textgetter, _textsetter, _cleartext, - "The edited wikitext (unicode) of this Page") - - def expand_text(self): - """Return the page text with all templates expanded.""" - req = pywikibot.data.api.Request(action="expandtemplates", - text=self.text, - title=self.title(withSection=False), - site=self.site()) - result = req.submit() - return result["expandtemplates"]["*"] - - def userName(self): - """Return name or IP address of last user to edit page.""" - return self._revisions[self.latestRevision()].user - - def isIpEdit(self): - """Return True if last editor was unregistered.""" - return self._revisions[self.latestRevision()].anon - - def editTime(self): - """Return timestamp (in ISO 8601 format) of last revision to page.""" - return self._revisions[self.latestRevision()].timestamp - - def previousRevision(self): - """Return the revision id for the previous revision of this Page.""" - vh = self.getVersionHistory(revCount=2) - revkey = sorted(self._revisions.keys(), reverse=True)[1] - return revkey - - def exists(self): - """Return True if page exists on the wiki, even if it's a redirect. - - If the title includes a section, return False if this section isn't - found. - - """ - return self.site().page_exists(self) - - def isRedirectPage(self): - """Return True if this is a redirect, False if not or not existing.""" - return self.site().page_isredirect(self) - - def isEmpty(self): - """Return True if the page text has less than 4 characters. - - Character count ignores language links and category links. - Can raise the same exceptions as get(). - - """ - txt = self.get() - txt = pywikibot.textlib.removeLanguageLinks(txt, site = self.site()) - txt = pywikibot.textlib.removeCategoryLinks(txt, site = self.site()) - if len(txt) < 4: - return True - else: - return False - - def isTalkPage(self): - """Return True if this page is in any talk namespace.""" - ns = self.namespace() - return ns >= 0 and ns % 2 == 1 - - def toggleTalkPage(self): - """Return other member of the article-talk page pair for this Page. - - If self is a talk page, returns the associated content page; - otherwise, returns the associated talk page. The returned page need - not actually exist on the wiki. - - Returns None if self is a special page. 
- - """ - ns = self.namespace() - if ns < 0: # Special page - return None - if self.isTalkPage(): - if self.namespace() == 1: - return Page(self.site(), self.title(withNamespace=False)) - else: - return Page(self.site(), - self.site().namespace(ns - 1) + ':' - + self.title(withNamespace=False)) - else: - return Page(self.site(), - self.site().namespace(ns + 1) + ':' - + self.title(withNamespace=False)) - - def isCategory(self): - """Return True if the page is a Category, False otherwise.""" - return self.namespace() == 14 - - def isImage(self): - """Return True if this is an image description page, False otherwise.""" - return self.namespace() == 6 - - def isDisambig(self): - """Return True if this is a disambiguation page, False otherwise. - - Relies on the presence of specific templates, identified in - the Family file or on a wiki page, to identify disambiguation - pages. - - By default, loads a list of template names from the Family file; - if the value in the Family file is None, looks for the list on - [[MediaWiki:Disambiguationspage]]. - - """ - if not hasattr(self, "_isDisambig"): - if not hasattr(self.site(), "_disambigtemplates"): - self.site()._disambigtemplates = \ - self.site().family.disambig(self.site().code) - if self.site()._disambigtemplates is None: - try: - disambigpages = Page(self.site(), - "MediaWiki:Disambiguationspage") - self.site()._disambigtemplates = [ - link.title(withNamespace=False) - for link in disambigpages.linkedPages() - if link.namespace() == 10 - ] - except NoPage: - self.site()._disambigtemplates = ['Disambig'] - for t in self.templates(): - if t.title(withNamespace=False) in self.site()._disambigtemplates: - self._isDisambig = True - break - else: - self._isDisambig = False - return self._isDisambig - - def getReferences(self, follow_redirects=True, withTemplateInclusion=True, - onlyTemplateInclusion=False, redirectsOnly=False, - namespaces=None): - """Return an iterator all pages that refer to or embed the page. - - If you need a full list of referring pages, use - C{pages = list(s.getReferences())} - - @param follow_redirects: if True, also iterate pages that link to a - redirect pointing to the page. - @param withTemplateInclusion: if True, also iterate pages where self - is used as a template. - @param onlyTemplateInclusion: if True, only iterate pages where self - is used as a template. - @param redirectsOnly: if True, only iterate redirects to self. - @param namespaces: only iterate pages in these namespaces - - """ - # N.B.: this method intentionally overlaps with backlinks() and - # embeddedin(). Depending on the interface, it may be more efficient - # to implement those methods in the site interface and then combine - # the results for this method, or to implement this method and then - # split up the results for the others. - return self.site().pagereferences( - self, follow_redirects, redirectsOnly, - withTemplateInclusion, onlyTemplateInclusion, - namespaces) - - def backlinks(self, followRedirects=True, filterRedirects=None, - namespaces=None): - """Return an iterator for pages that link to this page. - - @param followRedirects: if True, also iterate pages that link to a - redirect pointing to the page. 
- @param filterRedirects: if True, only iterate redirects; if False, - omit redirects; if None, do not filter - @param namespaces: only iterate pages in these namespaces - - """ - return self.site().pagebacklinks(self, followRedirects, filterRedirects, - namespaces) - - def embeddedin(self, filter_redirects=None, namespaces=None): - """Return an iterator for pages that embed this page as a template. - - @param filterRedirects: if True, only iterate redirects; if False, - omit redirects; if None, do not filter - @param namespaces: only iterate pages in these namespaces - - """ - return self.site().page_embeddedin(self, filter_redirects, namespaces) - - def canBeEdited(self): - """Return bool indicating whether this page can be edited. - - This returns True if and only if: - - page is unprotected, and bot has an account for this site, or - - page is protected, and bot has a sysop account for this site. - - """ - return self.site().page_can_be_edited(self) - - def botMayEdit(self): - """Return True if this page allows bots to edit it. - - This will be True if the page doesn't contain {{bots}} or - {{nobots}}, or it contains them and the active bot is allowed to - edit this page. (This method is only useful on those sites that - recognize the bot-exclusion protocol; on other sites, it will always - return True.) - - The framework enforces this restriction by default. It is possible - to override this by setting ignore_bot_templates=True in - user_config.py, or using page.put(force=True). - - """ # TODO: move this to Site object? - if config.ignore_bot_templates: #Check the "master ignore switch" - return True - try: - templates = self.templatesWithParams(); - except (pywikibot.NoPage, - pywikibot.IsRedirectPage, - pywikibot.SectionError): - return True - for template in templates: - title = template[0].title(withNamespace=False) - if title == 'Nobots': - return False - elif title == 'Bots': - if len(template[1]) == 0: - return True - else: - (ttype, bots) = template[1][0].split('=', 1) - bots = bots.split(',') - if ttype == 'allow': - if 'all' in bots or username in bots: - return True - else: - return False - if ttype == 'deny': - if 'all' in bots or username in bots: - return False - else: - return True - # no restricting template found - return True - - def save(self, comment=None, watch=None, minor=True, force=False, - async=False, callback=None): - """Save the current contents of page's text to the wiki. - - @param comment: The edit summary for the modification (optional, but - most wikis strongly encourage its use) - @type comment: unicode - @param watch: if True, add or if False, remove this Page to/from bot - user's watchlist; if None, leave watchlist status unchanged - @type watch: bool or None - @param minor: if True, mark this edit as minor - @type minor: bool - @param force: if True, ignore botMayEdit() setting - @type force: bool - @param async: if True, launch a separate thread to save - asynchronously - @param callback: a callable object that will be called after the - page put operation. This object must take two arguments: (1) a - Page object, and (2) an exception instance, which will be None - if the page was saved successfully. The callback is intended for - use by bots that need to keep track of which saves were - successful. 
- - """ - if not comment: - comment = pywikibot.default_comment # needs to be defined - if watch is None: - unwatch = False - watch = False - else: - unwatch = not watch - if not force and not self.botMayEdit: - raise pywikibot.PageNotSaved( - "Page %s not saved; editing restricted by {{bots}} template" - % self.title(asLink=True)) - if async: - thd = threading.Thread( - target=self._save, - args=(comment, minor, watch, unwatch, callback) - ) - pywikibot.threadpool.append(thd) - thd.start() - else: - self._save(comment, minor, watch, unwatch, callback) - - def _save(self, comment, minor, watch, unwatch, callback): - err = None - try: - done = self.site().editpage(self, summary=comment, minor=minor, - watch=watch, unwatch=unwatch) - if not done: - logger.warn("Page %s not saved" % self.title(asLink=True)) - else: - logger.info("Page %s saved" % self.title(asLink=True)) - except pywikibot.Error, err: - logger.exception("Error saving page %s" % self.title(asLink=True)) - if callback: - callback(self, err) - - def put(self, newtext, comment=u'', watchArticle=None, minorEdit=True, - force=False, async=False, callback=None): - """Save the page with the contents of the first argument as the text. - - This method is maintained primarily for backwards-compatibility. - For new code, using Page.save() is preferred. See save() method - docs for all parameters not listed here. - - @param newtext: The complete text of the revised page. - @type newtext: unicode - - """ - self.text = newtext - return self.save(comment, watchArticle, minorEdit, force, - async, callback) - - def put_async(self, newtext, comment=u'', watchArticle=None, - minorEdit=True, force=False, callback=None): - """Put page on queue to be saved to wiki asynchronously. - - Asynchronous version of put (takes the same arguments), which places - pages on a queue to be saved by a daemon thread. All arguments are - the same as for .put(). This version is maintained solely for - backwards-compatibility. - - """ - return self.put(self, newtext, comment, watchArticle, - minorEdit, force, callback, async=True) - - def linkedPages(self): - """Iterate Pages that this Page links to. - - Only returns pages from "normal" internal links. Image and category - links are omitted unless prefixed with ":". Embedded templates are - omitted (but links within them are returned). All interwiki and - external links are omitted. - - @return: a generator that yields Page objects. - - """ - return self.site().pagelinks(self) - - def interwiki(self, expand=True): - """Iterate interwiki links in the page text, excluding language links. 
- - @param expand: if True (default), include interwiki links found in - templates transcluded onto this page; if False, only iterate - interwiki links found in this page's own wikitext - @return: a generator that yields Link objects - - """ - # This function does not exist in the API, so it has to be - # implemented by screen-scraping - if expand: - text = self.expand_text() - else: - text = self.text - for linkmatch in pywikibot.link_regex.finditer( - pywikibot.textlib.removeDisabledParts(text)): - linktitle = linkmatch.group("title") - link = Link(linktitle, self.site()) - # only yield links that are to a different site and that - # are not language links - try: - if link.site != self.site(): - if linktitle.lstrip().startswith(":"): - # initial ":" indicates not a language link - yield link - elif link.site.family != self.site().family: - # link to a different family is not a language link - yield link - except pywikibot.Error: - # ignore any links with invalid contents - continue - - def langlinks(self): - """Iterate all interlanguage links on this page. - - @return: a generator that yields Link objects. - - """ - return self.site().pagelanglinks(self) - - @deprecate_arg("followRedirects", None) - @deprecate_arg("loose", None) - def imagelinks(self, followRedirects=None, loose=None): - """Iterate ImagePage objects for images displayed on this Page. - - @return: a generator that yields ImagePage objects. - - """ - return self.site().pageimages(self) - - def templates(self): - """Iterate Page objects for templates used on this Page. - - Template parameters are ignored. This method only returns embedded - templates, not template pages that happen to be referenced through - a normal link. - - """ - return self.site().pagetemplates(self) - - def templatesWithParams(self): - """Iterate templates used on this Page. - - @return: a generator that yields a tuple for each use of a template - in the page, with the template Page as the first entry and a list of - parameters as the second entry. - - """ - templates = pywikibot.textlib.extract_templates_and_params(self.text) - # backwards-compatibility: convert the dict returned as the second - # element into a list in the format used by old scripts - result = [] - for template in templates: - args = template[1] - positional = [] - named = {} - for key in sorted(args.keys()): - try: - int(key) - except ValueError: - named[key] = args[key] - else: - positional.append(args[key]) - for name in named: - positional.append("%s=%s" % (name, named[name])) - result.append((pywikibot.Page( - pywikibot.Link(template[0], self.site())), - positional)) - return result - - @deprecate_arg("nofollow_redirects", None) - def categories(self, withSortKey=False): - """Iterate categories that the article is in. - - @param withSortKey: if True, include the sort key in each Category. - @return: a generator that yields Category objects. - - """ - return self.site().pagecategories(self, withSortKey=withSortKey) - - def extlinks(self): - """Iterate all external URLs (not interwiki links) from this page. - - @return: a generator that yields unicode objects containing URLs. - - """ - return self.site().page_extlinks(self) - - def getRedirectTarget(self): - """Return a Page object for the target this Page redirects to. - - If this page is not a redirect page, will raise an IsNotRedirectPage - exception. This method also can raise a NoPage exception. 
- - """ - if not self.isRedirectPage(): - raise pywikibot.IsNotRedirectPage - if not isinstance(self._redir, Page): - self.site().getredirtarget(self) - return self._redir - - @deprecate_arg("forceReload", None) - def getVersionHistory(self, reverseOrder=False, getAll=False, - revCount=500): - """Load the version history page and return history information. - - Return value is a list of tuples, where each tuple represents one - edit and is built of revision id, edit date/time, user name, and - edit summary. Starts with the most current revision, unless - reverseOrder is True. Defaults to getting the first revCount edits, - unless getAll is True. - - """ - if getAll: - limit = None - else: - limit = revCount - self.site().loadrevisions(self, getText=False, rvdir=reverseOrder, - limit=limit) - if getAll: - revCount = len(self._revisions) - return [ ( self._revisions[rev].revid, - self._revisions[rev].timestamp, - self._revisions[rev].user, - self._revisions[rev].comment - ) for rev in sorted(self._revisions.keys(), - reverse=not reverseOrder)[ : revCount] - ] - - def getVersionHistoryTable(self, forceReload=False, reverseOrder=False, - getAll=False, revCount=500): - """Return the version history as a wiki table.""" - result = '{| border="1"\n' - result += '! oldid || date/time || username || edit summary\n' - for oldid, time, username, summary \ - in self.getVersionHistory(forceReload=forceReload, - reverseOrder=reverseOrder, - getAll=getAll, revCount=revCount): - result += '|----\n' - result += '| %s || %s || %s || <nowiki>%s</nowiki>\n'\ - % (oldid, time, username, summary) - result += '|}\n' - return result - - def fullVersionHistory(self): - """Iterate all previous versions including wikitext. - - @return: A generator that yields tuples consisting of revision ID, - edit date/time, user name and content - """ - return self.site().loadrevisions(self, withText=True) - - def contributingUsers(self): - """Return a set of usernames (or IPs) of users who edited this page.""" - edits = self.getVersionHistory() - users = set([edit[2] for edit in edits]) - return users - - @deprecate_arg("throttle", None) - def move(self, newtitle, reason=None, movetalkpage=True, sysop=False, - deleteAndMove=False, safe=True): - """Move this page to a new title. - - @param newtitle: The new page title. - @param reason: The edit summary for the move. - @param movetalkpage: If true, move this page's talk page (if it exists) - @param sysop: Try to move using sysop account, if available - @param deleteAndMove: if move succeeds, delete the old page - (usually requires sysop privileges, depending on wiki settings) - @param safe: If false, attempt to delete existing page at newtitle - (if there is one) and then move this page to that title - - """ - if reason is None: - logger.info(u'Moving %s to [[%s]].' - % (self.title(asLink=True), newtitle)) - reason = pywikibot.input(u'Please enter a reason for the move:') - # TODO: implement "safe" parameter - # TODO: implement "sysop" parameter - return self.site().movepage(self, newtitle, reason, - movetalk=movetalkpage, - noredirect=deleteAndMove) - - @deprecate_arg("throttle", None) - def delete(self, reason=None, prompt=True, throttle=None, mark=False): - """Deletes the page from the wiki. Requires administrator status. - - @param reason: The edit summary for the deletion. - @param prompt: If true, prompt user for confirmation before deleting. - @param mark: if true, and user does not have sysop rights, place a - speedy-deletion request on the page instead. 
- - """ - if reason is None: - logger.info(u'Deleting %s.' % (self.title(asLink=True))) - reason = pywikibot.input(u'Please enter a reason for the deletion:') - answer = u'y' - if prompt and not hasattr(self.site(), '_noDeletePrompt'): - answer = pywikibot.inputChoice(u'Do you want to delete %s?' - % self.title(asLink = True, forceInterwiki = True), - ['Yes', 'No', 'All'], - ['Y', 'N', 'A'], - 'N') - if answer in ['a', 'A']: - answer = 'y' - self.site()._noDeletePrompt = True - if answer in ['y', 'Y']: - return self.site().delete(self, reason, mark=mark) - - def loadDeletedRevisions(self): - """Retrieve all deleted revisions for this Page from Special/Undelete. - - Stores all revisions' timestamps, dates, editors and comments in - self._deletedRevs attribute. - - @return: list of timestamps (which can be used to retrieve revisions - later on). - - """ - return self.site().loadDeletedRevisions(self) - - def getDeletedRevision(self, timestamp, retrieveText=False): - """Return a particular deleted revision by timestamp. - - @return: a list of [date, editor, comment, text, restoration - marker]. text will be None, unless retrieveText is True (or has - been retrieved earlier). If timestamp is not found, returns - None. - - """ - return self.site().getDeletedRevision(self, timestamp, - getText=retrieveText) - - def markDeletedRevision(self, timestamp, undelete=True): - """Mark the revision identified by timestamp for undeletion. - - @param undelete: if False, mark the revision to remain deleted. - - """ - if self._deletedRevs == None: - self.loadDeletedRevisions() - if not self._deletedRevs.has_key(timestamp): - #TODO: Throw an exception? - return None - self._deletedRevs[timestamp][4] = undelete - self._deletedRevsModified = True - - @deprecate_arg("throttle", None) - def undelete(self, comment=None): - """Undelete revisions based on the markers set by previous calls. - - If no calls have been made since loadDeletedRevisions(), everything - will be restored. - - Simplest case:: - Page(...).undelete('This will restore all revisions') - - More complex:: - pg = Page(...) - revs = pg.loadDeletedRevsions() - for rev in revs: - if ... #decide whether to undelete a revision - pg.markDeletedRevision(rev) #mark for undeletion - pg.undelete('This will restore only selected revisions.') - - @param comment: The undeletion edit summary. - - """ - if comment is None: - logger.info(u'Preparing to undelete %s.' - % (self.title(asLink=True))) - comment = pywikibot.input( - u'Please enter a reason for the undeletion:') - return self.site().undelete(self, comment) - - @deprecate_arg("throttle", None) - def protect(self, edit='sysop', move='sysop', unprotect=False, - reason=None, prompt=True): - """(Un)protect a wiki page. Requires administrator status. - - Valid protection levels (in MediaWiki 1.12) are '' (equivalent to - 'none'), 'autoconfirmed', and 'sysop'. - - @param edit: Level of edit protection - @param move: Level of move protection - @param unprotect: If true, unprotect the page (equivalent to setting - all protection levels to '') - @param reason: Edit summary. - @param prompt: If true, ask user for confirmation. - - """ - if reason is None: - if unprotect: - un = u'un' - else: - un = u'' - logger.info(u'Preparing to %sprotect %s.' 
- % (un, self.title(asLink=True))) - reason = pywikibot.input(u'Please enter a reason for the action:') - if unprotect: - edit = move = "" - answer = 'y' - if prompt and not hasattr(self.site(), '_noProtectPrompt'): - answer = pywikibot.inputChoice( - u'Do you want to change the protection level of %s?' - % self.title(asLink=True, forceInterwiki = True), - ['Yes', 'No', 'All'], ['Y', 'N', 'A'], 'N') - if answer in ['a', 'A']: - answer = 'y' - self.site()._noProtectPrompt = True - if answer in ['y', 'Y']: - return self.site().protect(self, edit, move, reason) - - def change_category(article, oldCat, newCat, comment=None, sortKey=None, - inPlace=True): - """Remove page from oldCat and add it to newCat. - - oldCat and newCat should be Category objects. - If newCat is None, the category will be removed. - - """ # TODO: document remaining arguments - cats = self.categories(get_redirect=True) - site = self.site() - changesMade = False - - if not self.canBeEdited(): - pywikibot.output(u"Can't edit %s, skipping it..." - % self.title(asLink=True)) - return False - if inPlace == True: - newtext = pywikibot.textlib.replaceCategoryInPlace( - self.text, oldCat, newCat) - if newtext == self.text: - pywikibot.output( - u'No changes in made in page %s.' - % self.title(asLink=True)) - return False - try: - self.put(newtext, comment) - return True - except pywikibot.EditConflict: - pywikibot.output( - u'Skipping %s because of edit conflict' - % self.title(asLink=True)) - except pywikibot.LockedPage: - pywikibot.output(u'Skipping locked page %s' - % self.title(asLink=True)) - except pywikibot.SpamfilterError, error: - pywikibot.output( - u'Changing page %s blocked by spam filter (URL=%s)' - % (self.title(asLink=True), error.url)) - except pywikibot.NoUsername: - pywikibot.output( - u"Page %s not saved; sysop privileges required." - % self.title(asLink=True)) - except pywikibot.PageNotSaved, error: - pywikibot.output(u"Saving page %s failed: %s" - % (self.title(asLink=True), error.message)) - return False - - # This loop will replace all occurrences of the category to be changed, - # and remove duplicates. - newCatList = [] - newCatSet = set() - for i in range(len(cats)): - cat = cats[i] - if cat == oldCat: - changesMade = True - if not sortKey: - sortKey = cat.sortKey - if newCat: - if newCat.title() not in newCatSet: - newCategory = Category(site, newCat.title(), - sortKey=sortKey) - newCatSet.add(newCat.title()) - newCatList.append(newCategory) - elif cat.title() not in newCatSet: - newCatSet.add(cat.title()) - newCatList.append(cat) - - if not changesMade: - pywikibot.output(u'ERROR: %s is not in category %s!' - % (self.title(asLink=True), oldCat.title())) - else: - try: - text = pywikibot.textlib.replaceCategoryLinks(self.text, - newCatList) - except ValueError: - # Make sure that the only way replaceCategoryLinks() can return - # a ValueError is in the case of interwiki links to self. 
- pywikibot.output( - u'Skipping %s because of interwiki link to self' % self) - try: - self.put(text, comment) - except pywikibot.EditConflict: - pywikibot.output( - u'Skipping %s because of edit conflict' % self.title()) - except pywikibot.SpamfilterError, e: - pywikibot.output( - u'Skipping %s because of blacklist entry %s' - % (self.title(), e.url)) - except pywikibot.LockedPage: - pywikibot.output( - u'Skipping %s because page is locked' % self.title()) - except pywikibot.PageNotSaved, error: - pywikibot.output(u"Saving page %s failed: %s" - % (self.title(asLink=True), error.message)) - -######## DEPRECATED METHODS ######## - - def encoding(self): - """DEPRECATED: use Site.encoding() instead""" - logger.debug(u"Page.encoding() is deprecated; use Site.encoding().") - return self.site().encoding() - - def titleWithoutNamespace(self, underscore=False): - """DEPRECATED: use self.title(withNamespace=False) instead.""" - logger.debug( - u"Page.titleWithoutNamespace() method is deprecated.") - return self.title(underscore=underscore, withNamespace=False, - withSection=False) - - def titleForFilename(self): - """DEPRECATED: use self.title(as_filename=True) instead.""" - logger.debug( - u"Page.titleForFilename() method is deprecated.") - return self.title(as_filename=True) - - def sectionFreeTitle(self, underscore=False): - """DEPRECATED: use self.title(withSection=False) instead.""" - logger.debug( - u"Page.sectionFreeTitle() method is deprecated.") - return self.title(underscore=underscore, withSection=False) - - def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False): - """DEPRECATED: use self.title(asLink=True) instead.""" - logger.debug(u"Page.aslink() method is deprecated.") - return self.title(asLink=True, forceInterwiki=forceInterwiki, - allowInterwiki=not noInterwiki, textlink=textlink) - - def urlname(self): - """Return the Page title encoded for use in an URL. - - DEPRECATED: use self.title(asUrl=True) instead. - - """ - logger.debug(u"Page.urlname() method is deprecated.") - return self.title(asUrl=True) - -####### DISABLED METHODS (warnings provided) ###### - # these methods are easily replaced by editing the page's text using - # textlib methods and then using put() on the result. - - def removeImage(self, image, put=False, summary=None, safe=True): - """Old method to remove all instances of an image from page.""" - logger.warning(u"Page.removeImage() is no longer supported.") - - def replaceImage(self, image, replacement=None, put=False, summary=None, - safe=True): - """Old method to replace all instances of an image with another.""" - logger.warning(u"Page.replaceImage() is no longer supported.") - - -class ImagePage(Page): - """A subclass of Page representing an image descriptor wiki page. - - Supports the same interface as Page, with the following added methods: - - getImagePageHtml : Download image page and return raw HTML text. - fileURL : Return the URL for the image described on this - page. - fileIsOnCommons : Return True if image stored on Wikimedia - Commons. - fileIsShared : Return True if image stored on Wikitravel - shared repository. - getFileMd5Sum : Return image file's MD5 checksum. - getFileVersionHistory : Return the image file's version history. - getFileVersionHistoryTable: Return the version history in the form of a - wiki table. - usingPages : Iterate Pages on which the image is displayed. 
- - """ - def __init__(self, source, title=u"", insite=None): - Page.__init__(self, source, title, 6) - if self.namespace() != 6: - raise ValueError(u"'%s' is not in the image namespace!" % title) - - def getImagePageHtml(self): - """ - Download the image page, and return the HTML, as a unicode string. - - Caches the HTML code, so that if you run this method twice on the - same ImagePage object, the page will only be downloaded once. - """ - if not hasattr(self, '_imagePageHtml'): - from pywikibot.data import http - path = "%s/index.php?title=%s" \ - % (self.site().scriptpath(), self.title(asUrl=True)) - self._imagePageHtml = http.request(self.site(), path) - return self._imagePageHtml - - def fileUrl(self): - """Return the URL for the image described on this page.""" - # TODO add scaling option? - if not hasattr(self, '_imageinfo'): - self._imageinfo = self.site().getimageinfo(self) #FIXME - return self._imageinfo['url'] - - def fileIsOnCommons(self): - """Return True if the image is stored on Wikimedia Commons""" - return self.fileUrl().startswith( - 'http://upload.wikimedia.org/wikipedia/commons/') - - def fileIsShared(self): - """Return True if image is stored on any known shared repository.""" - # as of now, the only known repositories are commons and wikitravel - if 'wikitravel_shared' in self.site().shared_image_repository(): - return self.fileUrl().startswith( - u'http://wikitravel.org/upload/shared/') - return self.fileIsOnCommons() - - def getFileMd5Sum(self): - """Return image file's MD5 checksum.""" - logger.debug( - "ImagePage.getFileMd5Sum() is deprecated; use getFileSHA1Sum().") -# FIXME: MD5 might be performed on incomplete file due to server disconnection -# (see bug #1795683). - import md5, urllib - f = urllib.urlopen(self.fileUrl()) - # TODO: check whether this needs a User-Agent header added - md5Checksum = md5.new(f.read()).hexdigest() - f.close() - return md5Checksum - - def getFileSHA1Sum(self): - """Return image file's SHA1 checksum.""" - if not hasattr(self, '_imageinfo'): - self._imageinfo = self.site().getimageinfo(self) #FIXME - return self._imageinfo['sha1'] - - def getFileVersionHistory(self): - """Return the image file's version history. - - @return: An iterator yielding tuples containing (timestamp, - username, resolution, filesize, comment). - - """ - #TODO; return value may need to change - return self.site().getimageinfo(self, history=True) #FIXME - - def getFileVersionHistoryTable(self): - """Return the version history in the form of a wiki table.""" - lines = [] - #TODO: if getFileVersionHistory changes, make sure this follows it - for (datetime, username, resolution, size, comment) \ - in self.getFileVersionHistory(): - lines.append('| %s || %s || %s || %s || <nowiki>%s</nowiki>' \ - % (datetime, username, resolution, size, comment)) - return u'{| border="1"\n! date/time || username || resolution || size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}' - - def usingPages(self): - """Yield Pages on which the image is displayed.""" - return self.site().getimageusage(self) - - -class Category(Page): - """A page in the Category: namespace""" - - @deprecate_arg("sortKey", None) - def __init__(self, source, title=u"", insite=None): - """All parameters are the same as for Page() constructor. - - """ - Page.__init__(self, source, title, 14) - if self.namespace() != 14: - raise ValueError(u"'%s' is not in the category namespace!" 
- % title) - - @deprecate_arg("forceInterwiki", None) - @deprecate_arg("textlink", None) - @deprecate_arg("noInterwiki", None) - def aslink(self, sortKey=u''): - """Return a link to place a page in this Category. - - Use this only to generate a "true" category link, not for interwikis - or text links to category pages. - - @param sortKey: The sort key for the article to be placed in this - Category; if omitted, default sort key is used. - @type sortKey: (optional) unicode - - """ - if sortKey: - titleWithSortKey = '%s|%s' % (self.title(withSection=False), - self.sortKey) - else: - titleWithSortKey = self.title(withSection=False) - return '[[%s]]' % titleWithSortKey - - @deprecate_arg("startFrom", None) - @deprecate_arg("cacheResults", None) - def subcategories(self, recurse=False): - """Iterate all subcategories of the current category. - - @param recurse: if not False or 0, also iterate subcategories of - subcategories. If an int, limit recursion to this number of - levels. (Example: recurse=1 will iterate direct subcats and - first-level sub-sub-cats, but no deeper.) - @type recurse: int or bool - - """ - if not isinstance(recurse, bool) and recurse: - recurse = recurse - 1 - if not hasattr(self, "_subcats"): - self._subcats = [] - for member in self.site().categorymembers(self, namespaces=[14]): - subcat = Category(self.site(), member.title()) - self._subcats.append(subcat) - yield subcat - if recurse: - for item in subcat.subcategories(recurse): - yield item - else: - for subcat in self._subcats: - yield subcat - if recurse: - for item in subcat.subcategories(recurse): - yield item - - @deprecate_arg("startFrom", None) - def articles(self, recurse=False): - """ - Yields all articles in the current category. - - @param recurse: if not False or 0, also iterate articles in - subcategories. If an int, limit recursion to this number of - levels. (Example: recurse=1 will iterate articles in first-level - subcats, but no deeper.) - @type recurse: int or bool - - """ - namespaces = [x for x in self.site().namespaces().keys() - if x>=0 and x!=14] - for member in self.site().categorymembers(self, - namespaces=namespaces): - yield member - if recurse: - if not isinstance(recurse, bool) and recurse: - recurse = recurse - 1 - for subcat in self.subcategories(): - for article in subcat.articles(recurse): - yield article - - def isEmptyCategory(self): - """Return True if category has no members (including subcategories).""" - for member in self.site().categorymembers(self, limit=1): - return False - return True - - def copyTo(self, catname): - """ - Copy text of category page to a new page. Does not move contents. - - @param catname: New category title (without namespace) - @return: True if copying was successful, False if target page - already existed. - - """ - # This seems far too specialized to be in the top-level framework - catname = self.site().category_namespace() + ':' + catname - targetCat = Category(self.site(), catname) - if targetCat.exists(): - logger.warn('Target page %s already exists!' - % targetCat.title()) - return False - else: - logger.info('Moving text from %s to %s.' - % (self.title(), targetCat.title())) - authors = ', '.join(self.contributingUsers()) - creationSummary = pywikibot.translate( - self.site(), msg_created_for_renaming - ) % (self.title(), authors) - targetCat.put(self.get(), creationSummary) - return True - - def copyAndKeep(self, catname, cfdTemplates): - """Copy partial category page text (not contents) to a new title. 
- - Like copyTo above, except this removes a list of templates (like - deletion templates) that appear in the old category text. It also - removes all text between the two HTML comments BEGIN CFD TEMPLATE - and END CFD TEMPLATE. (This is to deal with CFD templates that are - substituted.) - - Returns true if copying was successful, false if target page already - existed. - - @param catname: New category title (without namespace) - @param cfdTemplates: A list (or iterator) of templates to be removed - from the page text - @return: True if copying was successful, False if target page - already existed. - - """ - # I don't see why we need this as part of the framework either - catname = self.site().category_namespace() + ':' + catname - targetCat = Category(self.site(), catname) - if targetCat.exists(): - logger.warn('Target page %s already exists!' - % targetCat.title()) - return False - else: - logger.info('Moving text from %s to %s.' - % (self.title(), targetCat.title())) - authors = ', '.join(self.contributingUsers()) - creationSummary = pywikibot.translate( - self.site(), msg_created_for_renaming - ) % (self.title(), authors) - newtext = self.get() - for regexName in cfdTemplates: - matchcfd = re.compile(r"{{%s.*?}}" % regexName, re.IGNORECASE) - newtext = matchcfd.sub('',newtext) - matchcomment = re.compile( - r"<!--BEGIN CFD TEMPLATE-->.*?<!--END CFD TEMPLATE-->", - re.IGNORECASE | re.MULTILINE | re.DOTALL) - newtext = matchcomment.sub('', newtext) - pos = 0 - while (newtext[pos:pos+1] == "\n"): - pos = pos + 1 - newtext = newtext[pos:] - targetCat.put(newtext, creationSummary) - return True - -#### DEPRECATED METHODS #### - def subcategoriesList(self, recurse=False): - """DEPRECATED: Equivalent to list(self.subcategories(...))""" - logger.debug("Category.subcategoriesList() method is deprecated.") - return sorted(list(set(self.subcategories(recurse)))) - - def articlesList(self, recurse=False): - """DEPRECATED: equivalent to list(self.articles(...))""" - logger.debug("Category.articlesList() method is deprecated.") - return sorted(list(set(self.articles(recurse)))) - - def supercategories(self): - """DEPRECATED: equivalent to self.categories()""" - logger.debug("Category.supercategories() method is deprecated.") - return self.categories() - - def supercategoriesList(self): - """DEPRECATED: equivalent to list(self.categories(...))""" - logger.debug("Category.articlesList() method is deprecated.") - return sorted(list(set(self.categories()))) - - -class Revision(object): - """A structure holding information about a single revision of a Page.""" - def __init__(self, revid, timestamp, user, anon=False, comment=u"", - text=None, minor=False): - """All parameters correspond to object attributes (e.g., revid - parameter is stored as self.revid) - - @param revid: Revision id number - @type revid: int - @param text: Revision wikitext. 
- @type text: unicode, or None if text not yet retrieved - @param timestamp: Revision time stamp (in ISO 8601 format) - @type timestamp: unicode - @param user: user who edited this revision - @type user: unicode - @param anon: user is unregistered - @type anon: bool - @param comment: edit comment text - @type comment: unicode - @param minor: edit flagged as minor - @type minor: bool - - """ - self.revid = revid - self.text = text - self.timestamp = timestamp - self.user = user - self.anon = anon - self.comment = comment - self.minor = minor - - -class Link(object): - """A Mediawiki link (local or interwiki) - - Has the following attributes: - - - site: The Site object for the wiki linked to - - namespace: The namespace of the page linked to (int) - - title: The title of the page linked to (unicode); does not include - namespace or section - - section: The section of the page linked to (unicode or None); this - contains any text following a '#' character in the title - - anchor: The anchor text (unicode or None); this contains any text - following a '|' character inside the link - - """ - illegal_titles_pattern = re.compile( - # Matching titles will be held as illegal. - u'''[^ %!"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\u0080-\uFFFF+]''' - # URL percent encoding sequences interfere with the ability - # to round-trip titles -- you can't link to them consistently. - u'|%[0-9A-Fa-f]{2}' - # XML/HTML character references produce similar issues. - u'|&[A-Za-z0-9\x80-\xff]+;' - u'|&#[0-9]+;' - u'|&#x[0-9A-Fa-f]+;' - ) - - def __init__(self, text, source=None, defaultNamespace=0): - """Constructor - - @param text: the link text (everything appearing between [[ and ]] - on a wiki page) - @type text: unicode - @param source: the Site on which the link was found (not necessarily - the site to which the link refers) - @type source: Site - @param defaultNamespace: a namespace to use if the link does not - contain one (defaults to 0) - @type defaultNamespace: int - - """ - self._text = text - self._source = source - self._defaultns = defaultNamespace - - def parse(self): - """Parse text; called internally when accessing attributes""" - - # First remove the anchor, which is stored unchanged, if there is one - if u"|" in self._text: - self._text, self._anchor = self._text.split(u"|", 1) - else: - self._anchor = None - - if self._source is None: - self._source = pywikibot.Site() - self._site = self._source - - # Clean up the name, it can come from anywhere. - # Convert HTML entities to unicode - t = html2unicode(self._text) - - # Convert URL-encoded characters to unicode - t = url2unicode(t, site=self._site) - - # Normalize unicode string to a NFC (composed) format to allow proper - # string comparisons. According to - # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/no... - # the mediawiki code normalizes everything to NFC, not NFKC (which - # might result in information loss). - t = unicodedata.normalize('NFC', t) - - # This code was adapted from Title.php : secureAndSplit() - # - if u'\ufffd' in t: - raise pywikibot.Error("Title contains illegal char (\uFFFD)") - self._namespace = self._defaultns - - # Replace underscores by spaces - t = t.replace(u"_", u" ") - # replace multiple spaces and underscores with a single space - while u" " in t: t = t.replace(u" ", u" ") - # Strip spaces at both ends - t = t.strip(" ") - # Remove left-to-right and right-to-left markers. 
- t = t.replace(u"\u200e", u"").replace(u"\u200f", u"") - - firstPass = True - while u":" in t: - # Initial colon indicates main namespace rather than default - if t.startswith(u":"): - self._namespace = 0 - # remove the colon but continue processing - # remove any subsequent whitespace - t = t.lstrip(u":").lstrip(u" ") - continue - - fam = self._site.family - prefix = t[ :t.index(u":")].lower() - ns = self._site.ns_index(prefix) - if ns: - # Ordinary namespace - t = t[t.index(u":"): ].lstrip(u":").lstrip(u" ") - self._namespace = ns - break - if prefix in fam.langs.keys()\ - or prefix in fam.get_known_families(site=self._site): - # looks like an interwiki link - if not firstPass: - # Can't make a local interwiki link to an interwiki link. - raise pywikibot.Error( - "Improperly formatted interwiki link '%s'" - % self._text) - t = t[t.index(u":"): ].lstrip(u":").lstrip(u" ") - if prefix in fam.langs.keys(): - newsite = pywikibot.Site(prefix, fam) - else: - otherlang = self._site.code - familyName = fam.get_known_families(site=self._site)[prefix] - if familyName in ['commons', 'meta']: - otherlang = familyName - try: - newsite = pywikibot.Site(otherlang, familyName) - except ValueError: - raise pywikibot.Error("""\ -%s is not a local page on %s, and the %s family is -not supported by PyWikiBot!""" - % (title, self._site(), familyName)) - - # Redundant interwiki prefix to the local wiki - if newsite == self._site: - if not t: - # Can't have an empty self-link - raise pywikibot.Error( - "Invalid link title: '%s'" % self._text) - firstPass = False - continue - self._site = newsite - else: - break # text before : doesn't match any known prefix - - if u"#" in t: - t, sec = t.split(u'#', 1) - t, self._section = t.rstrip(), sec.lstrip() - else: - self._section = None - - # Reject illegal characters. - m = Link.illegal_titles_pattern.search(t) - if m: - raise pywikibot.Error( - u"Invalid title: contains illegal char(s) '%s'" % m.group(0)) - - # Pages with "/./" or "/../" appearing in the URLs will - # often be unreachable due to the way web browsers deal - #* with 'relative' URLs. Forbid them explicitly. - - if u'.' in t and ( - t == u'.' or t == u'..' - or t.startswith(u"./") - or t.startswith(u"../") - or u"/./" in t - or u"/../" in t - or t.endswith(u"/.") - or t.endswith(u"/..") - ): - raise pywikibot.Error( - "Invalid title (contains . / combinations): '%s'" - % self._text) - - # Magic tilde sequences? Nu-uh! - if u"~~~" in t: - raise pywikibot.Error("Invalid title (contains ~~~): '%s'" % self._text) - - if self._namespace != -1 and len(t) > 255: - raise pywikibot.Error("Invalid title (over 255 bytes): '%s'" % t) - - if self._site.case() == 'first-letter': - t = t[:1].upper() + t[1:] - - # Can't make a link to a namespace alone... - # "empty" local links can only be self-links - # with a fragment identifier. 
- if not t and self._site == self._source and self._namespace != 0: - raise ValueError("Invalid link (no page title): '%s'" % self._text) - - self._title = t - - # define attributes, to be evaluated lazily - - @property - def site(self): - if not hasattr(self, "_site"): - self.parse() - return self._site - - @property - def namespace(self): - if not hasattr(self, "_namespace"): - self.parse() - return self._namespace - - @property - def title(self): - if not hasattr(self, "_title"): - self.parse() - return self._title - - @property - def section(self): - if not hasattr(self, "_section"): - self.parse() - return self._section - - @property - def anchor(self): - if not hasattr(self, "_anchor"): - self.parse() - return self._anchor - - def astext(self, onsite=None): - """Return a text representation of the link. - - @param onsite: if specified, present as a (possibly interwiki) link - from the given site; otherwise, present as an internal link on - the source site. - - """ - if onsite is None: - onsite = self.site - title = self.title - if self.namespace: - title = onsite.namespace(self.namespace) + ":" + title - if self.section: - title = title + "#" + self.section - if onsite == self.site: - return u'[[%s]]' % title - if onsite.family == self.site.family: - return u'[[%s:%s]]' % (self.site.code, title) - if self.site.family.name == self.site.code: - # use this form for sites like commons, where the - # code is the same as the family name - return u'[[%s:%s]]' % (self.site.code, - title) - return u'[[%s:%s:%s]]' % (self.site.family.name, - self.site.code, - title) - - def __str__(self): - return self.astext() - - def __cmp__(self, other): - """Test for equality and inequality of Link objects. - - Link objects are "equal" if and only if they are on the same site - and have the same normalized title, including section if any. - - Link objects are sortable by site, then namespace, then title. - - """ - if not isinstance(other, Link): - # especially, return -1 if other is None - return -1 - if not self.site == other.site: - return cmp(self.site, other.site) - if self.namespace != other.namespace: - return cmp(self.namespace, other.namespace) - return cmp(self.title, other.title) - - -# Utility functions for parsing page titles - -def html2unicode(text, ignore = []): - """Return text, replacing HTML entities by equivalent unicode characters.""" - # This regular expression will match any decimal and hexadecimal entity and - # also entities that might be named entities. 
- entityR = re.compile( - r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));') - # These characters are Html-illegal, but sadly you *can* find some of - # these and converting them to unichr(decimal) is unsuitable - convertIllegalHtmlEntities = { - 128 : 8364, # € - 130 : 8218, # ‚ - 131 : 402, # ƒ - 132 : 8222, # „ - 133 : 8230, # … - 134 : 8224, # † - 135 : 8225, # ‡ - 136 : 710, # ˆ - 137 : 8240, # ‰ - 138 : 352, # Š - 139 : 8249, # ‹ - 140 : 338, # Œ - 142 : 381, # Ž - 145 : 8216, # ‘ - 146 : 8217, # ’ - 147 : 8220, # “ - 148 : 8221, # ” - 149 : 8226, # • - 150 : 8211, # – - 151 : 8212, # — - 152 : 732, # ˜ - 153 : 8482, # ™ - 154 : 353, # š - 155 : 8250, # › - 156 : 339, # œ - 158 : 382, # ž - 159 : 376 # Ÿ - } - #ensuring that illegal   and , which have no known values, - #don't get converted to unichr(129), unichr(141) or unichr(157) - ignore = set(ignore) | set([129, 141, 157]) - result = u'' - i = 0 - found = True - while found: - text = text[i:] - match = entityR.search(text) - if match: - unicodeCodepoint = None - if match.group('decimal'): - unicodeCodepoint = int(match.group('decimal')) - elif match.group('hex'): - unicodeCodepoint = int(match.group('hex'), 16) - elif match.group('name'): - name = match.group('name') - if htmlentitydefs.name2codepoint.has_key(name): - # We found a known HTML entity. - unicodeCodepoint = htmlentitydefs.name2codepoint[name] - result += text[:match.start()] - try: - unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint] - except KeyError: - pass - if unicodeCodepoint and unicodeCodepoint not in ignore: - result += unichr(unicodeCodepoint) - else: - # Leave the entity unchanged - result += text[match.start():match.end()] - i = match.end() - else: - result += text - found = False - return result - -def url2unicode(title, site, site2 = None): - """Convert url-encoded text to unicode using site's encoding. - - If site2 is provided, try its encodings as well. Uses the first encoding - that doesn't cause an error. - - """ - # create a list of all possible encodings for both hint sites - encList = [site.encoding()] + list(site.encodings()) - if site2 and site2 <> site: - encList.append(site2.encoding()) - encList += list(site2.encodings()) - firstException = None - # try to handle all encodings (will probably retry utf-8) - for enc in encList: - try: - t = title.encode(enc) - t = urllib.unquote(t) - return unicode(t, enc) - except UnicodeError, ex: - if not firstException: - firstException = ex - pass - # Couldn't convert, raise the original exception - raise firstException - +# -*- coding: utf-8 -*- +""" +Objects representing various types of MediaWiki pages. +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import pywikibot +from pywikibot import deprecate_arg +from pywikibot import config +import pywikibot.site +import pywikibot.textlib + +import htmlentitydefs +import logging +import re +import sys +import threading +import unicodedata +import urllib + +logger = logging.getLogger("wiki") + +reNamespace = re.compile("^(.+?) *: *(.*)$") + + +class Page(object): + """Page: A MediaWiki page + + This object only implements internally methods that do not require + reading from or writing to the wiki. All other methods are delegated + to the Site object. + + """ + + @deprecate_arg("insite", None) + @deprecate_arg("defaultNamespace", None) + def __init__(self, source, title=u"", ns=0): + """Instantiate a Page object. 
+ + Three calling formats are supported: + + - If the first argument is a Page, create a copy of that object. + This can be used to convert an existing Page into a subclass + object, such as Category or ImagePage. (If the title is also + given as the second argument, creates a copy with that title; + this is used when pages are moved.) + - If the first argument is a Site, create a Page on that Site + using the second argument as the title (may include a section), + and the third as the namespace number. The namespace number is + mandatory, even if the title includes the namespace prefix. This + is the preferred syntax when using an already-normalized title + obtained from api.php or a database dump. WARNING: may produce + invalid objects if page title isn't in normal form! + - If the first argument is a Link, create a Page from that link. + This is the preferred syntax when using a title scraped from + wikitext, URLs, or another non-normalized source. + + @param source: the source of the page + @type source: Link, Page (or subclass), or Site + @param title: normalized title of the page; required if source is a + Site, ignored otherwise + @type title: unicode + @param ns: namespace number; required if source is a Site, ignored + otherwise + @type ns: int + + """ + if isinstance(source, pywikibot.site.BaseSite): + self._site = source + if ns not in source.namespaces(): + raise pywikibot.Error( + "Invalid namespace '%i' for site %s." + % (ns, source.sitename())) + self._ns = ns + if ns and not title.startswith(source.namespace(ns)+u":"): + title = source.namespace(ns) + u":" + title + elif not ns and u":" in title: + pos = title.index(u':') + nsindex = source.ns_index(title[ :pos]) + if nsindex: + self._ns = nsindex + if u"#" in title: + title, self._section = title.split(u"#", 1) + else: + self._section = None + if not title: + raise pywikibot.Error( + "Page object cannot be created from Site without title.") + self._title = title + elif isinstance(source, Page): + # copy all of source's attributes to this object + self.__dict__ = source.__dict__ + if title: + # overwrite title + if ":" in title: + prefix = title[ :title.index(":")] + self._ns = site.ns_index(prefix) + if self._ns is None: + self._ns = 0 + else: + title = title[title.index(":")+1 : ].strip(" _") + self._title = "%s:%s" % ( + self.site().namespace(self._ns), + self._title) + else: + self._ns = 0 + if "#" in title: + self._section = title[title.index("#") + 1 : ].strip(" _") + title = title[ : title.index("#")].strip(" _") + self._title = title + elif isinstance(source, Link): + self._site = source.site + self._section = source.section + self._ns = source.namespace + self._title = source.title + # reassemble the canonical title from components + if self._ns: + self._title = "%s:%s" % (self.site().namespace(self._ns), + self._title) + else: + raise pywikibot.Error( + "Invalid argument type '%s' in Page constructor: %s" + % (type(source), source)) + if self._section is not None: + self._title = self._title + "#" + self._section + self._revisions = {} + + def site(self): + """Return the Site object for the wiki on which this Page resides.""" + return self._site + + def namespace(self): + """Return the number of the namespace of the page.""" + return self._ns + + @deprecate_arg("decode", None) + @deprecate_arg("savetitle", "asUrl") + def title(self, underscore=False, savetitle=False, withNamespace=True, + withSection=True, asUrl=False, asLink=False, + allowInterwiki=True, forceInterwiki=False, textlink=False, + as_filename=False): 
+ """Return the title of this Page, as a Unicode string. + + @param underscore: if true, replace all ' ' characters with '_' + @param withNamespace: if false, omit the namespace prefix + @param withSection: if false, omit the section + @param asUrl: if true, quote title as if in an URL + @param asLink: if true, return the title in the form of a wikilink + @param allowInterwiki: (only used if asLink is true) if true, format + the link as an interwiki link if necessary + @param forceInterwiki: (only used if asLink is true) if true, always + format the link as an interwiki link + @param textlink: (only used if asLink is true) if true, place a ':' + before Category: and Image: links + @param as_filename: if true, replace any characters that are unsafe + in filenames + + """ + title = self._title + if not withNamespace and self._ns != 0: + title = title.split(u':', 1)[1] + if not withSection and self._section: + title = title.split(u'#', 1)[0] + if underscore or asUrl: + title = title.replace(u' ', u'_') + if asUrl: + encodedTitle = title.encode(self.site().encoding()) + title = urllib.quote(encodedTitle) + if asLink: + if forceInterwiki or (allowInterwiki and + (self.site().family.name != config.family + or self.site().code != config.mylang)): + if self.site().family.name != config.family \ + and self.site().family.name != self.site().code: + return u'[[%s:%s:%s]]' % (self.site().family.name, + self.site().code, + self._title) + else: + # use this form for sites like commons, where the + # code is the same as the family name + return u'[[%s:%s]]' % (self.site().code, + self._title) + elif textlink and (self.isImage() or self.isCategory()): + return u'[[:%s]]' % title + else: + return u'[[%s]]' % title + if as_filename: + # Replace characters that are not possible in file names on some + # systems. + # Spaces are possible on most systems, but are bad for URLs. + for forbidden in ':*?/\ ': + title = title.replace(forbidden, '_') + return title + + @deprecate_arg("decode", None) + @deprecate_arg("underscore", None) + def section(self): + """Return the name of the section this Page refers to. + + The section is the part of the title following a '#' character, if + any. If no section is present, return None. + + """ + if self._section: + return self._section + else: + return None + + def __str__(self): + """Return a console representation of the pagelink.""" + return self.title(asLink=True, forceInterwiki=True + ).encode(sys.stderr.encoding) + + def __unicode__(self): + return self.title(asLink=True, forceInterwiki=True) + + def __repr__(self): + """Return a more complete string representation.""" + return u"%s(%s)" % (self.__class__.__name__, + self.title().encode(sys.stderr.encoding)) + + def __cmp__(self, other): + """Test for equality and inequality of Page objects. + + Page objects are "equal" if and only if they are on the same site + and have the same normalized title, including section if any. + + Page objects are sortable by namespace first, then by title. + + """ + if not isinstance(other, Page): + # especially, return -1 if other is None + return -1 + if not self.site() == other.site(): + return cmp(self.site(), other.site()) + if self.namespace() != other.namespace(): + return cmp(self.namespace(), other.namespace()) + owntitle = self.title(withNamespace=False) + othertitle = other.title(withNamespace=False) + return cmp(owntitle, othertitle) + + def __hash__(self): + # Pseudo method that makes it possible to store Page objects as keys + # in hash-tables. 
This relies on the fact that the string + # representation of an instance can not change after the construction. + return hash(unicode(self)) + + def autoFormat(self): + """Return L{date.autoFormat} dictName and value, if any. + + Value can be a year, date, etc., and dictName is 'YearBC', + 'Year_December', or another dictionary name. Please note that two + entries may have exactly the same autoFormat, but be in two + different namespaces, as some sites have categories with the + same names. Regular titles return (None, None). + + """ + if not hasattr(self, '_autoFormat'): + from pywikibot import date + self._autoFormat = date.getAutoFormat( + self.site().code, + self.title(withNamespace=False) + ) + return self._autoFormat + + def isAutoTitle(self): + """Return True if title of this Page is in the autoFormat dictionary.""" + return self.autoFormat()[0] is not None + + @deprecate_arg("throttle", None) + @deprecate_arg("nofollow_redirects", None) + @deprecate_arg("change_edit_time", None) + def get(self, force=False, get_redirect=False, sysop=False): + """Return the wiki-text of the page. + + This will retrieve the page from the server if it has not been + retrieved yet, or if force is True. This can raise the following + exceptions that should be caught by the calling code: + + - NoPage: The page does not exist + - IsRedirectPage: The page is a redirect. The argument of the + exception is the title of the page it redirects to. + - SectionError: The section does not exist on a page with a # + link + + @param force: reload all page attributes, including errors. + @param get_redirect: return the redirect text, do not follow the + redirect, do not raise an exception. + @param sysop: if the user has a sysop account, use it to retrieve + this page + + """ + if force: + # When forcing, we retry the page no matter what. Old exceptions + # do not apply any more. + for attr in ['_redirarg', '_getexception']: + if hasattr(self, attr): + delattr(self,attr) + else: + # Make sure we re-raise an exception we got on an earlier attempt + if hasattr(self, '_redirarg') and not get_redirect: + raise pywikibot.IsRedirectPage, self._redirarg + elif hasattr(self, '_getexception'): + raise self._getexception + if force or not hasattr(self, "_revid") \ + or not self._revid in self._revisions \ + or self._revisions[self._revid].text is None: + self.site().loadrevisions(self, getText=True, sysop=sysop) + # TODO: Exception handling for no-page, redirects, etc. + + return self._revisions[self._revid].text + + @deprecate_arg("throttle", None) + @deprecate_arg("nofollow_redirects", None) + @deprecate_arg("change_edit_time", None) + def getOldVersion(self, oldid, force=False, get_redirect=False, + sysop=False): + """Return text of an old revision of this page; same options as get(). + + @param oldid: The revid of the revision desired. + + """ + if force or not oldid in self._revisions \ + or self._revisions[oldid].text is None: + self.site().loadrevisions(self, getText=True, revids=oldid, + sysop=sysop) + # TODO: what about redirects, errors? 
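A minimal read-access sketch (editor's illustration, not part of the committed file) for get() and getOldVersion() as documented above; it assumes a configured user-config.py and that pywikibot.Site('en', 'wikipedia') returns a Site object, matching the Site(code, family) form used elsewhere in this module.

import pywikibot

site = pywikibot.Site('en', 'wikipedia')        # assumed factory; needs user-config.py
page = pywikibot.Page(site, u'Sandbox')         # title is a placeholder
text = page.get()                               # wikitext of the latest revision
older = page.getOldVersion(page.previousRevision())
pywikibot.output(u"Last edited by %s at %s"
                 % (page.userName(), page.editTime()))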
+ return self._revisions[oldid].text + + def permalink(self): + """Return the permalink URL for current revision of this page.""" + return "%s://%s/%sindex.php?title=%s&oldid=%s" \ + % (self.site().protocol(), + self.site().hostname(), + self.site().scriptpath(), + self.title(asUrl=True), + self.latestRevision()) + + def latestRevision(self): + """Return the current revision id for this page.""" + if not hasattr(self, '_revid'): + self.site().loadrevisions(self) + return self._revid + + def _textgetter(self): + """Return the current (edited) wikitext, loading it if necessary.""" + if not hasattr(self, '_text') or self._text is None: + try: + self._text = self.get() + except pywikibot.NoPage: + # TODO: what other exceptions might be returned? + self._text = u"" + return self._text + + def _textsetter(self, value): + """Update the edited wikitext""" + self._text = unicode(value) + + def _cleartext(self): + """Delete the edited wikitext""" + if hasattr(self, "_text"): + del self._text + + text = property(_textgetter, _textsetter, _cleartext, + "The edited wikitext (unicode) of this Page") + + def expand_text(self): + """Return the page text with all templates expanded.""" + req = pywikibot.data.api.Request(action="expandtemplates", + text=self.text, + title=self.title(withSection=False), + site=self.site()) + result = req.submit() + return result["expandtemplates"]["*"] + + def userName(self): + """Return name or IP address of last user to edit page.""" + return self._revisions[self.latestRevision()].user + + def isIpEdit(self): + """Return True if last editor was unregistered.""" + return self._revisions[self.latestRevision()].anon + + def editTime(self): + """Return timestamp (in ISO 8601 format) of last revision to page.""" + return self._revisions[self.latestRevision()].timestamp + + def previousRevision(self): + """Return the revision id for the previous revision of this Page.""" + vh = self.getVersionHistory(revCount=2) + revkey = sorted(self._revisions.keys(), reverse=True)[1] + return revkey + + def exists(self): + """Return True if page exists on the wiki, even if it's a redirect. + + If the title includes a section, return False if this section isn't + found. + + """ + return self.site().page_exists(self) + + def isRedirectPage(self): + """Return True if this is a redirect, False if not or not existing.""" + return self.site().page_isredirect(self) + + def isEmpty(self): + """Return True if the page text has less than 4 characters. + + Character count ignores language links and category links. + Can raise the same exceptions as get(). + + """ + txt = self.get() + txt = pywikibot.textlib.removeLanguageLinks(txt, site = self.site()) + txt = pywikibot.textlib.removeCategoryLinks(txt, site = self.site()) + if len(txt) < 4: + return True + else: + return False + + def isTalkPage(self): + """Return True if this page is in any talk namespace.""" + ns = self.namespace() + return ns >= 0 and ns % 2 == 1 + + def toggleTalkPage(self): + """Return other member of the article-talk page pair for this Page. + + If self is a talk page, returns the associated content page; + otherwise, returns the associated talk page. The returned page need + not actually exist on the wiki. + + Returns None if self is a special page. 
+ + """ + ns = self.namespace() + if ns < 0: # Special page + return None + if self.isTalkPage(): + if self.namespace() == 1: + return Page(self.site(), self.title(withNamespace=False)) + else: + return Page(self.site(), + self.site().namespace(ns - 1) + ':' + + self.title(withNamespace=False)) + else: + return Page(self.site(), + self.site().namespace(ns + 1) + ':' + + self.title(withNamespace=False)) + + def isCategory(self): + """Return True if the page is a Category, False otherwise.""" + return self.namespace() == 14 + + def isImage(self): + """Return True if this is an image description page, False otherwise.""" + return self.namespace() == 6 + + def isDisambig(self): + """Return True if this is a disambiguation page, False otherwise. + + Relies on the presence of specific templates, identified in + the Family file or on a wiki page, to identify disambiguation + pages. + + By default, loads a list of template names from the Family file; + if the value in the Family file is None, looks for the list on + [[MediaWiki:Disambiguationspage]]. + + """ + if not hasattr(self, "_isDisambig"): + if not hasattr(self.site(), "_disambigtemplates"): + self.site()._disambigtemplates = \ + self.site().family.disambig(self.site().code) + if self.site()._disambigtemplates is None: + try: + disambigpages = Page(self.site(), + "MediaWiki:Disambiguationspage") + self.site()._disambigtemplates = [ + link.title(withNamespace=False) + for link in disambigpages.linkedPages() + if link.namespace() == 10 + ] + except NoPage: + self.site()._disambigtemplates = ['Disambig'] + for t in self.templates(): + if t.title(withNamespace=False) in self.site()._disambigtemplates: + self._isDisambig = True + break + else: + self._isDisambig = False + return self._isDisambig + + def getReferences(self, follow_redirects=True, withTemplateInclusion=True, + onlyTemplateInclusion=False, redirectsOnly=False, + namespaces=None): + """Return an iterator all pages that refer to or embed the page. + + If you need a full list of referring pages, use + C{pages = list(s.getReferences())} + + @param follow_redirects: if True, also iterate pages that link to a + redirect pointing to the page. + @param withTemplateInclusion: if True, also iterate pages where self + is used as a template. + @param onlyTemplateInclusion: if True, only iterate pages where self + is used as a template. + @param redirectsOnly: if True, only iterate redirects to self. + @param namespaces: only iterate pages in these namespaces + + """ + # N.B.: this method intentionally overlaps with backlinks() and + # embeddedin(). Depending on the interface, it may be more efficient + # to implement those methods in the site interface and then combine + # the results for this method, or to implement this method and then + # split up the results for the others. + return self.site().pagereferences( + self, follow_redirects, redirectsOnly, + withTemplateInclusion, onlyTemplateInclusion, + namespaces) + + def backlinks(self, followRedirects=True, filterRedirects=None, + namespaces=None): + """Return an iterator for pages that link to this page. + + @param followRedirects: if True, also iterate pages that link to a + redirect pointing to the page. 
+ @param filterRedirects: if True, only iterate redirects; if False, + omit redirects; if None, do not filter + @param namespaces: only iterate pages in these namespaces + + """ + return self.site().pagebacklinks(self, followRedirects, filterRedirects, + namespaces) + + def embeddedin(self, filter_redirects=None, namespaces=None): + """Return an iterator for pages that embed this page as a template. + + @param filterRedirects: if True, only iterate redirects; if False, + omit redirects; if None, do not filter + @param namespaces: only iterate pages in these namespaces + + """ + return self.site().page_embeddedin(self, filter_redirects, namespaces) + + def canBeEdited(self): + """Return bool indicating whether this page can be edited. + + This returns True if and only if: + - page is unprotected, and bot has an account for this site, or + - page is protected, and bot has a sysop account for this site. + + """ + return self.site().page_can_be_edited(self) + + def botMayEdit(self): + """Return True if this page allows bots to edit it. + + This will be True if the page doesn't contain {{bots}} or + {{nobots}}, or it contains them and the active bot is allowed to + edit this page. (This method is only useful on those sites that + recognize the bot-exclusion protocol; on other sites, it will always + return True.) + + The framework enforces this restriction by default. It is possible + to override this by setting ignore_bot_templates=True in + user_config.py, or using page.put(force=True). + + """ # TODO: move this to Site object? + if config.ignore_bot_templates: #Check the "master ignore switch" + return True + try: + templates = self.templatesWithParams(); + except (pywikibot.NoPage, + pywikibot.IsRedirectPage, + pywikibot.SectionError): + return True + for template in templates: + title = template[0].title(withNamespace=False) + if title == 'Nobots': + return False + elif title == 'Bots': + if len(template[1]) == 0: + return True + else: + (ttype, bots) = template[1][0].split('=', 1) + bots = bots.split(',') + if ttype == 'allow': + if 'all' in bots or username in bots: + return True + else: + return False + if ttype == 'deny': + if 'all' in bots or username in bots: + return False + else: + return True + # no restricting template found + return True + + def save(self, comment=None, watch=None, minor=True, force=False, + async=False, callback=None): + """Save the current contents of page's text to the wiki. + + @param comment: The edit summary for the modification (optional, but + most wikis strongly encourage its use) + @type comment: unicode + @param watch: if True, add or if False, remove this Page to/from bot + user's watchlist; if None, leave watchlist status unchanged + @type watch: bool or None + @param minor: if True, mark this edit as minor + @type minor: bool + @param force: if True, ignore botMayEdit() setting + @type force: bool + @param async: if True, launch a separate thread to save + asynchronously + @param callback: a callable object that will be called after the + page put operation. This object must take two arguments: (1) a + Page object, and (2) an exception instance, which will be None + if the page was saved successfully. The callback is intended for + use by bots that need to keep track of which saves were + successful. 
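An illustrative editing sketch (not from the committed file) built on the text property, save(), and the callback contract described above; the page title and edit summary are placeholders, and a logged-in bot account is assumed.

def report(page, err):
    # save() calls back with the Page and an exception instance (None on success)
    if err is None:
        pywikibot.output(u"Saved %s" % page.title(asLink=True))
    else:
        pywikibot.output(u"Not saved %s: %s" % (page.title(asLink=True), err))

page = pywikibot.Page(pywikibot.Site('en', 'wikipedia'), u'Sandbox')
page.text += u"\n\ntest edit"                   # edit the cached wikitext
page.save(comment=u"Bot: test edit", minor=True, callback=report)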
+ + """ + if not comment: + comment = pywikibot.default_comment # needs to be defined + if watch is None: + unwatch = False + watch = False + else: + unwatch = not watch + if not force and not self.botMayEdit: + raise pywikibot.PageNotSaved( + "Page %s not saved; editing restricted by {{bots}} template" + % self.title(asLink=True)) + if async: + thd = threading.Thread( + target=self._save, + args=(comment, minor, watch, unwatch, callback) + ) + pywikibot.threadpool.append(thd) + thd.start() + else: + self._save(comment, minor, watch, unwatch, callback) + + def _save(self, comment, minor, watch, unwatch, callback): + err = None + try: + done = self.site().editpage(self, summary=comment, minor=minor, + watch=watch, unwatch=unwatch) + if not done: + logger.warn("Page %s not saved" % self.title(asLink=True)) + else: + logger.info("Page %s saved" % self.title(asLink=True)) + except pywikibot.Error, err: + logger.exception("Error saving page %s" % self.title(asLink=True)) + if callback: + callback(self, err) + + def put(self, newtext, comment=u'', watchArticle=None, minorEdit=True, + force=False, async=False, callback=None): + """Save the page with the contents of the first argument as the text. + + This method is maintained primarily for backwards-compatibility. + For new code, using Page.save() is preferred. See save() method + docs for all parameters not listed here. + + @param newtext: The complete text of the revised page. + @type newtext: unicode + + """ + self.text = newtext + return self.save(comment, watchArticle, minorEdit, force, + async, callback) + + def put_async(self, newtext, comment=u'', watchArticle=None, + minorEdit=True, force=False, callback=None): + """Put page on queue to be saved to wiki asynchronously. + + Asynchronous version of put (takes the same arguments), which places + pages on a queue to be saved by a daemon thread. All arguments are + the same as for .put(). This version is maintained solely for + backwards-compatibility. + + """ + return self.put(self, newtext, comment, watchArticle, + minorEdit, force, callback, async=True) + + def linkedPages(self): + """Iterate Pages that this Page links to. + + Only returns pages from "normal" internal links. Image and category + links are omitted unless prefixed with ":". Embedded templates are + omitted (but links within them are returned). All interwiki and + external links are omitted. + + @return: a generator that yields Page objects. + + """ + return self.site().pagelinks(self) + + def interwiki(self, expand=True): + """Iterate interwiki links in the page text, excluding language links. 
+ + @param expand: if True (default), include interwiki links found in + templates transcluded onto this page; if False, only iterate + interwiki links found in this page's own wikitext + @return: a generator that yields Link objects + + """ + # This function does not exist in the API, so it has to be + # implemented by screen-scraping + if expand: + text = self.expand_text() + else: + text = self.text + for linkmatch in pywikibot.link_regex.finditer( + pywikibot.textlib.removeDisabledParts(text)): + linktitle = linkmatch.group("title") + link = Link(linktitle, self.site()) + # only yield links that are to a different site and that + # are not language links + try: + if link.site != self.site(): + if linktitle.lstrip().startswith(":"): + # initial ":" indicates not a language link + yield link + elif link.site.family != self.site().family: + # link to a different family is not a language link + yield link + except pywikibot.Error: + # ignore any links with invalid contents + continue + + def langlinks(self): + """Iterate all interlanguage links on this page. + + @return: a generator that yields Link objects. + + """ + return self.site().pagelanglinks(self) + + @deprecate_arg("followRedirects", None) + @deprecate_arg("loose", None) + def imagelinks(self, followRedirects=None, loose=None): + """Iterate ImagePage objects for images displayed on this Page. + + @return: a generator that yields ImagePage objects. + + """ + return self.site().pageimages(self) + + def templates(self): + """Iterate Page objects for templates used on this Page. + + Template parameters are ignored. This method only returns embedded + templates, not template pages that happen to be referenced through + a normal link. + + """ + return self.site().pagetemplates(self) + + def templatesWithParams(self): + """Iterate templates used on this Page. + + @return: a generator that yields a tuple for each use of a template + in the page, with the template Page as the first entry and a list of + parameters as the second entry. + + """ + templates = pywikibot.textlib.extract_templates_and_params(self.text) + # backwards-compatibility: convert the dict returned as the second + # element into a list in the format used by old scripts + result = [] + for template in templates: + args = template[1] + positional = [] + named = {} + for key in sorted(args.keys()): + try: + int(key) + except ValueError: + named[key] = args[key] + else: + positional.append(args[key]) + for name in named: + positional.append("%s=%s" % (name, named[name])) + result.append((pywikibot.Page( + pywikibot.Link(template[0], self.site())), + positional)) + return result + + @deprecate_arg("nofollow_redirects", None) + def categories(self, withSortKey=False): + """Iterate categories that the article is in. + + @param withSortKey: if True, include the sort key in each Category. + @return: a generator that yields Category objects. + + """ + return self.site().pagecategories(self, withSortKey=withSortKey) + + def extlinks(self): + """Iterate all external URLs (not interwiki links) from this page. + + @return: a generator that yields unicode objects containing URLs. + + """ + return self.site().page_extlinks(self) + + def getRedirectTarget(self): + """Return a Page object for the target this Page redirects to. + + If this page is not a redirect page, will raise an IsNotRedirectPage + exception. This method also can raise a NoPage exception. 
+ + """ + if not self.isRedirectPage(): + raise pywikibot.IsNotRedirectPage + if not isinstance(self._redir, Page): + self.site().getredirtarget(self) + return self._redir + + @deprecate_arg("forceReload", None) + def getVersionHistory(self, reverseOrder=False, getAll=False, + revCount=500): + """Load the version history page and return history information. + + Return value is a list of tuples, where each tuple represents one + edit and is built of revision id, edit date/time, user name, and + edit summary. Starts with the most current revision, unless + reverseOrder is True. Defaults to getting the first revCount edits, + unless getAll is True. + + """ + if getAll: + limit = None + else: + limit = revCount + self.site().loadrevisions(self, getText=False, rvdir=reverseOrder, + limit=limit) + if getAll: + revCount = len(self._revisions) + return [ ( self._revisions[rev].revid, + self._revisions[rev].timestamp, + self._revisions[rev].user, + self._revisions[rev].comment + ) for rev in sorted(self._revisions.keys(), + reverse=not reverseOrder)[ : revCount] + ] + + def getVersionHistoryTable(self, forceReload=False, reverseOrder=False, + getAll=False, revCount=500): + """Return the version history as a wiki table.""" + result = '{| border="1"\n' + result += '! oldid || date/time || username || edit summary\n' + for oldid, time, username, summary \ + in self.getVersionHistory(forceReload=forceReload, + reverseOrder=reverseOrder, + getAll=getAll, revCount=revCount): + result += '|----\n' + result += '| %s || %s || %s || <nowiki>%s</nowiki>\n'\ + % (oldid, time, username, summary) + result += '|}\n' + return result + + def fullVersionHistory(self): + """Iterate all previous versions including wikitext. + + @return: A generator that yields tuples consisting of revision ID, + edit date/time, user name and content + """ + return self.site().loadrevisions(self, withText=True) + + def contributingUsers(self): + """Return a set of usernames (or IPs) of users who edited this page.""" + edits = self.getVersionHistory() + users = set([edit[2] for edit in edits]) + return users + + @deprecate_arg("throttle", None) + def move(self, newtitle, reason=None, movetalkpage=True, sysop=False, + deleteAndMove=False, safe=True): + """Move this page to a new title. + + @param newtitle: The new page title. + @param reason: The edit summary for the move. + @param movetalkpage: If true, move this page's talk page (if it exists) + @param sysop: Try to move using sysop account, if available + @param deleteAndMove: if move succeeds, delete the old page + (usually requires sysop privileges, depending on wiki settings) + @param safe: If false, attempt to delete existing page at newtitle + (if there is one) and then move this page to that title + + """ + if reason is None: + logger.info(u'Moving %s to [[%s]].' + % (self.title(asLink=True), newtitle)) + reason = pywikibot.input(u'Please enter a reason for the move:') + # TODO: implement "safe" parameter + # TODO: implement "sysop" parameter + return self.site().movepage(self, newtitle, reason, + movetalk=movetalkpage, + noredirect=deleteAndMove) + + @deprecate_arg("throttle", None) + def delete(self, reason=None, prompt=True, throttle=None, mark=False): + """Deletes the page from the wiki. Requires administrator status. + + @param reason: The edit summary for the deletion. + @param prompt: If true, prompt user for confirmation before deleting. + @param mark: if true, and user does not have sysop rights, place a + speedy-deletion request on the page instead. 
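A short sketch (editor's illustration, not from the committed file) of the history accessors documented above; getVersionHistory() returns (revid, timestamp, username, summary) tuples, newest first by default.

page = pywikibot.Page(pywikibot.Site('en', 'wikipedia'), u'Sandbox')
for revid, timestamp, username, summary in page.getVersionHistory(revCount=10):
    pywikibot.output(u"%s | %s | %s | %s" % (revid, timestamp, username, summary))
pywikibot.output(u"Contributors: %s" % u", ".join(page.contributingUsers()))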
+ + """ + if reason is None: + logger.info(u'Deleting %s.' % (self.title(asLink=True))) + reason = pywikibot.input(u'Please enter a reason for the deletion:') + answer = u'y' + if prompt and not hasattr(self.site(), '_noDeletePrompt'): + answer = pywikibot.inputChoice(u'Do you want to delete %s?' + % self.title(asLink = True, forceInterwiki = True), + ['Yes', 'No', 'All'], + ['Y', 'N', 'A'], + 'N') + if answer in ['a', 'A']: + answer = 'y' + self.site()._noDeletePrompt = True + if answer in ['y', 'Y']: + return self.site().delete(self, reason, mark=mark) + + def loadDeletedRevisions(self): + """Retrieve all deleted revisions for this Page from Special/Undelete. + + Stores all revisions' timestamps, dates, editors and comments in + self._deletedRevs attribute. + + @return: list of timestamps (which can be used to retrieve revisions + later on). + + """ + return self.site().loadDeletedRevisions(self) + + def getDeletedRevision(self, timestamp, retrieveText=False): + """Return a particular deleted revision by timestamp. + + @return: a list of [date, editor, comment, text, restoration + marker]. text will be None, unless retrieveText is True (or has + been retrieved earlier). If timestamp is not found, returns + None. + + """ + return self.site().getDeletedRevision(self, timestamp, + getText=retrieveText) + + def markDeletedRevision(self, timestamp, undelete=True): + """Mark the revision identified by timestamp for undeletion. + + @param undelete: if False, mark the revision to remain deleted. + + """ + if self._deletedRevs == None: + self.loadDeletedRevisions() + if not self._deletedRevs.has_key(timestamp): + #TODO: Throw an exception? + return None + self._deletedRevs[timestamp][4] = undelete + self._deletedRevsModified = True + + @deprecate_arg("throttle", None) + def undelete(self, comment=None): + """Undelete revisions based on the markers set by previous calls. + + If no calls have been made since loadDeletedRevisions(), everything + will be restored. + + Simplest case:: + Page(...).undelete('This will restore all revisions') + + More complex:: + pg = Page(...) + revs = pg.loadDeletedRevsions() + for rev in revs: + if ... #decide whether to undelete a revision + pg.markDeletedRevision(rev) #mark for undeletion + pg.undelete('This will restore only selected revisions.') + + @param comment: The undeletion edit summary. + + """ + if comment is None: + logger.info(u'Preparing to undelete %s.' + % (self.title(asLink=True))) + comment = pywikibot.input( + u'Please enter a reason for the undeletion:') + return self.site().undelete(self, comment) + + @deprecate_arg("throttle", None) + def protect(self, edit='sysop', move='sysop', unprotect=False, + reason=None, prompt=True): + """(Un)protect a wiki page. Requires administrator status. + + Valid protection levels (in MediaWiki 1.12) are '' (equivalent to + 'none'), 'autoconfirmed', and 'sysop'. + + @param edit: Level of edit protection + @param move: Level of move protection + @param unprotect: If true, unprotect the page (equivalent to setting + all protection levels to '') + @param reason: Edit summary. + @param prompt: If true, ask user for confirmation. + + """ + if reason is None: + if unprotect: + un = u'un' + else: + un = u'' + logger.info(u'Preparing to %sprotect %s.' 
+ % (un, self.title(asLink=True))) + reason = pywikibot.input(u'Please enter a reason for the action:') + if unprotect: + edit = move = "" + answer = 'y' + if prompt and not hasattr(self.site(), '_noProtectPrompt'): + answer = pywikibot.inputChoice( + u'Do you want to change the protection level of %s?' + % self.title(asLink=True, forceInterwiki = True), + ['Yes', 'No', 'All'], ['Y', 'N', 'A'], 'N') + if answer in ['a', 'A']: + answer = 'y' + self.site()._noProtectPrompt = True + if answer in ['y', 'Y']: + return self.site().protect(self, edit, move, reason) + + def change_category(article, oldCat, newCat, comment=None, sortKey=None, + inPlace=True): + """Remove page from oldCat and add it to newCat. + + oldCat and newCat should be Category objects. + If newCat is None, the category will be removed. + + """ # TODO: document remaining arguments + cats = self.categories(get_redirect=True) + site = self.site() + changesMade = False + + if not self.canBeEdited(): + pywikibot.output(u"Can't edit %s, skipping it..." + % self.title(asLink=True)) + return False + if inPlace == True: + newtext = pywikibot.textlib.replaceCategoryInPlace( + self.text, oldCat, newCat) + if newtext == self.text: + pywikibot.output( + u'No changes in made in page %s.' + % self.title(asLink=True)) + return False + try: + self.put(newtext, comment) + return True + except pywikibot.EditConflict: + pywikibot.output( + u'Skipping %s because of edit conflict' + % self.title(asLink=True)) + except pywikibot.LockedPage: + pywikibot.output(u'Skipping locked page %s' + % self.title(asLink=True)) + except pywikibot.SpamfilterError, error: + pywikibot.output( + u'Changing page %s blocked by spam filter (URL=%s)' + % (self.title(asLink=True), error.url)) + except pywikibot.NoUsername: + pywikibot.output( + u"Page %s not saved; sysop privileges required." + % self.title(asLink=True)) + except pywikibot.PageNotSaved, error: + pywikibot.output(u"Saving page %s failed: %s" + % (self.title(asLink=True), error.message)) + return False + + # This loop will replace all occurrences of the category to be changed, + # and remove duplicates. + newCatList = [] + newCatSet = set() + for i in range(len(cats)): + cat = cats[i] + if cat == oldCat: + changesMade = True + if not sortKey: + sortKey = cat.sortKey + if newCat: + if newCat.title() not in newCatSet: + newCategory = Category(site, newCat.title(), + sortKey=sortKey) + newCatSet.add(newCat.title()) + newCatList.append(newCategory) + elif cat.title() not in newCatSet: + newCatSet.add(cat.title()) + newCatList.append(cat) + + if not changesMade: + pywikibot.output(u'ERROR: %s is not in category %s!' + % (self.title(asLink=True), oldCat.title())) + else: + try: + text = pywikibot.textlib.replaceCategoryLinks(self.text, + newCatList) + except ValueError: + # Make sure that the only way replaceCategoryLinks() can return + # a ValueError is in the case of interwiki links to self. 
+ pywikibot.output( + u'Skipping %s because of interwiki link to self' % self) + try: + self.put(text, comment) + except pywikibot.EditConflict: + pywikibot.output( + u'Skipping %s because of edit conflict' % self.title()) + except pywikibot.SpamfilterError, e: + pywikibot.output( + u'Skipping %s because of blacklist entry %s' + % (self.title(), e.url)) + except pywikibot.LockedPage: + pywikibot.output( + u'Skipping %s because page is locked' % self.title()) + except pywikibot.PageNotSaved, error: + pywikibot.output(u"Saving page %s failed: %s" + % (self.title(asLink=True), error.message)) + +######## DEPRECATED METHODS ######## + + def encoding(self): + """DEPRECATED: use Site.encoding() instead""" + logger.debug(u"Page.encoding() is deprecated; use Site.encoding().") + return self.site().encoding() + + def titleWithoutNamespace(self, underscore=False): + """DEPRECATED: use self.title(withNamespace=False) instead.""" + logger.debug( + u"Page.titleWithoutNamespace() method is deprecated.") + return self.title(underscore=underscore, withNamespace=False, + withSection=False) + + def titleForFilename(self): + """DEPRECATED: use self.title(as_filename=True) instead.""" + logger.debug( + u"Page.titleForFilename() method is deprecated.") + return self.title(as_filename=True) + + def sectionFreeTitle(self, underscore=False): + """DEPRECATED: use self.title(withSection=False) instead.""" + logger.debug( + u"Page.sectionFreeTitle() method is deprecated.") + return self.title(underscore=underscore, withSection=False) + + def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False): + """DEPRECATED: use self.title(asLink=True) instead.""" + logger.debug(u"Page.aslink() method is deprecated.") + return self.title(asLink=True, forceInterwiki=forceInterwiki, + allowInterwiki=not noInterwiki, textlink=textlink) + + def urlname(self): + """Return the Page title encoded for use in an URL. + + DEPRECATED: use self.title(asUrl=True) instead. + + """ + logger.debug(u"Page.urlname() method is deprecated.") + return self.title(asUrl=True) + +####### DISABLED METHODS (warnings provided) ###### + # these methods are easily replaced by editing the page's text using + # textlib methods and then using put() on the result. + + def removeImage(self, image, put=False, summary=None, safe=True): + """Old method to remove all instances of an image from page.""" + logger.warning(u"Page.removeImage() is no longer supported.") + + def replaceImage(self, image, replacement=None, put=False, summary=None, + safe=True): + """Old method to replace all instances of an image with another.""" + logger.warning(u"Page.replaceImage() is no longer supported.") + + +class ImagePage(Page): + """A subclass of Page representing an image descriptor wiki page. + + Supports the same interface as Page, with the following added methods: + + getImagePageHtml : Download image page and return raw HTML text. + fileURL : Return the URL for the image described on this + page. + fileIsOnCommons : Return True if image stored on Wikimedia + Commons. + fileIsShared : Return True if image stored on Wikitravel + shared repository. + getFileMd5Sum : Return image file's MD5 checksum. + getFileVersionHistory : Return the image file's version history. + getFileVersionHistoryTable: Return the version history in the form of a + wiki table. + usingPages : Iterate Pages on which the image is displayed. 
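An illustrative sketch of the ImagePage accessors listed above (editor's example, not from the committed file); it assumes ImagePage is exported from the pywikibot package in the same way as Page, and the file title is a placeholder.

image = pywikibot.ImagePage(pywikibot.Site('en', 'wikipedia'),
                            u'Image:Example.jpg')   # package-level export assumed
pywikibot.output(image.fileUrl())
if not image.fileIsOnCommons():
    for page in image.usingPages():
        pywikibot.output(u"Used on %s" % page.title())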
+ + """ + def __init__(self, source, title=u"", insite=None): + Page.__init__(self, source, title, 6) + if self.namespace() != 6: + raise ValueError(u"'%s' is not in the image namespace!" % title) + + def getImagePageHtml(self): + """ + Download the image page, and return the HTML, as a unicode string. + + Caches the HTML code, so that if you run this method twice on the + same ImagePage object, the page will only be downloaded once. + """ + if not hasattr(self, '_imagePageHtml'): + from pywikibot.data import http + path = "%s/index.php?title=%s" \ + % (self.site().scriptpath(), self.title(asUrl=True)) + self._imagePageHtml = http.request(self.site(), path) + return self._imagePageHtml + + def fileUrl(self): + """Return the URL for the image described on this page.""" + # TODO add scaling option? + if not hasattr(self, '_imageinfo'): + self._imageinfo = self.site().getimageinfo(self) #FIXME + return self._imageinfo['url'] + + def fileIsOnCommons(self): + """Return True if the image is stored on Wikimedia Commons""" + return self.fileUrl().startswith( + 'http://upload.wikimedia.org/wikipedia/commons/') + + def fileIsShared(self): + """Return True if image is stored on any known shared repository.""" + # as of now, the only known repositories are commons and wikitravel + if 'wikitravel_shared' in self.site().shared_image_repository(): + return self.fileUrl().startswith( + u'http://wikitravel.org/upload/shared/') + return self.fileIsOnCommons() + + def getFileMd5Sum(self): + """Return image file's MD5 checksum.""" + logger.debug( + "ImagePage.getFileMd5Sum() is deprecated; use getFileSHA1Sum().") +# FIXME: MD5 might be performed on incomplete file due to server disconnection +# (see bug #1795683). + import md5, urllib + f = urllib.urlopen(self.fileUrl()) + # TODO: check whether this needs a User-Agent header added + md5Checksum = md5.new(f.read()).hexdigest() + f.close() + return md5Checksum + + def getFileSHA1Sum(self): + """Return image file's SHA1 checksum.""" + if not hasattr(self, '_imageinfo'): + self._imageinfo = self.site().getimageinfo(self) #FIXME + return self._imageinfo['sha1'] + + def getFileVersionHistory(self): + """Return the image file's version history. + + @return: An iterator yielding tuples containing (timestamp, + username, resolution, filesize, comment). + + """ + #TODO; return value may need to change + return self.site().getimageinfo(self, history=True) #FIXME + + def getFileVersionHistoryTable(self): + """Return the version history in the form of a wiki table.""" + lines = [] + #TODO: if getFileVersionHistory changes, make sure this follows it + for (datetime, username, resolution, size, comment) \ + in self.getFileVersionHistory(): + lines.append('| %s || %s || %s || %s || <nowiki>%s</nowiki>' \ + % (datetime, username, resolution, size, comment)) + return u'{| border="1"\n! date/time || username || resolution || size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}' + + def usingPages(self): + """Yield Pages on which the image is displayed.""" + return self.site().getimageusage(self) + + +class Category(Page): + """A page in the Category: namespace""" + + @deprecate_arg("sortKey", None) + def __init__(self, source, title=u"", insite=None): + """All parameters are the same as for Page() constructor. + + """ + Page.__init__(self, source, title, 14) + if self.namespace() != 14: + raise ValueError(u"'%s' is not in the category namespace!" 
+ % title) + + @deprecate_arg("forceInterwiki", None) + @deprecate_arg("textlink", None) + @deprecate_arg("noInterwiki", None) + def aslink(self, sortKey=u''): + """Return a link to place a page in this Category. + + Use this only to generate a "true" category link, not for interwikis + or text links to category pages. + + @param sortKey: The sort key for the article to be placed in this + Category; if omitted, default sort key is used. + @type sortKey: (optional) unicode + + """ + if sortKey: + titleWithSortKey = '%s|%s' % (self.title(withSection=False), + self.sortKey) + else: + titleWithSortKey = self.title(withSection=False) + return '[[%s]]' % titleWithSortKey + + @deprecate_arg("startFrom", None) + @deprecate_arg("cacheResults", None) + def subcategories(self, recurse=False): + """Iterate all subcategories of the current category. + + @param recurse: if not False or 0, also iterate subcategories of + subcategories. If an int, limit recursion to this number of + levels. (Example: recurse=1 will iterate direct subcats and + first-level sub-sub-cats, but no deeper.) + @type recurse: int or bool + + """ + if not isinstance(recurse, bool) and recurse: + recurse = recurse - 1 + if not hasattr(self, "_subcats"): + self._subcats = [] + for member in self.site().categorymembers(self, namespaces=[14]): + subcat = Category(self.site(), member.title()) + self._subcats.append(subcat) + yield subcat + if recurse: + for item in subcat.subcategories(recurse): + yield item + else: + for subcat in self._subcats: + yield subcat + if recurse: + for item in subcat.subcategories(recurse): + yield item + + @deprecate_arg("startFrom", None) + def articles(self, recurse=False): + """ + Yields all articles in the current category. + + @param recurse: if not False or 0, also iterate articles in + subcategories. If an int, limit recursion to this number of + levels. (Example: recurse=1 will iterate articles in first-level + subcats, but no deeper.) + @type recurse: int or bool + + """ + namespaces = [x for x in self.site().namespaces().keys() + if x>=0 and x!=14] + for member in self.site().categorymembers(self, + namespaces=namespaces): + yield member + if recurse: + if not isinstance(recurse, bool) and recurse: + recurse = recurse - 1 + for subcat in self.subcategories(): + for article in subcat.articles(recurse): + yield article + + def isEmptyCategory(self): + """Return True if category has no members (including subcategories).""" + for member in self.site().categorymembers(self, limit=1): + return False + return True + + def copyTo(self, catname): + """ + Copy text of category page to a new page. Does not move contents. + + @param catname: New category title (without namespace) + @return: True if copying was successful, False if target page + already existed. + + """ + # This seems far too specialized to be in the top-level framework + catname = self.site().category_namespace() + ':' + catname + targetCat = Category(self.site(), catname) + if targetCat.exists(): + logger.warn('Target page %s already exists!' + % targetCat.title()) + return False + else: + logger.info('Moving text from %s to %s.' + % (self.title(), targetCat.title())) + authors = ', '.join(self.contributingUsers()) + creationSummary = pywikibot.translate( + self.site(), msg_created_for_renaming + ) % (self.title(), authors) + targetCat.put(self.get(), creationSummary) + return True + + def copyAndKeep(self, catname, cfdTemplates): + """Copy partial category page text (not contents) to a new title. 
+ + Like copyTo above, except this removes a list of templates (like + deletion templates) that appear in the old category text. It also + removes all text between the two HTML comments BEGIN CFD TEMPLATE + and END CFD TEMPLATE. (This is to deal with CFD templates that are + substituted.) + + Returns true if copying was successful, false if target page already + existed. + + @param catname: New category title (without namespace) + @param cfdTemplates: A list (or iterator) of templates to be removed + from the page text + @return: True if copying was successful, False if target page + already existed. + + """ + # I don't see why we need this as part of the framework either + catname = self.site().category_namespace() + ':' + catname + targetCat = Category(self.site(), catname) + if targetCat.exists(): + logger.warn('Target page %s already exists!' + % targetCat.title()) + return False + else: + logger.info('Moving text from %s to %s.' + % (self.title(), targetCat.title())) + authors = ', '.join(self.contributingUsers()) + creationSummary = pywikibot.translate( + self.site(), msg_created_for_renaming + ) % (self.title(), authors) + newtext = self.get() + for regexName in cfdTemplates: + matchcfd = re.compile(r"{{%s.*?}}" % regexName, re.IGNORECASE) + newtext = matchcfd.sub('',newtext) + matchcomment = re.compile( + r"<!--BEGIN CFD TEMPLATE-->.*?<!--END CFD TEMPLATE-->", + re.IGNORECASE | re.MULTILINE | re.DOTALL) + newtext = matchcomment.sub('', newtext) + pos = 0 + while (newtext[pos:pos+1] == "\n"): + pos = pos + 1 + newtext = newtext[pos:] + targetCat.put(newtext, creationSummary) + return True + +#### DEPRECATED METHODS #### + def subcategoriesList(self, recurse=False): + """DEPRECATED: Equivalent to list(self.subcategories(...))""" + logger.debug("Category.subcategoriesList() method is deprecated.") + return sorted(list(set(self.subcategories(recurse)))) + + def articlesList(self, recurse=False): + """DEPRECATED: equivalent to list(self.articles(...))""" + logger.debug("Category.articlesList() method is deprecated.") + return sorted(list(set(self.articles(recurse)))) + + def supercategories(self): + """DEPRECATED: equivalent to self.categories()""" + logger.debug("Category.supercategories() method is deprecated.") + return self.categories() + + def supercategoriesList(self): + """DEPRECATED: equivalent to list(self.categories(...))""" + logger.debug("Category.articlesList() method is deprecated.") + return sorted(list(set(self.categories()))) + + +class Revision(object): + """A structure holding information about a single revision of a Page.""" + def __init__(self, revid, timestamp, user, anon=False, comment=u"", + text=None, minor=False): + """All parameters correspond to object attributes (e.g., revid + parameter is stored as self.revid) + + @param revid: Revision id number + @type revid: int + @param text: Revision wikitext. 
+ @type text: unicode, or None if text not yet retrieved + @param timestamp: Revision time stamp (in ISO 8601 format) + @type timestamp: unicode + @param user: user who edited this revision + @type user: unicode + @param anon: user is unregistered + @type anon: bool + @param comment: edit comment text + @type comment: unicode + @param minor: edit flagged as minor + @type minor: bool + + """ + self.revid = revid + self.text = text + self.timestamp = timestamp + self.user = user + self.anon = anon + self.comment = comment + self.minor = minor + + +class Link(object): + """A Mediawiki link (local or interwiki) + + Has the following attributes: + + - site: The Site object for the wiki linked to + - namespace: The namespace of the page linked to (int) + - title: The title of the page linked to (unicode); does not include + namespace or section + - section: The section of the page linked to (unicode or None); this + contains any text following a '#' character in the title + - anchor: The anchor text (unicode or None); this contains any text + following a '|' character inside the link + + """ + illegal_titles_pattern = re.compile( + # Matching titles will be held as illegal. + u'''[^ %!"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\u0080-\uFFFF+]''' + # URL percent encoding sequences interfere with the ability + # to round-trip titles -- you can't link to them consistently. + u'|%[0-9A-Fa-f]{2}' + # XML/HTML character references produce similar issues. + u'|&[A-Za-z0-9\x80-\xff]+;' + u'|&#[0-9]+;' + u'|&#x[0-9A-Fa-f]+;' + ) + + def __init__(self, text, source=None, defaultNamespace=0): + """Constructor + + @param text: the link text (everything appearing between [[ and ]] + on a wiki page) + @type text: unicode + @param source: the Site on which the link was found (not necessarily + the site to which the link refers) + @type source: Site + @param defaultNamespace: a namespace to use if the link does not + contain one (defaults to 0) + @type defaultNamespace: int + + """ + self._text = text + self._source = source + self._defaultns = defaultNamespace + + def parse(self): + """Parse text; called internally when accessing attributes""" + + # First remove the anchor, which is stored unchanged, if there is one + if u"|" in self._text: + self._text, self._anchor = self._text.split(u"|", 1) + else: + self._anchor = None + + if self._source is None: + self._source = pywikibot.Site() + self._site = self._source + + # Clean up the name, it can come from anywhere. + # Convert HTML entities to unicode + t = html2unicode(self._text) + + # Convert URL-encoded characters to unicode + t = url2unicode(t, site=self._site) + + # Normalize unicode string to a NFC (composed) format to allow proper + # string comparisons. According to + # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/no... + # the mediawiki code normalizes everything to NFC, not NFKC (which + # might result in information loss). + t = unicodedata.normalize('NFC', t) + + # This code was adapted from Title.php : secureAndSplit() + # + if u'\ufffd' in t: + raise pywikibot.Error("Title contains illegal char (\uFFFD)") + self._namespace = self._defaultns + + # Replace underscores by spaces + t = t.replace(u"_", u" ") + # replace multiple spaces and underscores with a single space + while u" " in t: t = t.replace(u" ", u" ") + # Strip spaces at both ends + t = t.strip(" ") + # Remove left-to-right and right-to-left markers. 
+ t = t.replace(u"\u200e", u"").replace(u"\u200f", u"") + + firstPass = True + while u":" in t: + # Initial colon indicates main namespace rather than default + if t.startswith(u":"): + self._namespace = 0 + # remove the colon but continue processing + # remove any subsequent whitespace + t = t.lstrip(u":").lstrip(u" ") + continue + + fam = self._site.family + prefix = t[ :t.index(u":")].lower() + ns = self._site.ns_index(prefix) + if ns: + # Ordinary namespace + t = t[t.index(u":"): ].lstrip(u":").lstrip(u" ") + self._namespace = ns + break + if prefix in fam.langs.keys()\ + or prefix in fam.get_known_families(site=self._site): + # looks like an interwiki link + if not firstPass: + # Can't make a local interwiki link to an interwiki link. + raise pywikibot.Error( + "Improperly formatted interwiki link '%s'" + % self._text) + t = t[t.index(u":"): ].lstrip(u":").lstrip(u" ") + if prefix in fam.langs.keys(): + newsite = pywikibot.Site(prefix, fam) + else: + otherlang = self._site.code + familyName = fam.get_known_families(site=self._site)[prefix] + if familyName in ['commons', 'meta']: + otherlang = familyName + try: + newsite = pywikibot.Site(otherlang, familyName) + except ValueError: + raise pywikibot.Error("""\ +%s is not a local page on %s, and the %s family is +not supported by PyWikiBot!""" + % (title, self._site(), familyName)) + + # Redundant interwiki prefix to the local wiki + if newsite == self._site: + if not t: + # Can't have an empty self-link + raise pywikibot.Error( + "Invalid link title: '%s'" % self._text) + firstPass = False + continue + self._site = newsite + else: + break # text before : doesn't match any known prefix + + if u"#" in t: + t, sec = t.split(u'#', 1) + t, self._section = t.rstrip(), sec.lstrip() + else: + self._section = None + + # Reject illegal characters. + m = Link.illegal_titles_pattern.search(t) + if m: + raise pywikibot.Error( + u"Invalid title: contains illegal char(s) '%s'" % m.group(0)) + + # Pages with "/./" or "/../" appearing in the URLs will + # often be unreachable due to the way web browsers deal + #* with 'relative' URLs. Forbid them explicitly. + + if u'.' in t and ( + t == u'.' or t == u'..' + or t.startswith(u"./") + or t.startswith(u"../") + or u"/./" in t + or u"/../" in t + or t.endswith(u"/.") + or t.endswith(u"/..") + ): + raise pywikibot.Error( + "Invalid title (contains . / combinations): '%s'" + % self._text) + + # Magic tilde sequences? Nu-uh! + if u"~~~" in t: + raise pywikibot.Error("Invalid title (contains ~~~): '%s'" % self._text) + + if self._namespace != -1 and len(t) > 255: + raise pywikibot.Error("Invalid title (over 255 bytes): '%s'" % t) + + if self._site.case() == 'first-letter': + t = t[:1].upper() + t[1:] + + # Can't make a link to a namespace alone... + # "empty" local links can only be self-links + # with a fragment identifier. 
+ if not t and self._site == self._source and self._namespace != 0: + raise ValueError("Invalid link (no page title): '%s'" % self._text) + + self._title = t + + # define attributes, to be evaluated lazily + + @property + def site(self): + if not hasattr(self, "_site"): + self.parse() + return self._site + + @property + def namespace(self): + if not hasattr(self, "_namespace"): + self.parse() + return self._namespace + + @property + def title(self): + if not hasattr(self, "_title"): + self.parse() + return self._title + + @property + def section(self): + if not hasattr(self, "_section"): + self.parse() + return self._section + + @property + def anchor(self): + if not hasattr(self, "_anchor"): + self.parse() + return self._anchor + + def astext(self, onsite=None): + """Return a text representation of the link. + + @param onsite: if specified, present as a (possibly interwiki) link + from the given site; otherwise, present as an internal link on + the source site. + + """ + if onsite is None: + onsite = self.site + title = self.title + if self.namespace: + title = onsite.namespace(self.namespace) + ":" + title + if self.section: + title = title + "#" + self.section + if onsite == self.site: + return u'[[%s]]' % title + if onsite.family == self.site.family: + return u'[[%s:%s]]' % (self.site.code, title) + if self.site.family.name == self.site.code: + # use this form for sites like commons, where the + # code is the same as the family name + return u'[[%s:%s]]' % (self.site.code, + title) + return u'[[%s:%s:%s]]' % (self.site.family.name, + self.site.code, + title) + + def __str__(self): + return self.astext() + + def __cmp__(self, other): + """Test for equality and inequality of Link objects. + + Link objects are "equal" if and only if they are on the same site + and have the same normalized title, including section if any. + + Link objects are sortable by site, then namespace, then title. + + """ + if not isinstance(other, Link): + # especially, return -1 if other is None + return -1 + if not self.site == other.site: + return cmp(self.site, other.site) + if self.namespace != other.namespace: + return cmp(self.namespace, other.namespace) + return cmp(self.title, other.title) + + +# Utility functions for parsing page titles + +def html2unicode(text, ignore = []): + """Return text, replacing HTML entities by equivalent unicode characters.""" + # This regular expression will match any decimal and hexadecimal entity and + # also entities that might be named entities. 
+ entityR = re.compile( + r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));') + # These characters are Html-illegal, but sadly you *can* find some of + # these and converting them to unichr(decimal) is unsuitable + convertIllegalHtmlEntities = { + 128 : 8364, # € + 130 : 8218, # ‚ + 131 : 402, # ƒ + 132 : 8222, # „ + 133 : 8230, # … + 134 : 8224, # † + 135 : 8225, # ‡ + 136 : 710, # ˆ + 137 : 8240, # ‰ + 138 : 352, # Š + 139 : 8249, # ‹ + 140 : 338, # Œ + 142 : 381, # Ž + 145 : 8216, # ‘ + 146 : 8217, # ’ + 147 : 8220, # “ + 148 : 8221, # ” + 149 : 8226, # • + 150 : 8211, # – + 151 : 8212, # — + 152 : 732, # ˜ + 153 : 8482, # ™ + 154 : 353, # š + 155 : 8250, # › + 156 : 339, # œ + 158 : 382, # ž + 159 : 376 # Ÿ + } + #ensuring that illegal   and , which have no known values, + #don't get converted to unichr(129), unichr(141) or unichr(157) + ignore = set(ignore) | set([129, 141, 157]) + result = u'' + i = 0 + found = True + while found: + text = text[i:] + match = entityR.search(text) + if match: + unicodeCodepoint = None + if match.group('decimal'): + unicodeCodepoint = int(match.group('decimal')) + elif match.group('hex'): + unicodeCodepoint = int(match.group('hex'), 16) + elif match.group('name'): + name = match.group('name') + if htmlentitydefs.name2codepoint.has_key(name): + # We found a known HTML entity. + unicodeCodepoint = htmlentitydefs.name2codepoint[name] + result += text[:match.start()] + try: + unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint] + except KeyError: + pass + if unicodeCodepoint and unicodeCodepoint not in ignore: + result += unichr(unicodeCodepoint) + else: + # Leave the entity unchanged + result += text[match.start():match.end()] + i = match.end() + else: + result += text + found = False + return result + +def url2unicode(title, site, site2 = None): + """Convert url-encoded text to unicode using site's encoding. + + If site2 is provided, try its encodings as well. Uses the first encoding + that doesn't cause an error. + + """ + # create a list of all possible encodings for both hint sites + encList = [site.encoding()] + list(site.encodings()) + if site2 and site2 <> site: + encList.append(site2.encoding()) + encList += list(site2.encodings()) + firstException = None + # try to handle all encodings (will probably retry utf-8) + for enc in encList: + try: + t = title.encode(enc) + t = urllib.unquote(t) + return unicode(t, enc) + except UnicodeError, ex: + if not firstException: + firstException = ex + pass + # Couldn't convert, raise the original exception + raise firstException +
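For orientation, here is a minimal usage sketch of the ImagePage, Category and Link classes added in the page.py hunk above. It is illustrative only: it assumes a working user-config.py and a reachable default Site, and the file and category titles are made-up examples, not part of this commit.

# Illustrative sketch only: exercises the page.py API shown above.
# Assumes a configured user-config.py; all titles below are hypothetical.
import pywikibot

site = pywikibot.Site()                      # default site from user-config.py

# ImagePage: file URL, shared-repository check, pages embedding the file
image = pywikibot.ImagePage(site, u'Image:Example.jpg')
print image.fileUrl()
print image.fileIsShared()
for page in image.usingPages():
    print page.title()

# Category: bounded-recursion iteration and a category wikilink
cat = pywikibot.Category(site, u'Category:Physics')
for article in cat.articles(recurse=1):      # direct members plus first-level subcats
    print article.title()
print cat.aslink()                           # e.g. [[Category:Physics]]

# Link: lazy parsing into namespace, title, section and anchor text
link = pywikibot.Link(u'Category:Physics#History|see history', source=site)
print link.namespace, link.title, link.section, link.anchor

Note that fileUrl(), getFileSHA1Sum() and the version-history methods all route through Site.getimageinfo(), which the diff itself marks #FIXME, so that part of the API may still change in later revisions.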
Property changes on: branches/rewrite/pywikibot/page.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native
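Link.parse() above depends on the two title-normalization helpers defined at the end of page.py, html2unicode() and url2unicode(). A small hedged example of their effect follows; it assumes the default Site reports a UTF-8 encoding, and the input strings are invented for illustration.

# Illustrative sketch only: the normalization helpers from page.py above.
# Assumes a configured default Site whose encoding() is utf-8.
import pywikibot
from pywikibot.page import html2unicode, url2unicode

site = pywikibot.Site()

# Named, decimal and hexadecimal HTML entities become real unicode characters
print html2unicode(u'Caf&eacute; &#8211; se&#xF1;or')
# -> u'Caf\xe9 \u2013 se\xf1or'  (Café, an en dash, señor)

# Percent-encoded title text is decoded using the site's encoding(s)
print url2unicode(u'Caf%C3%A9', site=site)
# -> u'Caf\xe9'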
Modified: branches/rewrite/pywikibot/pagegenerators.py =================================================================== --- branches/rewrite/pywikibot/pagegenerators.py 2008-12-16 19:34:48 UTC (rev 6155) +++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-16 19:40:20 UTC (rev 6156) @@ -1,965 +1,965 @@ -# -*- coding: utf-8 -*- -"""This module offers a wide variety of page generators. A page generator is an -object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and -that yields page objects on which other scripts can then work. - -In general, there is no need to run this script directly. It can, however, -be run for testing purposes. It will then print the page titles to standard -output. - -These parameters are supported to specify which pages titles to print: - -¶ms; -""" -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id: $' - -import pywikibot - -import itertools -import Queue -import re -import sys -import threading - - -# ported from version 1 for backwards-compatibility -# most of these functions just wrap a Site or Page method that returns -# a generator - -parameterHelp = """\ --cat Work on all pages which are in a specific category. - Argument can also be given as "-cat:categoryname" or - as "-cat:categoryname|fromtitle". - --catr Like -cat, but also recursively includes pages in - subcategories, sub-subcategories etc. of the - given category. - Argument can also be given as "-catr:categoryname" or - as "-catr:categoryname|fromtitle". - --subcats Work on all subcategories of a specific category. - Argument can also be given as "-subcats:categoryname" or - as "-subcats:categoryname|fromtitle". - --subcatsr Like -subcats, but also includes sub-subcategories etc. of - the given category. - Argument can also be given as "-subcatsr:categoryname" or - as "-subcatsr:categoryname|fromtitle". - --uncat Work on all pages which are not categorised. - --uncatcat Work on all categories which are not categorised. - --uncatfiles Work on all files which are not categorised. - --file Read a list of pages to treat from the named text file. - Page titles in the file must be enclosed with [[brackets]]. - Argument can also be given as "-file:filename". - --filelinks Work on all pages that use a certain image/media file. - Argument can also be given as "-filelinks:filename". - --yahoo Work on all pages that are found in a Yahoo search. - Depends on python module pYsearch. See yahoo_appid in - config.py for instructions. - --search Work on all pages that are found in a MediaWiki search - across all namespaces. - --google Work on all pages that are found in a Google search. - You need a Google Web API license key. Note that Google - doesn't give out license keys anymore. See google_key in - config.py for instructions. - Argument can also be given as "-google:searchstring". - --interwiki Work on the given page and all equivalent pages in other - languages. This can, for example, be used to fight - multi-site spamming. - Attention: this will cause the bot to modify - pages on several wiki sites, this is not well tested, - so check your edits! - --links Work on all pages that are linked from a certain page. - Argument can also be given as "-links:linkingpagetitle". - --new Work on the 60 newest pages. If given as -new:x, will work - on the x newest pages. - --imagelinks Work on all images that are linked from a certain page. - Argument can also be given as "-imagelinks:linkingpagetitle". 
- --newimages Work on the 100 newest images. If given as -newimages:x, - will work on the x newest images. - --ref Work on all pages that link to a certain page. - Argument can also be given as "-ref:referredpagetitle". - --start Specifies that the robot should go alphabetically through - all pages on the home wiki, starting at the named page. - Argument can also be given as "-start:pagetitle". - - You can also include a namespace. For example, - "-start:Template:!" will make the bot work on all pages - in the template namespace. - --prefixindex Work on pages commencing with a common prefix. - --regex Obsolete, use -titleregex - --titleregex Work on titles that match the given regular expression. - --transcludes Work on all pages that use a certain template. - Argument can also be given as "-transcludes:Template:Title". - --unusedfiles Work on all description pages of images/media files that are - not used anywhere. - Argument can be given as "-unusedfiles:n" where - n is the maximum number of articles to work on. - --unwatched Work on all articles that are not watched by anyone. - Argument can be given as "-unwatched:n" where - n is the maximum number of articles to work on. - --usercontribs Work on all articles that were edited by a certain user : - Example : -usercontribs:DumZiBoT - --weblink Work on all articles that contain an external link to - a given URL; may be given as "-weblink:url" - --withoutinterwiki Work on all pages that don't have interlanguage links. - Argument can be given as "-withoutinterwiki:n" where - n is some number (??). -""" - -docuReplacements = {'¶ms;': parameterHelp} - -# if a bot uses GeneratorFactory, the module should include the line -# docuReplacements = {'¶ms;': pywikibot.pagegenerators.parameterHelp} -# and include the marker ¶ms; in the module's docstring - - -class GeneratorFactory(object): - """Process command line arguments and return appropriate page generator.""" - - def setCategoryGen(self, arg, length, recurse = False): - if len(arg) == length: - categoryname = pywikibot.input(u'Please enter the category name:') - else: - categoryname = arg[length + 1:] - - ind = categoryname.find('|') - if ind > 0: - startfrom = categoryname[ind + 1:] - categoryname = categoryname[:ind] - else: - startfrom = None - - cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname)) - return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse) - - def setSubCategoriesGen(self, arg, length, recurse=False): - if len(arg) == length: - categoryname = pywikibot.input(u'Please enter the category name:') - else: - categoryname = arg[length + 1:] - - ind = categoryname.find('|') - if ind > 0: - startfrom = categoryname[ind + 1:] - categoryname = categoryname[:ind] - else: - startfrom = None - - cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname)) - return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse) - - def handleArg(self, arg): - gen = None - if arg.startswith('-filelinks'): - fileLinksPageTitle = arg[11:] - if not fileLinksPageTitle: - fileLinksPageTitle = pywikibot.input( - u'Links to which image page should be processed?') - if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6) - + ":"): - fileLinksPage = pywikibot.ImagePage(pywikibot.Site(), - fileLinksPageTitle) - else: - fileLinksPage = pywikibot.ImagePage(pywikibot.Site(), - 'Image:' + - fileLinksPageTitle) - gen = FileLinksGenerator(fileLinksPage) - elif arg.startswith('-unusedfiles'): - if len(arg) == 12: - gen = UnusedFilesGenerator() - else: - 
gen = UnusedFilesGenerator(number = int(arg[13:])) - elif arg.startswith('-unwatched'): - if len(arg) == 10: - gen = UnwatchedPagesPageGenerator() - else: - gen = UnwatchedPagesPageGenerator(number = int(arg[11:])) - elif arg.startswith('-usercontribs'): - gen = UserContributionsGenerator(arg[14:]) - elif arg.startswith('-withoutinterwiki'): - if len(arg) == 17: - gen = WithoutInterwikiPageGenerator() - else: - gen = WithoutInterwikiPageGenerator(number = int(arg[18:])) - elif arg.startswith('-interwiki'): - title = arg[11:] - if not title: - title = pywikibot.input(u'Which page should be processed?') - page = pywikibot.Page(pywikibot.Site(), title) - gen = InterwikiPageGenerator(page) - elif arg.startswith('-file'): - textfilename = arg[6:] - if not textfilename: - textfilename = pywikibot.input( - u'Please enter the local file name:') - gen = TextfilePageGenerator(textfilename) - elif arg.startswith('-catr'): - gen = self.setCategoryGen(arg, 5, recurse = True) - elif arg.startswith('-cat'): - gen = self.setCategoryGen(arg, 4) - elif arg.startswith('-subcatsr'): - gen = self.setSubCategoriesGen(arg, 9, recurse = True) - elif arg.startswith('-subcats'): - gen = self.setSubCategoriesGen(arg, 8) - elif arg.startswith('-uncatfiles'): - gen = UnCategorizedImageGenerator() - elif arg.startswith('-uncatcat'): - gen = UnCategorizedCategoryGenerator() - elif arg.startswith('-uncat'): - gen = UnCategorizedPageGenerator() - elif arg.startswith('-ref'): - referredPageTitle = arg[5:] - if not referredPageTitle: - referredPageTitle = pywikibot.input( - u'Links to which page should be processed?') - referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle) - gen = ReferringPageGenerator(referredPage) - elif arg.startswith('-links'): - linkingPageTitle = arg[7:] - if not linkingPageTitle: - linkingPageTitle = pywikibot.input( - u'Links from which page should be processed?') - linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle) - gen = LinkedPageGenerator(linkingPage) - elif arg.startswith('-weblink'): - url = arg[9:] - if not url: - url = pywikibot.input( - u'Pages with which weblink should be processed?') - gen = LinksearchPageGenerator(url) - elif arg.startswith('-transcludes'): - transclusionPageTitle = arg[len('-transcludes:'):] - if not transclusionPageTitle: - transclusionPageTitle = pywikibot.input( - u'Pages that transclude which page should be processed?') - transclusionPage = pywikibot.Page(pywikibot.Site(), - 'Template:%s' % transclusionPageTitle) - gen = ReferringPageGenerator(transclusionPage, - onlyTemplateInclusion=True) - elif arg.startswith('-start'): - if arg.startswith('-startxml'): - pywikibot.output(u'-startxml : wrong parameter') - raise ValueError - firstPageTitle = arg[7:] - if not firstPageTitle: - firstPageTitle = pywikibot.input( - u'At which page do you want to start?') - namespace = pywikibot.Page(pywikibot.Site(), - firstPageTitle).namespace() - firstPageTitle = pywikibot.Page(pywikibot.link(firstPageTitle) - ).titleWithoutNamespace() - gen = AllpagesPageGenerator(firstPageTitle, namespace, - includeredirects=False) - elif arg.startswith('-prefixindex'): - prefix = arg[13:] - namespace = None - if not prefix: - prefix = pywikibot.input( - u'What page names are you looking for?') - gen = PrefixingPageGenerator(prefix=prefix) - elif arg.startswith('-newimages'): - limit = arg[11:] or pywikibot.input( - u'How many images do you want to load?') - gen = NewimagesPageGenerator(number=int(limit)) - elif arg.startswith('-new'): - if len(arg) >=5: - gen = 
NewpagesPageGenerator(number=int(arg[5:])) - else: - gen = NewpagesPageGenerator(number=60) - elif arg.startswith('-imagelinks'): - imagelinkstitle = arg[len('-imagelinks:'):] - if not imagelinkstitle: - imagelinkstitle = pywikibot.input( - u'Images on which page should be processed?') - imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle)) - gen = ImagesPageGenerator(imagelinksPage) - elif arg.startswith('-search'): - mediawikiQuery = arg[8:] - if not mediawikiQuery: - mediawikiQuery = pywikibot.input( - u'What do you want to search for?') - # In order to be useful, all namespaces are required - gen = SearchPageGenerator(mediawikiQuery, namespaces = []) - elif arg.startswith('-google'): - gen = GoogleSearchPageGenerator(arg[8:]) - elif arg.startswith('-titleregex'): - if len(arg) == 6: - regex = pywikibot.input( - u'What page names are you looking for?') - else: - regex = arg[7:] - gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex) - elif arg.startswith('-yahoo'): - gen = YahooSearchPageGenerator(arg[7:]) - else: - return None - # make sure all yielded pages are unique - gen = DuplicateFilterPageGenerator(gen) - return gen - - -class ThreadedGenerator(threading.Thread): - """Look-ahead generator class. - - Runs a generator in a separate thread and queues the results; can - be called like a regular generator. - - Subclasses should override self.generator, _not_ self.run - - Important: the generator thread will stop itself if the generator's - internal queue is exhausted; but, if the calling program does not use - all the generated values, it must call the generator's stop() method to - stop the background thread. Example usage: - - >>> gen = ThreadedGenerator(target=foo) - >>> try: - ... for data in gen: - ... do_work(data) - ... finally: - ... gen.stop() - - """ #NOT CURRENTLY USED: Intended for future development - - def __init__(self, group=None, target=None, name="GeneratorThread", - args=(), kwargs=None, qsize=65536): - """Constructor. Takes same keyword arguments as threading.Thread. - - target must be a generator function (or other callable that returns - an iterable object). - - @param qsize: The size of the lookahead queue. The larger the qsize, - the more values will be computed in advance of use (which can eat - up memory and processor time). - @type qsize: int - - """ - if kwargs is None: - kwargs = {} - if target: - self.generator = target - if not hasattr(self, "generator"): - raise RuntimeError("No generator for ThreadedGenerator to run.") - self.args, self.kwargs = args, kwargs - threading.Thread.__init__(self, group=group, name=name) - self.queue = Queue.Queue(qsize) - self.finished = threading.Event() - - def __iter__(self): - """Iterate results from the queue.""" - if not self.isAlive() and not self.finished.isSet(): - self.start() - # if there is an item in the queue, yield it, otherwise wait - while not self.finished.isSet(): - try: - yield self.queue.get(True, 0.25) - except Queue.Empty: - pass - except KeyboardInterrupt: - self.stop() - - def stop(self): - """Stop the background thread.""" -## if not self.finished.isSet(): -## pywikibot.output("DEBUG: signalling %s to stop." % self) - self.finished.set() - - def run(self): - """Run the generator and store the results on the queue.""" - self.__gen = self.generator(*self.args, **self.kwargs) - for result in self.__gen: - while True: - if self.finished.isSet(): -## pywikibot.output("DEBUG: %s received stop signal." 
% self) - return - try: - self.queue.put_nowait(result) - except Queue.Full: - time.sleep(0.25) - continue - break - # wait for queue to be emptied, then kill the thread - while not self.finished.isSet() and not self.queue.empty(): - time.sleep(0.25) - self.stop() -## pywikibot.output("DEBUG: %s stopped because generator exhausted." % self) - - -def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True, - site=None): - """ - Using the Allpages special page, retrieve all articles' titles, and yield - page objects. - If includeredirects is False, redirects are not included. If - includeredirects equals the string 'only', only redirects are added. - """ - if site is None: - site = pywikibot.getSite() - if includeredirects: - if includeredirects == 'only': - filterredir = True - else: - filterredir = None - else: - filterredir = False - return site.allpages(start=start, namespace=namespace, - filterredir=filterredir) - - -def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True, - site=None): - if site is None: - site = pywikibot.Site() - page = pywikibot.Page(site, prefix) - if namespace is None: - namespace = page.namespace() - title = page.titleWithoutNamespace() - if includeredirects: - if includeredirects == 'only': - filterredir = True - else: - filterredir = None - else: - filterredir = False - return site.allpages(prefix=title, namespace=namespace, - filterredir=filterredir) - - -def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False, - site=None): - # API does not (yet) have a newpages function, so this tries to duplicate - # it by filtering the recentchanges output - # defaults to namespace 0 because that's how Special:Newpages defaults - if site is None: - site = pywikibot.Site() - return site.recentchanges(limit=number, showredirects=get_redirect, - changetype="new", namespaces=0) - - -def FileLinksGenerator(referredImagePage): - return referredImagePage.usingPages() - - -def ImagesPageGenerator(pageWithImages): - return pageWithImages.imagelinks() - - -def InterwikiPageGenerator(page): - """Iterator over all interwiki (non-language) links on a page.""" - for link in page.interwiki(): - yield pywikibot.Page(link) - - -def LanguageLinksPageGenerator(page): - """Iterator over all interwiki language links on a page.""" - for link in page.langlinks(): - yield pywikibot.Page(link) - - -def ReferringPageGenerator(referredPage, followRedirects=False, - withTemplateInclusion=True, - onlyTemplateInclusion=False): - '''Yields all pages referring to a specific page.''' - return referredPage.getReferences( - follow_redirects=followRedirects, - withTemplateInclusion=withTemplateInclusion, - onlyTemplateInclusion=onlyTemplateInclusion) - - -def CategorizedPageGenerator(category, recurse=False, start=None): - '''Yield all pages in a specific category. - - If recurse is True, pages in subcategories are included as well; if - recurse is an int, only subcategories to that depth will be included - (e.g., recurse=2 will get pages in subcats and sub-subcats, but will - not go any further). - If start is a string value, only pages whose sortkey comes after start - alphabetically are included. - - ''' # TODO: page generator could be modified to use cmstartsortkey ... - for a in category.articles(recurse=recurse): - if start is None or a.title(withNamespace=False) >= start: - yield a - - -def SubCategoriesPageGenerator(category, recurse=False, start=None): - '''Yields all subcategories in a specific category. 
- - If recurse is True, pages in subcategories are included as well; if - recurse is an int, only subcategories to that depth will be included - (e.g., recurse=2 will get pages in subcats and sub-subcats, but will - not go any further). - If start is a string value, only categories whose sortkey comes after - start alphabetically are included. - - ''' # TODO: page generator could be modified to use cmstartsortkey ... - for s in category.subcategories(recurse=recurse): - if start is None or s.title(withNamespace=False) >= start: - yield s - - -def LinkedPageGenerator(linkingPage): - """Yields all pages linked from a specific page.""" - return linkingPage.linkedPages() - - -def TextfilePageGenerator(filename=None, site=None): - """Iterate pages from a list in a text file. - - The file must contain page links between double-square-brackets. The - generator will yield each corresponding Page object. - - @param filename: the name of the file that should be read. If no name is - given, the generator prompts the user. - @param site: the default Site for which Page objects should be created - - """ - if filename is None: - filename = pywikibot.input(u'Please enter the filename:') - if site is None: - site = pywikibot.Site() - f = codecs.open(filename, 'r', config.textfile_encoding) - for linkmatch in pywikibot.link_regex.finditer(f.read()): - # If the link is in interwiki format, the Page object may reside - # on a different Site than the default. - # This makes it possible to work on different wikis using a single - # text file, but also could be dangerous because you might - # inadvertently change pages on another wiki! - yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site)) - f.close() - - -def PagesFromTitlesGenerator(iterable, site=None): - """Generate pages from the titles (unicode strings) yielded by iterable.""" - if site is None: - site = pywikibot.Site() - for title in iterable: - if not isinstance(title, basestring): - break - yield pywikibot.Page(pywikibot.Link(title, site)) - - -def UserContributionsGenerator(username, number=250, namespaces=None, - site=None): - """Yields number unique pages edited by user:username - namespaces : list of namespace numbers to fetch contribs from - - """ - if site is None: - site = pywikibot.Site() - return site.usercontribs(user=username, limit=number, namespaces=namespaces) - - -def NamespaceFilterPageGenerator(generator, namespaces, site=None): - """ - Wraps around another generator. Yields only those pages that are in one - of the given namespaces. - - The namespace list can contain both integers (namespace numbers) and - strings/unicode strings (namespace names). 
- - """ - if site is None: - site = pywikibot.Site() - # convert namespace names to namespace numbers - for i in xrange(len(namespaces)): - ns = namespaces[i] - if isinstance(ns, basestring): - index = site.getNamespaceIndex(ns) - if index is None: - raise ValueError(u'Unknown namespace: %s' % ns) - namespaces[i] = index - for page in generator: - if page.namespace() in namespaces: - yield page - - -def RedirectFilterPageGenerator(generator): - """Yields pages from another generator that are not redirects.""" - for page in generator: - if not page.isRedirectPage(): - yield page - - -def DuplicateFilterPageGenerator(generator): - """Yield all unique pages from another generator, omitting duplicates.""" - seenPages = {} - for page in generator: - if page not in seenPages: - seenPages[page] = None - yield page - - -def RegexFilterPageGenerator(generator, regex): - """Yield pages from another generator whose titles match regex.""" - reg = re.compile(regex, re.I) - for page in generator: - if reg.match(page.titleWithoutNamespace()): - yield page - - -def CombinedPageGenerator(generators): - return itertools.chain(*generators) - - -def CategoryGenerator(generator): - """Yield pages from another generator as Category objects. - - Makes sense only if it is ascertained that only categories are being - retrieved. - - """ - for page in generator: - yield pywikibot.Category(page) - - -def PageWithTalkPageGenerator(generator): - """ - Wraps around another generator. Yields the same pages, but for non-talk - pages, it also includes associated talk pages. - This generator does not check if the talk page in fact exists. - """ - for page in generator: - yield page - if not page.isTalkPage(): - yield page.toggleTalkPage() - - -def PreloadingGenerator(generator, pageNumber=60, lookahead=10): - """Yield preloaded pages taken from another generator.""" - - # pages may be on more than one site, for example if an interwiki - # generator is used, so use a separate preloader for each site - sites = {} - # build a list of pages for each site found in the iterator - for page in generator: - sites.setdefault(page.site(), []).append(page) - return itertools.chain(*(site.preloadpages(sites[site], pageNumber) - for site in sites)) - - -#TODO below - -def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None): - if site is None: - site = pywikibot.Site() - for page in site.unusedfiles(number=number, repeat=repeat, - extension=extension): - yield pywikibot.ImagePage(page.site(), page.title()) - -def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None): - if site is None: - site = pywikibot.Site() - for page in site.withoutinterwiki(number=number, repeat=repeat): - yield page - -def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.uncategorizedcategories(number=number, repeat=repeat): - yield page - -def UnCategorizedImageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.uncategorizedimages(number=number, repeat=repeat): - yield page - -def NewimagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.newimages(number, repeat=repeat): - yield page[0] - -def UnCategorizedPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.uncategorizedpages(number=number, repeat=repeat): - yield page - -def 
LonelyPagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.lonelypages(number=number, repeat=repeat): - yield page - -def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.unwatchedpages(number=number, repeat=repeat): - yield page - -def AncientPagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.ancientpages(number=number, repeat=repeat): - yield page[0] - -def DeadendPagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.deadendpages(number=number, repeat=repeat): - yield page - -def LongPagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.longpages(number=number, repeat=repeat): - yield page[0] - -def ShortPagesPageGenerator(number = 100, repeat = False, site = None): - if site is None: - site = pywikibot.Site() - for page in site.shortpages(number=number, repeat=repeat): - yield page[0] - -def LinksearchPageGenerator(link, step=500, site=None): - """Yields all pages that include a specified link, according to - [[Special:Linksearch]]. - - """ - if site is None: - site = pywikibot.Site() - for page in site.linksearch(link, limit=step): - yield page - -def SearchPageGenerator(query, number = 100, namespaces = None, site = None): - """ - Provides a list of results using the internal MediaWiki search engine - """ - if site is None: - site = pywikibot.Site() - for page in site.search(query, number=number, namespaces = namespaces): - yield page[0] - -class YahooSearchPageGenerator: - ''' - To use this generator, install pYsearch - ''' - def __init__(self, query = None, count = 100, site = None): # values larger than 100 fail - self.query = query or pywikibot.input(u'Please enter the search query:') - self.count = count - if site is None: - site = pywikibot.Site() - self.site = site - - def queryYahoo(self, query): - from yahoo.search.web import WebSearch - srch = WebSearch(config.yahoo_appid, query=query, results=self.count) - - dom = srch.get_results() - results = srch.parse_results(dom) - for res in results: - url = res.Url - yield url - - def __iter__(self): - # restrict query to local site - localQuery = '%s site:%s' % (self.query, self.site.hostname()) - base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address('')) - for url in self.queryYahoo(localQuery): - if url[:len(base)] == base: - title = url[len(base):] - page = pywikibot.Page(self.site, title) - yield page - -class GoogleSearchPageGenerator: - ''' - To use this generator, you must install the pyGoogle module from - http://pygoogle.sf.net/ and get a Google Web API license key from - http://www.google.com/apis/index.html . The google_key must be set to your - license key in your configuration. 
- ''' - def __init__(self, query = None, site = None): - self.query = query or pywikibot.input(u'Please enter the search query:') - if site is None: - site = pywikibot.Site() - self.site = site - - ######### - # partially commented out because it is probably not in compliance with Google's "Terms of - # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US) - def queryGoogle(self, query): - #if config.google_key: - if True: - #try: - for url in self.queryViaSoapApi(query): - yield url - return - #except ImportError: - #pass - # No google license key, or pygoogle not installed. Do it the ugly way. - #for url in self.queryViaWeb(query): - # yield url - - def queryViaSoapApi(self, query): - import google - google.LICENSE_KEY = config.google_key - offset = 0 - estimatedTotalResultsCount = None - while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount: - while (True): - # Google often yields 502 errors. - try: - pywikibot.output(u'Querying Google, offset %i' % offset) - data = google.doGoogleSearch(query, start = offset, filter = False) - break - except KeyboardInterrupt: - raise - except: - # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway) - # can happen here, depending on the module used. It's not easy - # to catch this properly because pygoogle decides which one of - # the soap modules to use. - pywikibot.output(u"An error occured. Retrying in 10 seconds...") - time.sleep(10) - continue - - for result in data.results: - #print 'DBG: ', result.URL - yield result.URL - # give an estimate of pages to work on, but only once. - if not estimatedTotalResultsCount: - pywikibot.output(u'Estimated total result count: %i pages.' % data.meta.estimatedTotalResultsCount) - estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount - #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount - offset += 10 - - ######### - # commented out because it is probably not in compliance with Google's "Terms of - # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US) - - #def queryViaWeb(self, query): - #""" - #Google has stopped giving out API license keys, and sooner or later - #they will probably shut down the service. - #This is a quick and ugly solution: we just grab the search results from - #the normal web interface. - #""" - #linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE) - #offset = 0 - - #while True: - #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1)) - #address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(query), offset) - ## we fake being Firefox because Google blocks unknown browsers - #request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5'}) - #urlfile = urllib2.urlopen(request) - #page = urlfile.read() - #urlfile.close() - #for url in linkR.findall(page): - #yield url - #if "<div id=nn>" in page: # Is there a "Next" link for next page of results? - #offset += 100 # Yes, go to next page of results. 
- #else: - #return - ######### - - def __iter__(self): - # restrict query to local site - localQuery = '%s site:%s' % (self.query, self.site.hostname()) - base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address('')) - for url in self.queryGoogle(localQuery): - if url[:len(base)] == base: - title = url[len(base):] - page = pywikibot.Page(self.site, title) - # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar - if page.site() == self.site: - yield page - -def MySQLPageGenerator(query, site = None): - import MySQLdb as mysqldb - if site is None: - site = pywikibot.Site() - conn = mysqldb.connect(config.db_hostname, db = site.dbName(), - user = config.db_username, - passwd = config.db_password) - cursor = conn.cursor() - pywikibot.output(u'Executing query:\n%s' % query) - query = query.encode(site.encoding()) - cursor.execute(query) - while True: - try: - namespaceNumber, pageName = cursor.fetchone() - print namespaceNumber, pageName - except TypeError: - # Limit reached or no more results - break - #print pageName - if pageName: - namespace = site.namespace(namespaceNumber) - pageName = unicode(pageName, site.encoding()) - if namespace: - pageTitle = '%s:%s' % (namespace, pageName) - else: - pageTitle = pageName - page = pywikibot.Page(site, pageTitle) - yield page - -def YearPageGenerator(start = 1, end = 2050, site = None): - if site is None: - site = pywikibot.Site() - pywikibot.output(u"Starting with year %i" % start) - for i in xrange(start, end + 1): - if i % 100 == 0: - pywikibot.output(u'Preparing %i...' % i) - # There is no year 0 - if i != 0: - current_year = date.formatYear(site.lang, i ) - yield pywikibot.Page(site, current_year) - -def DayPageGenerator(startMonth = 1, endMonth = 12, site = None): - if site is None: - site = pywikibot.Site() - fd = date.FormatDate(site) - firstPage = pywikibot.Page(site, fd(startMonth, 1)) - pywikibot.output(u"Starting with %s" % firstPage.aslink()) - for month in xrange(startMonth, endMonth+1): - for day in xrange(1, date.getNumberOfDaysInMonth(month)+1): - yield pywikibot.Page(site, fd(month, day)) - - -if __name__ == "__main__": - try: - gen = None - genFactory = GeneratorFactory() - for arg in pywikibot.handleArgs(): - generator = genFactory.handleArg(arg) - if generator: - gen = generator - if gen: - for page in gen: - pywikibot.output(page.title(), toStdout = True) - else: - pywikibot.showHelp() - finally: - pywikibot.stopme() +# -*- coding: utf-8 -*- +"""This module offers a wide variety of page generators. A page generator is an +object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and +that yields page objects on which other scripts can then work. + +In general, there is no need to run this script directly. It can, however, +be run for testing purposes. It will then print the page titles to standard +output. + +These parameters are supported to specify which pages titles to print: + +¶ms; +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import pywikibot + +import itertools +import Queue +import re +import sys +import threading + + +# ported from version 1 for backwards-compatibility +# most of these functions just wrap a Site or Page method that returns +# a generator + +parameterHelp = """\ +-cat Work on all pages which are in a specific category. + Argument can also be given as "-cat:categoryname" or + as "-cat:categoryname|fromtitle". 
+ +-catr Like -cat, but also recursively includes pages in + subcategories, sub-subcategories etc. of the + given category. + Argument can also be given as "-catr:categoryname" or + as "-catr:categoryname|fromtitle". + +-subcats Work on all subcategories of a specific category. + Argument can also be given as "-subcats:categoryname" or + as "-subcats:categoryname|fromtitle". + +-subcatsr Like -subcats, but also includes sub-subcategories etc. of + the given category. + Argument can also be given as "-subcatsr:categoryname" or + as "-subcatsr:categoryname|fromtitle". + +-uncat Work on all pages which are not categorised. + +-uncatcat Work on all categories which are not categorised. + +-uncatfiles Work on all files which are not categorised. + +-file Read a list of pages to treat from the named text file. + Page titles in the file must be enclosed with [[brackets]]. + Argument can also be given as "-file:filename". + +-filelinks Work on all pages that use a certain image/media file. + Argument can also be given as "-filelinks:filename". + +-yahoo Work on all pages that are found in a Yahoo search. + Depends on python module pYsearch. See yahoo_appid in + config.py for instructions. + +-search Work on all pages that are found in a MediaWiki search + across all namespaces. + +-google Work on all pages that are found in a Google search. + You need a Google Web API license key. Note that Google + doesn't give out license keys anymore. See google_key in + config.py for instructions. + Argument can also be given as "-google:searchstring". + +-interwiki Work on the given page and all equivalent pages in other + languages. This can, for example, be used to fight + multi-site spamming. + Attention: this will cause the bot to modify + pages on several wiki sites, this is not well tested, + so check your edits! + +-links Work on all pages that are linked from a certain page. + Argument can also be given as "-links:linkingpagetitle". + +-new Work on the 60 newest pages. If given as -new:x, will work + on the x newest pages. + +-imagelinks Work on all images that are linked from a certain page. + Argument can also be given as "-imagelinks:linkingpagetitle". + +-newimages Work on the 100 newest images. If given as -newimages:x, + will work on the x newest images. + +-ref Work on all pages that link to a certain page. + Argument can also be given as "-ref:referredpagetitle". + +-start Specifies that the robot should go alphabetically through + all pages on the home wiki, starting at the named page. + Argument can also be given as "-start:pagetitle". + + You can also include a namespace. For example, + "-start:Template:!" will make the bot work on all pages + in the template namespace. + +-prefixindex Work on pages commencing with a common prefix. + +-regex Obsolete, use -titleregex + +-titleregex Work on titles that match the given regular expression. + +-transcludes Work on all pages that use a certain template. + Argument can also be given as "-transcludes:Template:Title". + +-unusedfiles Work on all description pages of images/media files that are + not used anywhere. + Argument can be given as "-unusedfiles:n" where + n is the maximum number of articles to work on. + +-unwatched Work on all articles that are not watched by anyone. + Argument can be given as "-unwatched:n" where + n is the maximum number of articles to work on. 
+ +-usercontribs Work on all articles that were edited by a certain user : + Example : -usercontribs:DumZiBoT + +-weblink Work on all articles that contain an external link to + a given URL; may be given as "-weblink:url" + +-withoutinterwiki Work on all pages that don't have interlanguage links. + Argument can be given as "-withoutinterwiki:n" where + n is some number (??). +""" + +docuReplacements = {'¶ms;': parameterHelp} + +# if a bot uses GeneratorFactory, the module should include the line +# docuReplacements = {'¶ms;': pywikibot.pagegenerators.parameterHelp} +# and include the marker ¶ms; in the module's docstring + + +class GeneratorFactory(object): + """Process command line arguments and return appropriate page generator.""" + + def setCategoryGen(self, arg, length, recurse = False): + if len(arg) == length: + categoryname = pywikibot.input(u'Please enter the category name:') + else: + categoryname = arg[length + 1:] + + ind = categoryname.find('|') + if ind > 0: + startfrom = categoryname[ind + 1:] + categoryname = categoryname[:ind] + else: + startfrom = None + + cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname)) + return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse) + + def setSubCategoriesGen(self, arg, length, recurse=False): + if len(arg) == length: + categoryname = pywikibot.input(u'Please enter the category name:') + else: + categoryname = arg[length + 1:] + + ind = categoryname.find('|') + if ind > 0: + startfrom = categoryname[ind + 1:] + categoryname = categoryname[:ind] + else: + startfrom = None + + cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname)) + return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse) + + def handleArg(self, arg): + gen = None + if arg.startswith('-filelinks'): + fileLinksPageTitle = arg[11:] + if not fileLinksPageTitle: + fileLinksPageTitle = pywikibot.input( + u'Links to which image page should be processed?') + if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6) + + ":"): + fileLinksPage = pywikibot.ImagePage(pywikibot.Site(), + fileLinksPageTitle) + else: + fileLinksPage = pywikibot.ImagePage(pywikibot.Site(), + 'Image:' + + fileLinksPageTitle) + gen = FileLinksGenerator(fileLinksPage) + elif arg.startswith('-unusedfiles'): + if len(arg) == 12: + gen = UnusedFilesGenerator() + else: + gen = UnusedFilesGenerator(number = int(arg[13:])) + elif arg.startswith('-unwatched'): + if len(arg) == 10: + gen = UnwatchedPagesPageGenerator() + else: + gen = UnwatchedPagesPageGenerator(number = int(arg[11:])) + elif arg.startswith('-usercontribs'): + gen = UserContributionsGenerator(arg[14:]) + elif arg.startswith('-withoutinterwiki'): + if len(arg) == 17: + gen = WithoutInterwikiPageGenerator() + else: + gen = WithoutInterwikiPageGenerator(number = int(arg[18:])) + elif arg.startswith('-interwiki'): + title = arg[11:] + if not title: + title = pywikibot.input(u'Which page should be processed?') + page = pywikibot.Page(pywikibot.Site(), title) + gen = InterwikiPageGenerator(page) + elif arg.startswith('-file'): + textfilename = arg[6:] + if not textfilename: + textfilename = pywikibot.input( + u'Please enter the local file name:') + gen = TextfilePageGenerator(textfilename) + elif arg.startswith('-catr'): + gen = self.setCategoryGen(arg, 5, recurse = True) + elif arg.startswith('-cat'): + gen = self.setCategoryGen(arg, 4) + elif arg.startswith('-subcatsr'): + gen = self.setSubCategoriesGen(arg, 9, recurse = True) + elif arg.startswith('-subcats'): + gen = 
self.setSubCategoriesGen(arg, 8) + elif arg.startswith('-uncatfiles'): + gen = UnCategorizedImageGenerator() + elif arg.startswith('-uncatcat'): + gen = UnCategorizedCategoryGenerator() + elif arg.startswith('-uncat'): + gen = UnCategorizedPageGenerator() + elif arg.startswith('-ref'): + referredPageTitle = arg[5:] + if not referredPageTitle: + referredPageTitle = pywikibot.input( + u'Links to which page should be processed?') + referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle) + gen = ReferringPageGenerator(referredPage) + elif arg.startswith('-links'): + linkingPageTitle = arg[7:] + if not linkingPageTitle: + linkingPageTitle = pywikibot.input( + u'Links from which page should be processed?') + linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle) + gen = LinkedPageGenerator(linkingPage) + elif arg.startswith('-weblink'): + url = arg[9:] + if not url: + url = pywikibot.input( + u'Pages with which weblink should be processed?') + gen = LinksearchPageGenerator(url) + elif arg.startswith('-transcludes'): + transclusionPageTitle = arg[len('-transcludes:'):] + if not transclusionPageTitle: + transclusionPageTitle = pywikibot.input( + u'Pages that transclude which page should be processed?') + transclusionPage = pywikibot.Page(pywikibot.Site(), + 'Template:%s' % transclusionPageTitle) + gen = ReferringPageGenerator(transclusionPage, + onlyTemplateInclusion=True) + elif arg.startswith('-start'): + if arg.startswith('-startxml'): + pywikibot.output(u'-startxml : wrong parameter') + raise ValueError + firstPageTitle = arg[7:] + if not firstPageTitle: + firstPageTitle = pywikibot.input( + u'At which page do you want to start?') + namespace = pywikibot.Page(pywikibot.Site(), + firstPageTitle).namespace() + firstPageTitle = pywikibot.Page(pywikibot.link(firstPageTitle) + ).titleWithoutNamespace() + gen = AllpagesPageGenerator(firstPageTitle, namespace, + includeredirects=False) + elif arg.startswith('-prefixindex'): + prefix = arg[13:] + namespace = None + if not prefix: + prefix = pywikibot.input( + u'What page names are you looking for?') + gen = PrefixingPageGenerator(prefix=prefix) + elif arg.startswith('-newimages'): + limit = arg[11:] or pywikibot.input( + u'How many images do you want to load?') + gen = NewimagesPageGenerator(number=int(limit)) + elif arg.startswith('-new'): + if len(arg) >=5: + gen = NewpagesPageGenerator(number=int(arg[5:])) + else: + gen = NewpagesPageGenerator(number=60) + elif arg.startswith('-imagelinks'): + imagelinkstitle = arg[len('-imagelinks:'):] + if not imagelinkstitle: + imagelinkstitle = pywikibot.input( + u'Images on which page should be processed?') + imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle)) + gen = ImagesPageGenerator(imagelinksPage) + elif arg.startswith('-search'): + mediawikiQuery = arg[8:] + if not mediawikiQuery: + mediawikiQuery = pywikibot.input( + u'What do you want to search for?') + # In order to be useful, all namespaces are required + gen = SearchPageGenerator(mediawikiQuery, namespaces = []) + elif arg.startswith('-google'): + gen = GoogleSearchPageGenerator(arg[8:]) + elif arg.startswith('-titleregex'): + if len(arg) == 6: + regex = pywikibot.input( + u'What page names are you looking for?') + else: + regex = arg[7:] + gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex) + elif arg.startswith('-yahoo'): + gen = YahooSearchPageGenerator(arg[7:]) + else: + return None + # make sure all yielded pages are unique + gen = DuplicateFilterPageGenerator(gen) + return gen + + +class 
ThreadedGenerator(threading.Thread): + """Look-ahead generator class. + + Runs a generator in a separate thread and queues the results; can + be called like a regular generator. + + Subclasses should override self.generator, _not_ self.run + + Important: the generator thread will stop itself if the generator's + internal queue is exhausted; but, if the calling program does not use + all the generated values, it must call the generator's stop() method to + stop the background thread. Example usage: + + >>> gen = ThreadedGenerator(target=foo) + >>> try: + ... for data in gen: + ... do_work(data) + ... finally: + ... gen.stop() + + """ #NOT CURRENTLY USED: Intended for future development + + def __init__(self, group=None, target=None, name="GeneratorThread", + args=(), kwargs=None, qsize=65536): + """Constructor. Takes same keyword arguments as threading.Thread. + + target must be a generator function (or other callable that returns + an iterable object). + + @param qsize: The size of the lookahead queue. The larger the qsize, + the more values will be computed in advance of use (which can eat + up memory and processor time). + @type qsize: int + + """ + if kwargs is None: + kwargs = {} + if target: + self.generator = target + if not hasattr(self, "generator"): + raise RuntimeError("No generator for ThreadedGenerator to run.") + self.args, self.kwargs = args, kwargs + threading.Thread.__init__(self, group=group, name=name) + self.queue = Queue.Queue(qsize) + self.finished = threading.Event() + + def __iter__(self): + """Iterate results from the queue.""" + if not self.isAlive() and not self.finished.isSet(): + self.start() + # if there is an item in the queue, yield it, otherwise wait + while not self.finished.isSet(): + try: + yield self.queue.get(True, 0.25) + except Queue.Empty: + pass + except KeyboardInterrupt: + self.stop() + + def stop(self): + """Stop the background thread.""" +## if not self.finished.isSet(): +## pywikibot.output("DEBUG: signalling %s to stop." % self) + self.finished.set() + + def run(self): + """Run the generator and store the results on the queue.""" + self.__gen = self.generator(*self.args, **self.kwargs) + for result in self.__gen: + while True: + if self.finished.isSet(): +## pywikibot.output("DEBUG: %s received stop signal." % self) + return + try: + self.queue.put_nowait(result) + except Queue.Full: + time.sleep(0.25) + continue + break + # wait for queue to be emptied, then kill the thread + while not self.finished.isSet() and not self.queue.empty(): + time.sleep(0.25) + self.stop() +## pywikibot.output("DEBUG: %s stopped because generator exhausted." % self) + + +def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True, + site=None): + """ + Using the Allpages special page, retrieve all articles' titles, and yield + page objects. + If includeredirects is False, redirects are not included. If + includeredirects equals the string 'only', only redirects are added. 
+ """ + if site is None: + site = pywikibot.getSite() + if includeredirects: + if includeredirects == 'only': + filterredir = True + else: + filterredir = None + else: + filterredir = False + return site.allpages(start=start, namespace=namespace, + filterredir=filterredir) + + +def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True, + site=None): + if site is None: + site = pywikibot.Site() + page = pywikibot.Page(site, prefix) + if namespace is None: + namespace = page.namespace() + title = page.titleWithoutNamespace() + if includeredirects: + if includeredirects == 'only': + filterredir = True + else: + filterredir = None + else: + filterredir = False + return site.allpages(prefix=title, namespace=namespace, + filterredir=filterredir) + + +def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False, + site=None): + # API does not (yet) have a newpages function, so this tries to duplicate + # it by filtering the recentchanges output + # defaults to namespace 0 because that's how Special:Newpages defaults + if site is None: + site = pywikibot.Site() + return site.recentchanges(limit=number, showredirects=get_redirect, + changetype="new", namespaces=0) + + +def FileLinksGenerator(referredImagePage): + return referredImagePage.usingPages() + + +def ImagesPageGenerator(pageWithImages): + return pageWithImages.imagelinks() + + +def InterwikiPageGenerator(page): + """Iterator over all interwiki (non-language) links on a page.""" + for link in page.interwiki(): + yield pywikibot.Page(link) + + +def LanguageLinksPageGenerator(page): + """Iterator over all interwiki language links on a page.""" + for link in page.langlinks(): + yield pywikibot.Page(link) + + +def ReferringPageGenerator(referredPage, followRedirects=False, + withTemplateInclusion=True, + onlyTemplateInclusion=False): + '''Yields all pages referring to a specific page.''' + return referredPage.getReferences( + follow_redirects=followRedirects, + withTemplateInclusion=withTemplateInclusion, + onlyTemplateInclusion=onlyTemplateInclusion) + + +def CategorizedPageGenerator(category, recurse=False, start=None): + '''Yield all pages in a specific category. + + If recurse is True, pages in subcategories are included as well; if + recurse is an int, only subcategories to that depth will be included + (e.g., recurse=2 will get pages in subcats and sub-subcats, but will + not go any further). + If start is a string value, only pages whose sortkey comes after start + alphabetically are included. + + ''' # TODO: page generator could be modified to use cmstartsortkey ... + for a in category.articles(recurse=recurse): + if start is None or a.title(withNamespace=False) >= start: + yield a + + +def SubCategoriesPageGenerator(category, recurse=False, start=None): + '''Yields all subcategories in a specific category. + + If recurse is True, pages in subcategories are included as well; if + recurse is an int, only subcategories to that depth will be included + (e.g., recurse=2 will get pages in subcats and sub-subcats, but will + not go any further). + If start is a string value, only categories whose sortkey comes after + start alphabetically are included. + + ''' # TODO: page generator could be modified to use cmstartsortkey ... 
+ for s in category.subcategories(recurse=recurse): + if start is None or s.title(withNamespace=False) >= start: + yield s + + +def LinkedPageGenerator(linkingPage): + """Yields all pages linked from a specific page.""" + return linkingPage.linkedPages() + + +def TextfilePageGenerator(filename=None, site=None): + """Iterate pages from a list in a text file. + + The file must contain page links between double-square-brackets. The + generator will yield each corresponding Page object. + + @param filename: the name of the file that should be read. If no name is + given, the generator prompts the user. + @param site: the default Site for which Page objects should be created + + """ + if filename is None: + filename = pywikibot.input(u'Please enter the filename:') + if site is None: + site = pywikibot.Site() + f = codecs.open(filename, 'r', config.textfile_encoding) + for linkmatch in pywikibot.link_regex.finditer(f.read()): + # If the link is in interwiki format, the Page object may reside + # on a different Site than the default. + # This makes it possible to work on different wikis using a single + # text file, but also could be dangerous because you might + # inadvertently change pages on another wiki! + yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site)) + f.close() + + +def PagesFromTitlesGenerator(iterable, site=None): + """Generate pages from the titles (unicode strings) yielded by iterable.""" + if site is None: + site = pywikibot.Site() + for title in iterable: + if not isinstance(title, basestring): + break + yield pywikibot.Page(pywikibot.Link(title, site)) + + +def UserContributionsGenerator(username, number=250, namespaces=None, + site=None): + """Yields number unique pages edited by user:username + namespaces : list of namespace numbers to fetch contribs from + + """ + if site is None: + site = pywikibot.Site() + return site.usercontribs(user=username, limit=number, namespaces=namespaces) + + +def NamespaceFilterPageGenerator(generator, namespaces, site=None): + """ + Wraps around another generator. Yields only those pages that are in one + of the given namespaces. + + The namespace list can contain both integers (namespace numbers) and + strings/unicode strings (namespace names). + + """ + if site is None: + site = pywikibot.Site() + # convert namespace names to namespace numbers + for i in xrange(len(namespaces)): + ns = namespaces[i] + if isinstance(ns, basestring): + index = site.getNamespaceIndex(ns) + if index is None: + raise ValueError(u'Unknown namespace: %s' % ns) + namespaces[i] = index + for page in generator: + if page.namespace() in namespaces: + yield page + + +def RedirectFilterPageGenerator(generator): + """Yields pages from another generator that are not redirects.""" + for page in generator: + if not page.isRedirectPage(): + yield page + + +def DuplicateFilterPageGenerator(generator): + """Yield all unique pages from another generator, omitting duplicates.""" + seenPages = {} + for page in generator: + if page not in seenPages: + seenPages[page] = None + yield page + + +def RegexFilterPageGenerator(generator, regex): + """Yield pages from another generator whose titles match regex.""" + reg = re.compile(regex, re.I) + for page in generator: + if reg.match(page.titleWithoutNamespace()): + yield page + + +def CombinedPageGenerator(generators): + return itertools.chain(*generators) + + +def CategoryGenerator(generator): + """Yield pages from another generator as Category objects. 
+ + Makes sense only if it is ascertained that only categories are being + retrieved. + + """ + for page in generator: + yield pywikibot.Category(page) + + +def PageWithTalkPageGenerator(generator): + """ + Wraps around another generator. Yields the same pages, but for non-talk + pages, it also includes associated talk pages. + This generator does not check if the talk page in fact exists. + """ + for page in generator: + yield page + if not page.isTalkPage(): + yield page.toggleTalkPage() + + +def PreloadingGenerator(generator, pageNumber=60, lookahead=10): + """Yield preloaded pages taken from another generator.""" + + # pages may be on more than one site, for example if an interwiki + # generator is used, so use a separate preloader for each site + sites = {} + # build a list of pages for each site found in the iterator + for page in generator: + sites.setdefault(page.site(), []).append(page) + return itertools.chain(*(site.preloadpages(sites[site], pageNumber) + for site in sites)) + + +#TODO below + +def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None): + if site is None: + site = pywikibot.Site() + for page in site.unusedfiles(number=number, repeat=repeat, + extension=extension): + yield pywikibot.ImagePage(page.site(), page.title()) + +def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None): + if site is None: + site = pywikibot.Site() + for page in site.withoutinterwiki(number=number, repeat=repeat): + yield page + +def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.uncategorizedcategories(number=number, repeat=repeat): + yield page + +def UnCategorizedImageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.uncategorizedimages(number=number, repeat=repeat): + yield page + +def NewimagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.newimages(number, repeat=repeat): + yield page[0] + +def UnCategorizedPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.uncategorizedpages(number=number, repeat=repeat): + yield page + +def LonelyPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.lonelypages(number=number, repeat=repeat): + yield page + +def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.unwatchedpages(number=number, repeat=repeat): + yield page + +def AncientPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.ancientpages(number=number, repeat=repeat): + yield page[0] + +def DeadendPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.deadendpages(number=number, repeat=repeat): + yield page + +def LongPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.longpages(number=number, repeat=repeat): + yield page[0] + +def ShortPagesPageGenerator(number = 100, repeat = False, site = None): + if site is None: + site = pywikibot.Site() + for page in site.shortpages(number=number, repeat=repeat): + yield page[0] + +def LinksearchPageGenerator(link, step=500, site=None): + 
"""Yields all pages that include a specified link, according to + [[Special:Linksearch]]. + + """ + if site is None: + site = pywikibot.Site() + for page in site.linksearch(link, limit=step): + yield page + +def SearchPageGenerator(query, number = 100, namespaces = None, site = None): + """ + Provides a list of results using the internal MediaWiki search engine + """ + if site is None: + site = pywikibot.Site() + for page in site.search(query, number=number, namespaces = namespaces): + yield page[0] + +class YahooSearchPageGenerator: + ''' + To use this generator, install pYsearch + ''' + def __init__(self, query = None, count = 100, site = None): # values larger than 100 fail + self.query = query or pywikibot.input(u'Please enter the search query:') + self.count = count + if site is None: + site = pywikibot.Site() + self.site = site + + def queryYahoo(self, query): + from yahoo.search.web import WebSearch + srch = WebSearch(config.yahoo_appid, query=query, results=self.count) + + dom = srch.get_results() + results = srch.parse_results(dom) + for res in results: + url = res.Url + yield url + + def __iter__(self): + # restrict query to local site + localQuery = '%s site:%s' % (self.query, self.site.hostname()) + base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address('')) + for url in self.queryYahoo(localQuery): + if url[:len(base)] == base: + title = url[len(base):] + page = pywikibot.Page(self.site, title) + yield page + +class GoogleSearchPageGenerator: + ''' + To use this generator, you must install the pyGoogle module from + http://pygoogle.sf.net/ and get a Google Web API license key from + http://www.google.com/apis/index.html . The google_key must be set to your + license key in your configuration. + ''' + def __init__(self, query = None, site = None): + self.query = query or pywikibot.input(u'Please enter the search query:') + if site is None: + site = pywikibot.Site() + self.site = site + + ######### + # partially commented out because it is probably not in compliance with Google's "Terms of + # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US) + def queryGoogle(self, query): + #if config.google_key: + if True: + #try: + for url in self.queryViaSoapApi(query): + yield url + return + #except ImportError: + #pass + # No google license key, or pygoogle not installed. Do it the ugly way. + #for url in self.queryViaWeb(query): + # yield url + + def queryViaSoapApi(self, query): + import google + google.LICENSE_KEY = config.google_key + offset = 0 + estimatedTotalResultsCount = None + while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount: + while (True): + # Google often yields 502 errors. + try: + pywikibot.output(u'Querying Google, offset %i' % offset) + data = google.doGoogleSearch(query, start = offset, filter = False) + break + except KeyboardInterrupt: + raise + except: + # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway) + # can happen here, depending on the module used. It's not easy + # to catch this properly because pygoogle decides which one of + # the soap modules to use. + pywikibot.output(u"An error occured. Retrying in 10 seconds...") + time.sleep(10) + continue + + for result in data.results: + #print 'DBG: ', result.URL + yield result.URL + # give an estimate of pages to work on, but only once. + if not estimatedTotalResultsCount: + pywikibot.output(u'Estimated total result count: %i pages.' 
% data.meta.estimatedTotalResultsCount) + estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount + #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount + offset += 10 + + ######### + # commented out because it is probably not in compliance with Google's "Terms of + # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US) + + #def queryViaWeb(self, query): + #""" + #Google has stopped giving out API license keys, and sooner or later + #they will probably shut down the service. + #This is a quick and ugly solution: we just grab the search results from + #the normal web interface. + #""" + #linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE) + #offset = 0 + + #while True: + #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1)) + #address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(query), offset) + ## we fake being Firefox because Google blocks unknown browsers + #request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5'}) + #urlfile = urllib2.urlopen(request) + #page = urlfile.read() + #urlfile.close() + #for url in linkR.findall(page): + #yield url + #if "<div id=nn>" in page: # Is there a "Next" link for next page of results? + #offset += 100 # Yes, go to next page of results. + #else: + #return + ######### + + def __iter__(self): + # restrict query to local site + localQuery = '%s site:%s' % (self.query, self.site.hostname()) + base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address('')) + for url in self.queryGoogle(localQuery): + if url[:len(base)] == base: + title = url[len(base):] + page = pywikibot.Page(self.site, title) + # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar + if page.site() == self.site: + yield page + +def MySQLPageGenerator(query, site = None): + import MySQLdb as mysqldb + if site is None: + site = pywikibot.Site() + conn = mysqldb.connect(config.db_hostname, db = site.dbName(), + user = config.db_username, + passwd = config.db_password) + cursor = conn.cursor() + pywikibot.output(u'Executing query:\n%s' % query) + query = query.encode(site.encoding()) + cursor.execute(query) + while True: + try: + namespaceNumber, pageName = cursor.fetchone() + print namespaceNumber, pageName + except TypeError: + # Limit reached or no more results + break + #print pageName + if pageName: + namespace = site.namespace(namespaceNumber) + pageName = unicode(pageName, site.encoding()) + if namespace: + pageTitle = '%s:%s' % (namespace, pageName) + else: + pageTitle = pageName + page = pywikibot.Page(site, pageTitle) + yield page + +def YearPageGenerator(start = 1, end = 2050, site = None): + if site is None: + site = pywikibot.Site() + pywikibot.output(u"Starting with year %i" % start) + for i in xrange(start, end + 1): + if i % 100 == 0: + pywikibot.output(u'Preparing %i...' 
% i) + # There is no year 0 + if i != 0: + current_year = date.formatYear(site.lang, i ) + yield pywikibot.Page(site, current_year) + +def DayPageGenerator(startMonth = 1, endMonth = 12, site = None): + if site is None: + site = pywikibot.Site() + fd = date.FormatDate(site) + firstPage = pywikibot.Page(site, fd(startMonth, 1)) + pywikibot.output(u"Starting with %s" % firstPage.aslink()) + for month in xrange(startMonth, endMonth+1): + for day in xrange(1, date.getNumberOfDaysInMonth(month)+1): + yield pywikibot.Page(site, fd(month, day)) + + +if __name__ == "__main__": + try: + gen = None + genFactory = GeneratorFactory() + for arg in pywikibot.handleArgs(): + generator = genFactory.handleArg(arg) + if generator: + gen = generator + if gen: + for page in gen: + pywikibot.output(page.title(), toStdout = True) + else: + pywikibot.showHelp() + finally: + pywikibot.stopme()
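The __main__ block above doubles as a reference for how bot scripts are expected to drive this module. Below is a minimal sketch of that pattern, assuming the rewrite-branch API shown in this commit (GeneratorFactory.handleArg, PreloadingGenerator) plus the pywikibot helpers it already calls (handleArgs, output, showHelp, stopme); the batching value and overall script are illustrative, not part of the commit:

import pywikibot
from pywikibot import pagegenerators


def main():
    gen = None
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        generator = genFactory.handleArg(arg)
        if generator:
            gen = generator
    if gen is None:
        pywikibot.showHelp()
        return
    # handleArg() already wraps its result in DuplicateFilterPageGenerator,
    # so every yielded page is unique; PreloadingGenerator then fetches page
    # text in batches to reduce the number of API requests.
    for page in pagegenerators.PreloadingGenerator(gen, pageNumber=60):
        pywikibot.output(page.title())


if __name__ == "__main__":
    try:
        main()
    finally:
        pywikibot.stopme()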
Property changes on: branches/rewrite/pywikibot/pagegenerators.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native
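Before turning to site.py, one more illustration of how the filter generators committed above compose: each one wraps another generator, so they can be stacked freely. This is a hedged sketch using the generator names as defined in this commit; the category title is a hypothetical placeholder:

import pywikibot
from pywikibot import pagegenerators

site = pywikibot.Site()
# 'Category:Example' is a placeholder, not a title taken from the commit
cat = pywikibot.Category(pywikibot.Link('Category:Example', site))
# pages in the category and in its direct subcategories only (recurse=1)
gen = pagegenerators.CategorizedPageGenerator(cat, recurse=1)
# keep main-namespace pages, then drop redirects
gen = pagegenerators.NamespaceFilterPageGenerator(gen, [0], site=site)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
for page in gen:
    pywikibot.output(page.title())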
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-12-16 19:34:48 UTC (rev 6155) +++ branches/rewrite/pywikibot/site.py 2008-12-16 19:40:20 UTC (rev 6156) @@ -1,2861 +1,2861 @@ - # -*- coding: utf-8 -*- -""" -Objects representing MediaWiki sites (wikis) and families (groups of wikis -on the same topic in different languages). -""" -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id: $' - -import pywikibot -from pywikibot import deprecate_arg -from pywikibot import config -from pywikibot.throttle import Throttle -from pywikibot.data import api -from pywikibot.exceptions import * - -try: - from hashlib import md5 -except ImportError: - from md5 import md5 -import logging -import os -import re -import sys -import threading -import urllib - -logger = logging.getLogger("wiki") - -class PageInUse(pywikibot.Error): - """Page cannot be reserved for writing due to existing lock.""" - - -def Family(fam=None, fatal=True): - """Import the named family. - - @param fam: family name (if omitted, uses the configured default) - @type fam: str - @param fatal: if True, the bot will stop running if the given family is - unknown. If False, it will only raise a ValueError exception. - @param fatal: bool - @return: a Family instance configured for the named family. - - """ - if fam == None: - fam = config.family - try: - # first try the built-in families - exec "import pywikibot.families.%s_family as myfamily" % fam - except ImportError: - # next see if user has defined a local family module - try: - sys.path.append(config.datafilepath('families')) - exec "import %s_family as myfamily" % fam - except ImportError: - if fatal: - logger.exception(u"""\ -Error importing the %s family. This probably means the family -does not exist. Also check your configuration file.""" - % fam) - sys.exit(1) - else: - raise Error("Family %s does not exist" % fam) - return myfamily.Family() - - -class BaseSite(object): - """Site methods that are independent of the communication interface.""" - # to implement a specific interface, define a Site class that inherits - # from this - - def __init__(self, code, fam=None, user=None, sysop=None): - """ - @param code: the site's language code - @type code: str - @param fam: wiki family name (optional) - @type fam: str or Family - @param user: bot user name (optional) - @type user: str - @param sysop: sysop account user name (optional) - @type sysop: str - - """ - self.__code = code.lower() - if isinstance(fam, basestring) or fam is None: - self.__family = Family(fam, fatal=False) - else: - self.__family = fam - - # if we got an outdated language code, use the new one instead. - if self.__family.obsolete.has_key(self.__code): - if self.__family.obsolete[self.__code] is not None: - self.__code = self.__family.obsolete[self.__code] - else: - # no such language anymore - raise NoSuchSite("Language %s in family %s is obsolete" - % (self.__code, self.__family.name)) - if self.__code not in self.languages(): - if self.__code == 'zh-classic' and 'zh-classical' in self.languages(): - self.__code = 'zh-classical' - # database hack (database is varchar[10] -> zh-classical - # is cut to zh-classic. 
- else: - raise NoSuchSite("Language %s does not exist in family %s" - % (self.__code, self.__family.name)) - - self._username = [user, sysop] - - # following are for use with lock_page and unlock_page methods - self._pagemutex = threading.Lock() - self._locked_pages = [] - - @property - def throttle(self): - """Return this Site's throttle. Initialize a new one if needed.""" - - if not hasattr(self, "_throttle"): - self._throttle = Throttle(self, multiplydelay=True, - verbosedelay=True) - try: - self.login(False) - except pywikibot.NoUsername: - pass - return self._throttle - - @property - def family(self): - """The Family object for this Site's wiki family.""" - - return self.__family - - @property - def code(self): - """The identifying code for this Site.""" - - return self.__code - - @property - def lang(self): - """The ISO language code for this Site. - - Presumed to be equal to the wiki prefix, but this can be overridden. - - """ - return self.__code - - def __cmp__(self, other): - """Perform equality and inequality tests on Site objects.""" - - if not isinstance(other, BaseSite): - return 1 - if self.family == other.family: - return cmp(self.code, other.code) - return cmp(self.family.name, other.family.name) - - def user(self): - """Return the currently-logged in bot user, or None.""" - - if self.logged_in(True): - return self._username[True] - elif self.logged_in(False): - return self._username[False] - return None - - def username(self, sysop = False): - return self._username[sysop] - - def __getattr__(self, attr): - """Calls to methods not defined in this object are passed to Family.""" - - if hasattr(self.__class__, attr): - return self.__class__.attr - try: - method = getattr(self.family, attr) - f = lambda *args, **kwargs: \ - method(self.code, *args, **kwargs) - if hasattr(method, "__doc__"): - f.__doc__ = method.__doc__ - return f - except AttributeError: - raise AttributeError("%s instance has no attribute '%s'" - % (self.__class__.__name__, attr) ) - - def sitename(self): - """Return string representing this Site's name and language.""" - - return self.family.name+':'+self.code - - __str__ = sitename - - def __repr__(self): - return 'Site("%s", "%s")' % (self.code, self.family.name) - - def __hash__(self): - return hash(repr(self)) - - def linktrail(self): - """Return regex for trailing chars displayed as part of a link. - - Returns a string, not a compiled regular expression object. - - This reads from the family file, and ''not'' from - [[MediaWiki:Linktrail]], because the MW software currently uses a - built-in linktrail from its message files and ignores the wiki - value. 
- - """ - return self.family.linktrail(self.code) - - def languages(self): - """Return list of all valid language codes for this site's Family.""" - - return self.family.langs.keys() - - def validLanguageLinks(self): - """Return list of language codes that can be used in interwiki links.""" - - nsnames = sum(self.namespaces().values(), []) - return [l for l in self.languages() - if l[:1].upper() + l[1:] not in self.namespaces()] - - def ns_index(self, namespace): - """Given a namespace name, return its int index, or None if invalid.""" - - for ns in self.namespaces(): - if namespace.lower() in [name.lower() - for name in self.namespaces()[ns]]: - return ns - return None - - getNamespaceIndex = ns_index # for backwards-compatibility - - def namespaces(self): - """Return dict of valid namespaces on this wiki.""" - - return self._namespaces - - def ns_normalize(self, value): - """Return canonical local form of namespace name. - - @param value: A namespace name - @type value: unicode - - """ - index = self.ns_index(value) - return self.namespace(index) - - normalizeNamespace = ns_normalize # for backwards-compatibility - - def redirect(self, default=True): - """Return the localized redirect tag for the site. - - If default is True, falls back to 'REDIRECT' if the site has no - special redirect tag. - - """ - if default: - return self.family.redirect.get(self.code, [u"REDIRECT"])[0] - else: - return self.family.redirect.get(self.code, None) - - def lock_page(self, page, block=True): - """Lock page for writing. Must be called before writing any page. - - We don't want different threads trying to write to the same page - at the same time, even to different sections. - - @param page: the page to be locked - @type page: pywikibot.Page - @param block: if true, wait until the page is available to be locked; - otherwise, raise an exception if page can't be locked - - """ - self._pagemutex.acquire() - try: - while page in self._locked_pages: - if not block: - raise PageInUse - time.sleep(.25) - self._locked_pages.append(page.title(withSection=False)) - finally: - self._pagemutex.release() - - def unlock_page(self, page): - """Unlock page. Call as soon as a write operation has completed. - - @param page: the page to be locked - @type page: pywikibot.Page - - """ - self._pagemutex.acquire() - try: - self._locked_pages.remove(page.title(withSection=False)) - finally: - self._pagemutex.release() - - def disambcategory(self): - """Return Category in which disambig pages are listed.""" - - try: - name = self.namespace(14)+':'+self.family.disambcatname[self.code] - except KeyError: - raise Error(u"No disambiguation category name found for %(site)s" - % {'site': self}) - return pywikibot.Category(pywikibot.Link(name, self)) - - def linkto(self, title, othersite = None): - """Return unicode string in the form of a wikilink to 'title' - - Use optional Site argument 'othersite' to generate an interwiki link. - - """ - logger.debug("Site.linkto() method is deprecated; use pywikibot.Link") - return pywikibot.Link(title, self).astext(othersite) - - def isInterwikiLink(self, s): - """Return True if s is in the form of an interwiki link. - - If a link object constructed using "s" as the link text parses as - belonging to a different site, this method returns True. - - """ - return (pywikibot.Link(s, self).site != self) - - def redirectRegex(self): - """Return a compiled regular expression matching on redirect pages. - - Group 1 in the regex match object will be the target title. 
- - """ - #TODO: is this needed, since the API identifies redirects? - # (maybe, the API can give false positives) - default = 'REDIRECT' - try: - keywords = set(self.family.redirect[self.code]) - keywords.add(default) - pattern = r'(?:' + '|'.join(keywords) + ')' - except KeyError: - # no localized keyword for redirects - pattern = r'%s' % default - # A redirect starts with hash (#), followed by a keyword, then - # arbitrary stuff, then a wikilink. The wikilink may contain - # a label, although this is not useful. - return re.compile(r'\s*#%(pattern)s\s*:?\s*[[(.+?)(?:|.*?)?]]' - % locals(), - re.IGNORECASE | re.UNICODE | re.DOTALL) - - # namespace shortcuts for backwards-compatibility - - def special_namespace(self): - return self.namespace(-1) - - def image_namespace(self): - return self.namespace(6) - - def mediawiki_namespace(self): - return self.namespace(8) - - def template_namespace(self): - return self.namespace(10) - - def category_namespace(self): - return self.namespace(14) - - def category_namespaces(self): - return self.namespace(14, all=True) - - # site-specific formatting preferences - - def category_on_one_line(self): - """Return True if this site wants all category links on one line.""" - - return self.code in self.family.category_on_one_line - - def interwiki_putfirst(self): - """Return list of language codes for ordering of interwiki links.""" - - return self.family.interwiki_putfirst.get(self.code, None) - - def interwiki_putfirst_doubled(self, list_of_links): - # TODO: is this even needed? No family in the framework has this - # dictionary defined! - if self.lang in self.family.interwiki_putfirst_doubled: - if len(list_of_links) >= \ - self.family.interwiki_putfirst_doubled[self.lang][0]: - links2 = [lang.language() for lang in list_of_links] - result = [] - for lang in self.family.interwiki_putfirst_doubled[self.lang][1]: - try: - result.append(list_of_links[links2.index(lang)]) - except ValueError: - pass - return result - else: - return False - else: - return False - - def getSite(self, code): - """Return Site object for language 'code' in this Family.""" - - return pywikibot.Site(code=code, fam=self.family, user=self.user) - - # deprecated methods for backwards-compatibility - - def fam(self): - """Return Family object for this Site.""" - return self.family - - def urlEncode(self, query): - """DEPRECATED""" - return urllib.urlencode(query) - - def getUrl(self, path, retry=True, sysop=False, data=None, - compress=True, no_hostname=False, cookie_only=False): - """DEPRECATED. - - Retained for compatibility only. All arguments except path and data - are ignored. 
- - """ - if data: - if not isinstance(data, basestring): - data = urllib.urlencode(data) - return pywikibot.comms.data.request(self, path, method="PUT", - body=data) - else: - return pywikibot.comms.data.request(self, path) - - def postForm(self, address, predata, sysop=False, cookies=None): - """DEPRECATED""" - return self.getUrl(address, data=predata) - - def postData(self, address, data, contentType=None, sysop=False, - compress=True, cookies=None): - """DEPRECATED""" - return self.getUrl(address, data=data) - - # unsupported methods from version 1 - - def checkCharset(self, charset): - raise NotImplementedError - def getToken(self, getalways=True, getagain=False, sysop=False): - raise NotImplementedError - def export_address(self): - raise NotImplementedError - def move_address(self): - raise NotImplementedError - def delete_address(self, s): - raise NotImplementedError - def undelete_view_address(self, s, ts=''): - raise NotImplementedError - def undelete_address(self): - raise NotImplementedError - def protect_address(self, s): - raise NotImplementedError - def unprotect_address(self, s): - raise NotImplementedError - def put_address(self, s): - raise NotImplementedError - def get_address(self, s): - raise NotImplementedError - def nice_get_address(self, s): - raise NotImplementedError - def edit_address(self, s): - raise NotImplementedError - def purge_address(self, s): - raise NotImplementedError - def block_address(self): - raise NotImplementedError - def unblock_address(self): - raise NotImplementedError - def blocksearch_address(self, s): - raise NotImplementedError - def linksearch_address(self, s, limit=500, offset=0): - raise NotImplementedError - def search_address(self, q, n=50, ns=0): - raise NotImplementedError - def allpages_address(self, s, ns = 0): - raise NotImplementedError - def log_address(self, n=50, mode = ''): - raise NotImplementedError - def newpages_address(self, n=50): - raise NotImplementedError - def longpages_address(self, n=500): - raise NotImplementedError - def shortpages_address(self, n=500): - raise NotImplementedError - def unusedfiles_address(self, n=500): - raise NotImplementedError - def categories_address(self, n=500): - raise NotImplementedError - def deadendpages_address(self, n=500): - raise NotImplementedError - def ancientpages_address(self, n=500): - raise NotImplementedError - def lonelypages_address(self, n=500): - raise NotImplementedError - def protectedpages_address(self, n=500): - raise NotImplementedError - def unwatchedpages_address(self, n=500): - raise NotImplementedError - def uncategorizedcategories_address(self, n=500): - raise NotImplementedError - def uncategorizedimages_address(self, n=500): - raise NotImplementedError - def uncategorizedpages_address(self, n=500): - raise NotImplementedError - def unusedcategories_address(self, n=500): - raise NotImplementedError - def withoutinterwiki_address(self, n=500): - raise NotImplementedError - def references_address(self, s): - raise NotImplementedError - def allmessages_address(self): - raise NotImplementedError - def upload_address(self): - raise NotImplementedError - def double_redirects_address(self, default_limit = True): - raise NotImplementedError - def broken_redirects_address(self, default_limit = True): - raise NotImplementedError - def login_address(self): - raise NotImplementedError - def captcha_image_address(self, id): - raise NotImplementedError - def watchlist_address(self): - raise NotImplementedError - def contribs_address(self, target, limit=500, offset=''): - 
raise NotImplementedError - - -class APISite(BaseSite): - """API interface to MediaWiki site. - - Do not use directly; use pywikibot.Site function. - - """ -## Site methods from version 1.0 (as these are implemented in this file, -## or declared deprecated/obsolete, they will be removed from this list) -########## -## cookies: return user's cookies as a string -## -## urlEncode: Encode a query to be sent using an http POST request. -## postForm: Post form data to an address at this site. -## postData: Post encoded form data to an http address at this site. -## -## shared_image_repository: Return tuple of image repositories used by this -## site. -## version: Return MediaWiki version string from Family file. -## versionnumber: Return int identifying the MediaWiki version. -## live_version: Return version number read from Special:Version. -## checkCharset(charset): Warn if charset doesn't match family file. -## -## linktrail: Return regex for trailing chars displayed as part of a link. -## disambcategory: Category in which disambiguation pages are listed. -## -## Methods that yield Page objects derived from a wiki's Special: pages -## (note, some methods yield other information in a tuple along with the -## Pages; see method docs for details) -- -## -## newpages(): Special:Newpages -## newimages(): Special:Log&type=upload -## longpages(): Special:Longpages -## shortpages(): Special:Shortpages -## deadendpages(): Special:Deadendpages -## ancientpages(): Special:Ancientpages -## lonelypages(): Special:Lonelypages -## unwatchedpages(): Special:Unwatchedpages (sysop accounts only) -## uncategorizedcategories(): Special:Uncategorizedcategories (yields -## Category objects) -## uncategorizedpages(): Special:Uncategorizedpages -## uncategorizedimages(): Special:Uncategorizedimages (yields -## ImagePage objects) -## unusedcategories(): Special:Unusuedcategories (yields Category) -## unusedfiles(): Special:Unusedimages (yields ImagePage) -## withoutinterwiki: Special:Withoutinterwiki -## linksearch: Special:Linksearch - - def __init__(self, code, fam=None, user=None, sysop=None): - BaseSite.__init__(self, code, fam, user, sysop) - self._namespaces = { - # these are the MediaWiki built-in names, which always work - # localized names are loaded later upon accessing the wiki - # namespace prefixes are always case-insensitive, but the - # canonical forms are capitalized - -2: [u"Media"], - -1: [u"Special"], - 0: [u""], - 1: [u"Talk"], - 2: [u"User"], - 3: [u"User talk"], - 4: [u"Project"], - 5: [u"Project talk"], - 6: [u"Image"], - 7: [u"Image talk"], - 8: [u"MediaWiki"], - 9: [u"MediaWiki talk"], - 10: [u"Template"], - 11: [u"Template talk"], - 12: [u"Help"], - 13: [u"Help talk"], - 14: [u"Category"], - 15: [u"Category talk"], - } - self.sitelock = threading.Lock() - self._msgcache = {} - return - -# ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__() - self.nocapitalize = self.__code in self.family.nocapitalize - # Calculating valid languages took quite long, so we calculate it once - # in initialization instead of each time it is used. - self._validlanguages = [] - for language in self.languages(): - if not language[:1].upper() + language[1:] in self.namespaces(): - self._validlanguages.append(language) - - def logged_in(self, sysop=False): - """Return True if logged in with specified privileges, otherwise False. - - @param sysop: if True, require sysop privileges. 
- - """ - if self.userinfo['name'] != self._username[sysop]: - return False - return (not sysop) or 'sysop' in self.userinfo['groups'] - - def loggedInAs(self, sysop = False): - """Return the current username if logged in, otherwise return None. - - DEPRECATED (use .user() method instead) - - """ - logger.debug("Site.loggedInAs() method is deprecated.") - return self.logged_in(sysop) and self.user() - - def login(self, sysop=False): - """Log the user in if not already logged in.""" - if not hasattr(self, "_siteinfo"): - self._getsiteinfo() - # check whether a login cookie already exists for this user - if hasattr(self, "_userinfo"): - if self.userinfo['name'] == self._username[sysop]: - return - if not self.logged_in(sysop): - loginMan = api.LoginManager(site=self, sysop=sysop, - user=self._username[sysop]) - if loginMan.login(retry = True): - self._username[sysop] = loginMan.username - if hasattr(self, "_userinfo"): - del self._userinfo - self.getuserinfo() - - forceLogin = login # alias for backward-compatibility - - def getuserinfo(self): - """Retrieve userinfo from site and store in _userinfo attribute. - - self._userinfo will be a dict with the following keys and values: - - - id: user id (numeric str) - - name: username (if user is logged in) - - anon: present if user is not logged in - - groups: list of groups (could be empty) - - rights: list of rights (could be empty) - - message: present if user has a new message on talk page - - blockinfo: present if user is blocked (dict) - - """ - if (not hasattr(self, "_userinfo") - or "rights" not in self._userinfo - or self._userinfo['name'] - != self._username["sysop" in self._userinfo["groups"]]): - uirequest = api.Request( - site=self, - action="query", - meta="userinfo", - uiprop="blockinfo|hasmsg|groups|rights" - ) - uidata = uirequest.submit() - assert 'query' in uidata, \ - "API userinfo response lacks 'query' key" - assert 'userinfo' in uidata['query'], \ - "API userinfo response lacks 'userinfo' key" - self._userinfo = uidata['query']['userinfo'] - return self._userinfo - - userinfo = property(fget=getuserinfo, doc=getuserinfo.__doc__) - - def is_blocked(self, sysop=False): - """Return true if and only if user is blocked. - - @param sysop: If true, log in to sysop account (if available) - - """ - if not self.logged_in(sysop): - self.login(sysop) - return 'blockinfo' in self._userinfo - - def isBlocked(self, sysop=False): - """Deprecated synonym for is_blocked""" - logger.debug( - "Site method 'isBlocked' should be changed to 'is_blocked'") - return self.is_blocked(sysop) - - def checkBlocks(self, sysop = False): - """Check if the user is blocked, and raise an exception if so.""" - if self.is_blocked(sysop): - # User blocked - raise UserBlocked('User is blocked in site %s' % self) - - def has_right(self, right, sysop=False): - """Return true if and only if the user has a specific right. - - Possible values of 'right' may vary depending on wiki settings, - but will usually include: - - * Actions: edit, move, delete, protect, upload - * User levels: autoconfirmed, sysop, bot - - """ - if not self.logged_in(sysop): - self.login(sysop) - return right.lower() in self._userinfo['rights'] - - def isAllowed(self, right, sysop=False): - """Deprecated; retained for backwards-compatibility""" - logger.debug("Site.isAllowed() method is deprecated; use has_right()") - return self.has_right(right, sysop) - - def has_group(self, group, sysop=False): - """Return true if and only if the user is a member of specified group. 
- - Possible values of 'group' may vary depending on wiki settings, - but will usually include bot. - - """ - if not self.logged_in(sysop): - self.login(sysop) - return group.lower() in self._userinfo['groups'] - - def messages(self, sysop=False): - """Returns true if the user has new messages, and false otherwise.""" - if not self.logged_in(sysop): - self.login(sysop) - return 'hasmsg' in self._userinfo - - def mediawiki_message(self, key): - """Return the MediaWiki message text for key "key" """ - if not key in self._msgcache: - msg_query = api.QueryGenerator(site=self, meta="allmessages", - amfilter=key) - for msg in msg_query: - if msg['name'] == key and not 'missing' in msg: - self._msgcache[key] = msg['*'] - break - else: - raise KeyError("Site %(self)s has no message '%(key)s'" - % locals()) - return self._msgcache[key] - - def has_mediawiki_message(self, key): - """Return True iff this site defines a MediaWiki message for 'key'.""" - try: - v = self.mediawiki_message(key) - return True - except KeyError: - return False - - def getcurrenttimestamp(self): - """Return (Mediawiki) timestamp, {{CURRENTTIMESTAMP}}, the server time. - - Format is yyyymmddhhmmss - - """ - r = api.Request(site=self, - action="parse", - text="{{CURRENTTIMESTAMP}}") - result = r.submit() - return re.search('\d+', result['parse']['text']['*']).group() - - def _getsiteinfo(self): - """Retrieve siteinfo and namespaces from site.""" - sirequest = api.Request( - site=self, - action="query", - meta="siteinfo", - siprop="general|namespaces|namespacealiases" - ) - try: - sidata = sirequest.submit() - except api.APIError: - # hack for older sites that don't support 1.12 properties - # probably should delete if we're not going to support pre-1.12 - sirequest = api.Request( - site=self, - action="query", - meta="siteinfo", - siprop="general|namespaces" - ) - sidata = sirequest.submit() - - assert 'query' in sidata, \ - "API siteinfo response lacks 'query' key" - sidata = sidata['query'] - assert 'general' in sidata, \ - "API siteinfo response lacks 'general' key" - assert 'namespaces' in sidata, \ - "API siteinfo response lacks 'namespaces' key" - self._siteinfo = sidata['general'] - nsdata = sidata['namespaces'] - for nskey in nsdata: - ns = int(nskey) - if ns in self._namespaces: - if nsdata[nskey]["*"] in self._namespaces[ns]: - continue - # this is the preferred form so it goes at front of list - self._namespaces[ns].insert(0, nsdata[nskey]["*"]) - else: - self._namespaces[ns] = [nsdata[nskey]["*"]] - if 'namespacealiases' in sidata: - aliasdata = sidata['namespacealiases'] - for item in aliasdata: - if item["*"] in self._namespaces[int(item['id'])]: - continue - # this is a less preferred form so it goes at the end - self._namespaces[int(item['id'])].append(item["*"]) - - @property - def siteinfo(self): - """Site information dict.""" - - if not hasattr(self, "_siteinfo"): - self._getsiteinfo() - return self._siteinfo - - def case(self): - """Return this site's capitalization rule.""" - - return self.siteinfo['case'] - - def language(self): - """Return the code for the language of this Site.""" - - return self.siteinfo['lang'] - - lang = property(fget=language, doc=language.__doc__) - - def namespaces(self): - """Return dict of valid namespaces on this wiki.""" - - if not hasattr(self, "_siteinfo"): - self._getsiteinfo() - return self._namespaces - - def namespace(self, num, all=False): - """Return string containing local name of namespace 'num'. 
- - If optional argument 'all' is true, return a list of all recognized - values for this namespace. - - """ - if all: - return self.namespaces()[num] - return self.namespaces()[num][0] - - def live_version(self): - """Return the 'real' version number found on [[Special:Version]] - - Return value is a tuple (int, int, str) of the major and minor - version numbers and any other text contained in the version. - - """ - versionstring = self.siteinfo['generator'] - m = re.match(r"^MediaWiki ([0-9]+).([0-9]+)(.*)$", versionstring) - if m: - return (int(m.group(1)), int(m.group(2)), m.group(3)) - else: - return None - - def loadpageinfo(self, page): - """Load page info from api and save in page attributes""" - title = page.title(withSection=False) - query = api.PropertyGenerator("info", site=self, - titles=title.encode(self.encoding()), - inprop="protection") - for pageitem in query: - if pageitem['title'] != title: - raise Error( - u"loadpageinfo: Query on %s returned data on '%s'" - % (page, pageitem['title'])) - api.update_page(page, pageitem) - - def loadimageinfo(self, page, history=False): - """Load image info from api and save in page attributes - - @param history: if true, return the image's version history - - """ - title = page.title(withSection=False) - query = api.PropertyGenerator("imageinfo", site=self, - titles=title.encode(self.encoding()), - iiprop=["timestamp", "user", "comment", - "url", "size", "sha1", "mime", - "metadata", "archivename"]) - if history: - query.request["iilimit"] = "max" - for pageitem in query: - if pageitem['title'] != title: - raise Error( - u"loadpageinfo: Query on %s returned data on '%s'" - % (page, pageitem['title'])) - api.update_page(page, pageitem) - if history: - return pageitem['imageinfo'] - - def page_exists(self, page): - """Return True if and only if page is an existing page on site.""" - if not hasattr(page, "_pageid"): - self.loadpageinfo(page) - return page._pageid > 0 - - def page_restrictions(self, page): - """Returns a dictionary reflecting page protections""" - if not self.page_exists(page): - raise NoPage(u'No page %s.' % page) - if not hasattr(page, "_protection"): - self.loadpageinfo(page) - return page._protection - - def page_can_be_edited(self, page): - """ - Returns True if and only if: - - page is unprotected, and bot has an account for this site, or - - page is protected, and bot has a sysop account for this site. - - """ - rest = self.page_restrictions(page) - sysop_protected = rest.has_key('edit') and rest['edit'][0] == 'sysop' - try: - api.LoginManager(site=self, sysop=sysop_protected) - except NoUsername: - return False - return True - - def page_isredirect(self, page): - """Return True if and only if page is a redirect.""" - if not hasattr(page, "_redir"): - self.loadpageinfo(page) - return bool(page._redir) - - def getredirtarget(self, page): - """Return Page object for the redirect target of page.""" - if not hasattr(page, "_redir"): - self.loadpageinfo(page) - if not page._redir: - raise pywikibot.IsNotRedirectPage(page.title()) - title = page.title(withSection=False) - query = api.Request(site=self, action="query", property="info", - inprop="protection|talkid|subjectid", - titles=title.encode(self.encoding()), - redirects="") - result = query.submit() - if "query" not in result or "redirects" not in result["query"]: - raise RuntimeError( - "getredirtarget: No 'redirects' found for page %s." 
- % title) - redirmap = dict((item['from'], item['to']) - for item in result['query']['redirects']) - if title not in redirmap: - raise RuntimeError( - "getredirtarget: 'redirects' contains no key for page %s." - % title) - if "pages" not in result['query']: - # no "pages" element indicates a circular redirect - raise pywikibot.CircularRedirect(redirmap[title]) - for pagedata in result['query']['pages'].values(): - # there should be only one value in 'pages', and it is the target - if pagedata['title'] not in redirmap.values(): - raise RuntimeError( - "getredirtarget: target page '%s' not found in 'redirects'" - % pagedata['title']) - target = pywikibot.Page(self, pagedata['title'], pagedata['ns']) - api.update_page(target, pagedata) - page._redir = target - - def preloadpages(self, pagelist, groupsize=60): - """Return a generator to a list of preloaded pages. - - Note that [at least in current implementation] pages may be iterated - in a different order than in the underlying pagelist. - - @param pagelist: an iterable that returns Page objects - @param groupsize: how many Pages to query at a time - @type groupsize: int - - """ - from pywikibot.tools import itergroup - for sublist in itergroup(pagelist, groupsize): - pageids = [str(p._pageid) for p in sublist - if hasattr(p, "_pageid") - and p._pageid > 0] - cache = dict((p.title(withSection=False), p) for p in sublist) - rvgen = api.PropertyGenerator("revisions|info", site=self) - rvgen.limit = -1 - if len(pageids) == len(sublist): - # only use pageids if all pages have them - rvgen.request["pageids"] = "|".join(pageids) - else: - rvgen.request["titles"] = "|".join(cache.keys()) - rvgen.request[u"rvprop"] = \ - u"ids|flags|timestamp|user|comment|content" - logger.info(u"Retrieving %s pages from %s." - % (len(cache), self) - ) - for pagedata in rvgen: - logger.debug("Preloading %s" % pagedata) - try: - if pagedata['title'] not in cache: - raise Error( - u"preloadpages: Query returned unexpected title '%s'" - % pagedata['title'] - ) - except KeyError: - logger.debug("No 'title' in %s" % pagedata) - logger.debug("pageids=%s" % pageids) - logger.debug("titles=%s" % cache.keys()) - continue - page = cache[pagedata['title']] - api.update_page(page, pagedata) - yield page - - def token(self, page, tokentype): - """Return token retrieved from wiki to allow changing page content. - - @param page: the Page for which a token should be retrieved - @param tokentype: the type of token (e.g., "edit", "move", "delete"); - see API documentation for full list of types - - """ - query = api.PropertyGenerator("info|revisions", site=self, - titles=page.title(withSection=False), - intoken=tokentype) - for item in query: - if item['title'] != page.title(withSection=False): - raise Error( - u"token: Query on page %s returned data on page [[%s]]" - % (page.title(withSection=False, asLink=True), - item['title'])) - api.update_page(page, item) - logging.debug(str(item)) - return item[tokentype + "token"] - - # following group of methods map more-or-less directly to API queries - - def pagebacklinks(self, page, followRedirects=False, filterRedirects=None, - namespaces=None): - """Iterate all pages that link to the given page. - - @param page: The Page to get links to. - @param followRedirects: Also return links to redirects pointing to - the given page. - @param filterRedirects: If True, only return redirects to the given - page. If False, only return non-redirect links. If None, return - both (no filtering). 
- @param namespaces: If present, only return links from the namespaces - in this list. - - """ - bltitle = page.title(withSection=False).encode(self.encoding()) - blgen = api.PageGenerator("backlinks", gbltitle=bltitle, site=self) - if isinstance(namespaces, list): - blgen.request["gblnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - blgen.request["gblnamespace"] = str(namespaces) - if filterRedirects is not None: - blgen.request["gblfilterredir"] = filterRedirects and "redirects"\ - or "nonredirects" - if followRedirects: - # bug: see http://bugzilla.wikimedia.org/show_bug.cgi?id=16218 - # links identified by MediaWiki as redirects may not really be, - # so we have to check each "redirect" page and see if it - # really redirects to this page - blgen.request["gblfilterredir"] = "nonredirects" - redirgen = api.PageGenerator("backlinks", gbltitle=bltitle, - site=self, gblfilterredir="redirects") - if "gblnamespace" in blgen.request: - redirgen.request["gblnamespace"] = blgen.request["gblnamespace"] - genlist = [blgen] - for redir in redirgen: - if redir.getRedirectTarget() == page: - genlist.append( - self.pagebacklinks( - redir, True, None, namespaces)) - import itertools - return itertools.chain(*genlist) - return blgen - - def page_embeddedin(self, page, filterRedirects=None, namespaces=None): - """Iterate all pages that embedded the given page as a template. - - @param page: The Page to get inclusions for. - @param filterRedirects: If True, only return redirects that embed - the given page. If False, only return non-redirect links. If - None, return both (no filtering). - @param namespaces: If present, only return links from the namespaces - in this list. - - """ - eititle = page.title(withSection=False).encode(self.encoding()) - eigen = api.PageGenerator("embeddedin", geititle=eititle, site=self) - if isinstance(namespaces, list): - eigen.request["geinamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - eigen.request["geinamespace"] = str(namespaces) - if filterRedirects is not None: - eigen.request["geifilterredir"] = filterRedirects and "redirects"\ - or "nonredirects" - return eigen - - def pagereferences(self, page, followRedirects=False, filterRedirects=None, - withTemplateInclusion=True, onlyTemplateInclusion=False, - namespaces=None): - """Convenience method combining pagebacklinks and page_embeddedin.""" - - if onlyTemplateInclusion: - return self.page_embeddedin(page, namespaces=namespaces) - if not withTemplateInclusion: - return self.pagebacklinks(page, followRedirects, - namespaces=namespaces) - import itertools - return itertools.chain( - self.pagebacklinks(page, followRedirects, - filterRedirects, namespaces=namespaces), - self.page_embeddedin(page, filterRedirects, - namespaces=namespaces) - ) - - def pagelinks(self, page, namespaces=None, follow_redirects=False, - limit=None): - """Iterate internal wikilinks contained (or transcluded) on page. 
- - @param namespaces: Only iterate pages in these namespaces (default: all) - @type namespaces: list of ints - @param follow_redirects: if True, yields the target of any redirects, - rather than the redirect page - - """ - plgen = api.PageGenerator("links", site=self) - if isinstance(limit, int): - plgen.limit = limit - if hasattr(page, "_pageid"): - plgen.request['pageids'] = str(page._pageid) - else: - pltitle = page.title(withSection=False).encode(self.encoding()) - plgen.request['titles'] = pltitle - if follow_redirects: - plgen.request['redirects'] = '' - if isinstance(namespaces, list): - plgen.request["gplnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - plgen.request["gplnamespace"] = str(namespaces) - return plgen - - @deprecate_arg("withSortKey", None) # Sortkey doesn't work with generator - def pagecategories(self, page, withSortKey=None): - """Iterate categories to which page belongs.""" - - clgen = api.CategoryPageGenerator("categories", site=self) - if hasattr(page, "_pageid"): - clgen.request['pageids'] = str(page._pageid) - else: - cltitle = page.title(withSection=False).encode(self.encoding()) - clgen.request['titles'] = cltitle - return clgen - - def pageimages(self, page): - """Iterate images used (not just linked) on the page.""" - - imtitle = page.title(withSection=False).encode(self.encoding()) - imgen = api.ImagePageGenerator("images", titles=imtitle, site=self) - return imgen - - def pagetemplates(self, page, namespaces=None): - """Iterate templates transcluded (not just linked) on the page.""" - - tltitle = page.title(withSection=False).encode(self.encoding()) - tlgen = api.PageGenerator("templates", titles=tltitle, site=self) - if isinstance(namespaces, list): - tlgen.request["gtlnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - tlgen.request["gtlnamespace"] = str(namespaces) - return tlgen - - def categorymembers(self, category, namespaces=None, limit=None): - """Iterate members of specified category. - - @param category: The Category to iterate. - @param namespaces: If present, only return category members from - these namespaces. For example, use namespaces=[14] to yield - subcategories, use namespaces=[6] to yield image files, etc. Note, - however, that the iterated values are always Page objects, even - if in the Category or Image namespace. - @type namespaces: list of ints - @param limit: maximum number of pages to iterate (default: all) - @type limit: int - - """ - if category.namespace() != 14: - raise Error( - u"categorymembers: non-Category page '%s' specified" - % category.title()) - cmtitle = category.title(withSection=False).encode(self.encoding()) - cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle, - gcmprop="ids|title|sortkey", site=self) - if isinstance(namespaces, list): - cmgen.request["gcmnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - cmgen.request["gcmnamespace"] = str(namespaces) - if isinstance(limit, int): - cmgen.limit = limit - return cmgen - - def loadrevisions(self, page=None, getText=False, revids=None, - limit=None, startid=None, endid=None, starttime=None, - endtime=None, rvdir=None, user=None, excludeuser=None, - section=None, sysop=False): - """Retrieve and store revision information. - - By default, retrieves the last (current) revision of the page, - I{unless} any of the optional parameters revids, startid, endid, - starttime, endtime, rvdir, user, excludeuser, or limit are - specified. 
Unless noted below, all parameters not specified - default to False. - - If rvdir is False or not specified, startid must be greater than - endid if both are specified; likewise, starttime must be greater - than endtime. If rvdir is True, these relationships are reversed. - - @param page: retrieve revisions of this Page (required unless ids - is specified) - @param getText: if True, retrieve the wiki-text of each revision; - otherwise, only retrieve the revision metadata (default) - @param section: if specified, retrieve only this section of the text - (getText must be True); section must be given by number (top of - the article is section 0), not name - @type section: int - @param revids: retrieve only the specified revision ids (required - unless page is specified) - @type revids: list of ints - @param limit: Retrieve no more than this number of revisions - @type limit: int - @param startid: retrieve revisions starting with this revid - @param endid: stop upon retrieving this revid - @param starttime: retrieve revisions starting at this timestamp - @param endtime: stop upon reaching this timestamp - @param rvdir: if false, retrieve newest revisions first (default); - if true, retrieve earliest first - @param user: retrieve only revisions authored by this user - @param excludeuser: retrieve all revisions not authored by this user - @param sysop: if True, switch to sysop account (if available) to - retrieve this page - - """ - latest = (revids is None and - startid is None and - endid is None and - starttime is None and - endtime is None and - rvdir is None and - user is None and - excludeuser is None and - limit is None) # if True, we are retrieving current revision - - # check for invalid argument combinations - if page is None and revids is None: - raise ValueError( - "loadrevisions: either page or revids argument required") - if (startid is not None or endid is not None) and \ - (starttime is not None or endtime is not None): - raise ValueError( - "loadrevisions: startid/endid combined with starttime/endtime") - if starttime is not None and endtime is not None: - if rvdir and starttime >= endtime: - raise ValueError( - "loadrevisions: starttime > endtime with rvdir=True") - if (not rvdir) and endtime >= starttime: - raise ValueError( - "loadrevisions: endtime > starttime with rvdir=False") - if startid is not None and endid is not None: - if rvdir and startid >= endid: - raise ValueError( - "loadrevisions: startid > endid with rvdir=True") - if (not rvdir) and endid >= startid: - raise ValueError( - "loadrevisions: endid > startid with rvdir=False") - - # assemble API request - if revids is None: - rvtitle = page.title(withSection=False).encode(self.encoding()) - rvgen = api.PropertyGenerator(u"info|revisions", titles=rvtitle, - site=self) - else: - if isinstance(revids, (int, basestring)): - ids = unicode(revids) - else: - ids = u"|".join(unicode(r) for r in revids) - rvgen = api.PropertyGenerator(u"info|revisions", revids=ids, - site=self) - if getText: - rvgen.request[u"rvprop"] = \ - u"ids|flags|timestamp|user|comment|content" - if section is not None: - rvgen.request[u"rvsection"] = unicode(section) - if latest or "revids" in rvgen.request: - rvgen.limit = -1 # suppress use of rvlimit parameter - elif isinstance(limit, int): - rvgen.limit = limit - if rvdir: - rvgen.request[u"rvdir"] = u"newer" - elif rvdir is not None: - rvgen.request[u"rvdir"] = u"older" - if startid: - rvgen.request[u"rvstartid"] = startid - if endid: - rvgen.request[u"rvendid"] = endid - if starttime: - 
rvgen.request[u"rvstart"] = starttime - if endtime: - rvgen.request[u"rvend"] = endtime - if user: - rvgen.request[u"rvuser"] = user - elif excludeuser: - rvgen.request[u"rvexcludeuser"] = excludeuser - # TODO if sysop: something - rvgen.continuekey = "revisions" - for pagedata in rvgen: - if page is not None: - if pagedata['title'] != page.title(withSection=False): - raise Error( - u"loadrevisions: Query on %s returned data on '%s'" - % (page, pagedata['title'])) - if pagedata.has_key('missing'): - raise NoPage(u'Page %s does not exist' - % page.title(asLink=True)) - else: - page = Page(self, pagedata['title']) - api.update_page(page, pagedata) - - def pageinterwiki(self, page): - # No such function in the API (this method isn't called anywhere) - raise NotImplementedError - - def pagelanglinks(self, page): - """Iterate all interlanguage links on page, yielding Link objects.""" - lltitle = page.title(withSection=False) - llquery = api.PropertyGenerator("langlinks", - titles=lltitle.encode(self.encoding()), - site=self) - for pageitem in llquery: - if pageitem['title'] != lltitle: - raise Error( - u"getlanglinks: Query on %s returned data on '%s'" - % (page, pageitem['title'])) - if 'langlinks' not in pageitem: - continue - for linkdata in pageitem['langlinks']: - yield pywikibot.Link(linkdata['*'], - source=pywikibot.Site(linkdata['lang'])) - - def page_extlinks(self, page): - """Iterate all external links on page, yielding URL strings.""" - eltitle = page.title(withSection=False) - elquery = api.PropertyGenerator("extlinks", - titles=eltitle.encode(self.encoding()), - site=self) - for pageitem in elquery: - if pageitem['title'] != eltitle: - raise RuntimeError( - "getlanglinks: Query on %s returned data on '%s'" - % (page, pageitem['title'])) - if 'extlinks' not in pageitem: - continue - for linkdata in pageitem['extlinks']: - yield linkdata['*'] - - @deprecate_arg("throttle", None) - @deprecate_arg("includeredirects", "filterredir") - def allpages(self, start="!", prefix="", namespace=0, filterredir=None, - filterlanglinks=None, minsize=None, maxsize=None, - protect_type=None, protect_level=None, limit=None, - reverse=False, includeredirects=None): - """Iterate pages in a single namespace. - - Note: parameters includeRedirects and throttle are deprecated and - included only for backwards compatibility. - - @param start: Start at this title (page need not exist). - @param prefix: Only yield pages starting with this string. 
- @param namespace: Iterate pages from this (single) namespace - (default: 0) - @param filterredir: if True, only yield redirects; if False (and not - None), only yield non-redirects (default: yield both) - @param filterlanglinks: if True, only yield pages with language links; - if False (and not None), only yield pages without language links - (default: yield both) - @param minsize: if present, only yield pages at least this many - bytes in size - @param maxsize: if present, only yield pages at most this many bytes - in size - @param protect_type: only yield pages that have a protection of the - specified type - @type protect_type: str - @param protect_level: only yield pages that have protection at this - level; can only be used if protect_type is specified - @param limit: maximum number of pages to iterate (default: iterate - all pages in namespace) - @param reverse: if True, iterate in reverse Unicode lexigraphic - order (default: iterate in forward order) - @param includeredirects: DEPRECATED, use filterredirs instead - - """ - if not isinstance(namespace, int): - raise Error("allpages: only one namespace permitted.") - if includeredirects is not None: - logger.debug( -"allpages: 'includeRedirects' argument is deprecated; use 'filterredirs'.") - if includeredirects: - if includeredirects == "only": - filterredirs = True - else: - filterredirs = None - else: - filterredirs = False - - apgen = api.PageGenerator("allpages", gapnamespace=str(namespace), - gapfrom=start, site=self) - if prefix: - apgen.request["gapprefix"] = prefix - if filterredir is not None: - apgen.request["gapfilterredir"] = (filterredir - and "redirects" - or "nonredirects") - if filterlanglinks is not None: - apgen.request["gapfilterlanglinks"] = (filterlanglinks - and "withlanglinks" - or "withoutlanglinks") - if isinstance(minsize, int): - apgen.request["gapminsize"] = str(minsize) - if isinstance(maxsize, int): - apgen.request["gapmaxsize"] = str(maxsize) - if isinstance(protect_type, basestring): - apgen.request["gapprtype"] = protect_type - if isinstance(protect_level, basestring): - apgen.request["gapprlevel"] = protect_level - if isinstance(limit, int): - apgen.limit = limit - if reverse: - apgen.request["gapdir"] = "descending" - return apgen - - def prefixindex(self, prefix, namespace=0, includeredirects=True): - """Yield all pages with a given prefix. Deprecated. - - Use allpages() with the prefix= parameter instead of this method. - - """ - logger.debug("Site.prefixindex() is deprecated; use allpages instead.") - return self.allpages(prefix=prefix, namespace=namespace, - includeredirects=includeredirects) - - - def alllinks(self, start="!", prefix="", namespace=0, unique=False, - limit=None, fromids=False): - """Iterate all links to pages (which need not exist) in one namespace. - - Note that, in practice, links that were found on pages that have - been deleted may not have been removed from the links table, so this - method can return false positives. - - @param start: Start at this title (page need not exist). - @param prefix: Only yield pages starting with this string. 
- @param namespace: Iterate pages from this (single) namespace - (default: 0) - @param unique: If True, only iterate each link title once (default: - iterate once for each linking page) - @param limit: maximum number of pages to iterate (default: iterate - all pages in namespace) - @param fromids: if True, include the pageid of the page containing - each link (default: False) as the '_fromid' attribute of the Page; - cannot be combined with unique - - """ - if unique and fromids: - raise Error("alllinks: unique and fromids cannot both be True.") - if not isinstance(namespace, int): - raise Error("alllinks: only one namespace permitted.") - algen = api.ListGenerator("alllinks", alnamespace=str(namespace), - alfrom=start, site=self) - if prefix: - algen.request["alprefix"] = prefix - if isinstance(limit, int): - algen.limit = limit - if unique: - algen.request["alunique"] = "" - if fromids: - algen.request["alprop"] = "title|ids" - for link in algen: - p = pywikibot.Page(self, link['title'], link['ns']) - if fromids: - p._fromid = link['fromid'] - yield p - - def allcategories(self, start="!", prefix="", limit=None, - reverse=False): - """Iterate categories used (which need not have a Category page). - - Iterator yields Category objects. Note that, in practice, links that - were found on pages that have been deleted may not have been removed - from the database table, so this method can return false positives. - - @param start: Start at this category title (category need not exist). - @param prefix: Only yield categories starting with this string. - @param limit: maximum number of categories to iterate (default: - iterate all) - @param reverse: if True, iterate in reverse Unicode lexigraphic - order (default: iterate in forward order) - - """ - acgen = api.CategoryPageGenerator("allcategories", - gacfrom=start, site=self) - if prefix: - acgen.request["gacprefix"] = prefix - if isinstance(limit, int): - acgen.limit = limit - if reverse: - acgen.request["gacdir"] = "descending" - return acgen - - def categories(self, number=10, repeat=False): - """Deprecated; retained for backwards-compatibility""" - logger.debug( - "Site.categories() method is deprecated; use .allcategories()") - if repeat: - limit = None - else: - limit = number - return self.allcategories(limit=limit) - - def allusers(self, start="!", prefix="", limit=None, group=None): - """Iterate registered users, ordered by username. - - Iterated values are dicts containing 'name', 'editcount', - 'registration', and (sometimes) 'groups' keys. 'groups' will be - present only if the user is a member of at least 1 group, and will - be a list of unicodes; all the other values are unicodes and should - always be present. - - @param start: start at this username (name need not exist) - @param prefix: only iterate usernames starting with this substring - @param limit: maximum number of users to iterate (default: all) - @param group: only iterate users that are members of this group - @type group: str - - """ - augen = api.ListGenerator("allusers", aufrom=start, - auprop="editcount|groups|registration", - site=self) - if prefix: - augen.request["auprefix"] = prefix - if group: - augen.request["augroup"] = group - if isinstance(limit, int): - augen.limit = limit - return augen - - def allimages(self, start="!", prefix="", minsize=None, maxsize=None, - limit=None, reverse=False, sha1=None, sha1base36=None): - """Iterate all images, ordered by image title. - - Yields ImagePages, but these pages need not exist on the wiki. 
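As a point of reference, the listing generators above (allpages, alllinks, allusers) are all used the same way. The sketch below is not part of the change itself; it assumes a working user-config.py and that the pywikibot.Site() factory, defined elsewhere in the framework, accepts a language code and family name.

    import pywikibot

    # Assumed factory call; the Site() signature is not shown in this diff.
    site = pywikibot.Site("en", "wikipedia")

    # First ten non-redirect pages in the main namespace, starting at "A".
    for page in site.allpages(start="A", namespace=0, filterredir=False, limit=10):
        print page.title()

    # Link targets recorded by MediaWiki, whether or not the pages exist; with
    # fromids=True each Page carries the id of the linking page in _fromid.
    for link in site.alllinks(prefix=u"Foo", limit=5, fromids=True):
        print link.title(), link._fromid

    # allusers() yields dicts rather than Page objects.
    for user in site.allusers(group="bot", limit=5):
        print user["name"], user["editcount"]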
- - @param start: start at this title (name need not exist) - @param prefix: only iterate titles starting with this substring - @param limit: maximum number of titles to iterate (default: all) - @param minsize: only iterate images of at least this many bytes - @param maxsize: only iterate images of no more than this many bytes - @param reverse: if True, iterate in reverse lexigraphic order - @param sha1: only iterate image (it is theoretically possible there - could be more than one) with this sha1 hash - @param sha1base36: same as sha1 but in base 36 - - """ - aigen = api.ImagePageGenerator("allimages", gaifrom=start, - site=self) - if prefix: - aigen.request["gaiprefix"] = prefix - if isinstance(limit, int): - aigen.limit = limit - if isinstance(minsize, int): - aigen.request["gaiminsize"] = str(minsize) - if isinstance(maxsize, int): - aigen.request["gaimaxsize"] = str(maxsize) - if reverse: - aigen.request["gaidir"] = "descending" - if sha1: - aigen.request["gaisha1"] = sha1 - if sha1base36: - aigen.request["gaisha1base36"] = sha1base36 - return aigen - - def blocks(self, starttime=None, endtime=None, reverse=False, - blockids=None, users=None, limit=None): - """Iterate all current blocks, in order of creation. - - Note that logevents only logs user blocks, while this method - iterates all blocks including IP ranges. The iterator yields dicts - containing keys corresponding to the block properties (see - http://www.mediawiki.org/wiki/API:Query_-_Lists for documentation). - - @param starttime: start iterating at this timestamp - @param endtime: stop iterating at this timestamp - @param reverse: if True, iterate oldest blocks first (default: newest) - @param blockids: only iterate blocks with these id numbers - @param users: only iterate blocks affecting these usernames or IPs - @param limit: maximum number of blocks to iterate (default: all) - - """ - if starttime and endtime: - if reverse: - if starttime > endtime: - raise pywikibot.Error( - "blocks: starttime must be before endtime with reverse=True") - else: - if endtime > starttime: - raise pywikibot.Error( - "blocks: endtime must be before starttime with reverse=False") - bkgen = api.ListGenerator("blocks", site=self) - bkgen.request["bkprop"] = \ - "id|user|by|timestamp|expiry|reason|range|flags" - if starttime: - bkgen.request["bkstart"] = starttime - if endtime: - bkgen.request["bkend"] = endtime - if reverse: - bkgen.request["bkdir"] = "newer" - if blockids: - bkgen.request["bkids"] = blockids - if users: - bkgen.request["bkusers"] = users - if isinstance(limit, int): - bkgen.limit = limit - return bkgen - - def exturlusage(self, url, protocol="http", namespaces=None, - limit=None): - """Iterate Pages that contain links to the given URL. 
- - @param url: The URL to search for (without the protocol prefix); - this many include a '*' as a wildcard, only at the start of the - hostname - @param protocol: The protocol prefix (default: "http") - @param namespaces: Only iterate pages in these namespaces (default: all) - @type namespaces: list of ints - @param limit: Only iterate this many linking pages (default: all) - - """ - eugen = api.PageGenerator("exturlusage", geuquery=url, - geuprotocol=protocol, site=self) - if isinstance(namespaces, list): - eugen.request["geunamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - eugen.request["geunamespace"] = str(namespaces) - if isinstance(limit, int): - eugen.limit = limit - return eugen - - def imageusage(self, image, namespaces=None, filterredir=None, - limit=None): - """Iterate Pages that contain links to the given ImagePage. - - @param image: the image to search for (ImagePage need not exist on the wiki) - @type image: ImagePage - @param namespaces: Only iterate pages in these namespaces (default: all) - @type namespaces: list of ints - @param filterredir: if True, only yield redirects; if False (and not - None), only yield non-redirects (default: yield both) - @param limit: Only iterate this many linking pages (default: all) - - """ - iugen = api.PageGenerator("imageusage", site=self, - giutitle=image.title(withSection=False)) - if isinstance(namespaces, list): - iugen.request["giunamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - iugen.request["giunamespace"] = str(namespaces) - if isinstance(limit, int): - iugen.limit = limit - if filterredir is not None: - iugen.request["giufilterredir"] = (filterredir and "redirects" - or "nonredirects") - return iugen - - def logevents(self, logtype=None, user=None, page=None, - start=None, end=None, reverse=False, limit=None): - """Iterate all log entries. - - @param logtype: only iterate entries of this type (see wiki - documentation for available types, which will include "block", - "protect", "rights", "delete", "upload", "move", "import", - "patrol", "merge") - @param user: only iterate entries that match this user name - @param page: only iterate entries affecting this page - @param start: only iterate entries from and after this timestamp - @param end: only iterate entries up to and through this timestamp - @param reverse: if True, iterate oldest entries first (default: newest) - @param limit: only iterate up to this many entries - - """ - if start and end: - if reverse: - if end < start: - raise Error( - "logevents: end must be later than start with reverse=True") - else: - if start < end: - raise Error( - "logevents: start must be later than end with reverse=False") - legen = api.ListGenerator("logevents", site=self) - if logtype is not None: - legen.request["letype"] = logtype - if user is not None: - legen.request["leuser"] = user - if page is not None: - legen.request["letitle"] = page.title(withSection=False) - if start is not None: - legen.request["lestart"] = start - if end is not None: - legen.request["leend"] = end - if reverse: - legen.request["ledir"] = "newer" - if isinstance(limit, int): - legen.limit = limit - return legen - - def recentchanges(self, start=None, end=None, reverse=False, limit=None, - namespaces=None, pagelist=None, changetype=None, - showMinor=None, showBot=None, showAnon=None, - showRedirects=None, showPatrolled=None): - """Iterate recent changes. 
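imageusage() and logevents() above follow the same generator pattern; a short sketch, reusing the assumed site object from the earlier example:

    # Take one ImagePage from allimages() and list the articles that use it.
    for image in site.allimages(limit=1):
        for page in site.imageusage(image, namespaces=[0], filterredir=False,
                                    limit=10):
            print page.title()

    # Recent deletions by a (hypothetical) admin account; logevents() is a
    # ListGenerator, so it yields raw log entries rather than Page objects.
    for entry in site.logevents(logtype="delete", user=u"ExampleAdmin", limit=20):
        print entry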
- - @param start: timestamp to start listing from - @param end: timestamp to end listing at - @param reverse: if True, start with oldest changes (default: newest) - @param limit: iterate no more than this number of entries - @param namespaces: iterate changes to pages in these namespaces only - @type namespaces: list of ints - @param pagelist: iterate changes to pages in this list only - @param pagelist: list of Pages - @param changetype: only iterate changes of this type ("edit" for - edits to existing pages, "new" for new pages, "log" for log - entries) - @param showMinor: if True, only list minor edits; if False (and not - None), only list non-minor edits - @param showBot: if True, only list bot edits; if False (and not - None), only list non-bot edits - @param showAnon: if True, only list anon edits; if False (and not - None), only list non-anon edits - @param showRedirects: if True, only list edits to redirect pages; if - False (and not None), only list edits to non-redirect pages - @param showPatrolled: if True, only list patrolled edits; if False - (and not None), only list non-patrolled edits - - """ - if start and end: - if reverse: - if end < start: - raise Error( - "recentchanges: end must be later than start with reverse=True") - else: - if start < end: - raise Error( - "recentchanges: start must be later than end with reverse=False") - rcgen = api.ListGenerator("recentchanges", site=self, - rcprop="user|comment|timestamp|title|ids" - "|redirect|patrolled|loginfo|flags") - if start is not None: - rcgen.request["rcstart"] = start - if end is not None: - rcgen.request["rcend"] = end - if reverse: - rcgen.request["rcdir"] = "newer" - if isinstance(limit, int): - rcgen.limit = limit - if isinstance(namespaces, list): - rcgen.request["rcnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - rcgen.request["rcnamespace"] = str(namespaces) - if pagelist: - rcgen.request["rctitles"] = u"|".join(p.title(withSection=False) - for p in pagelist) - if changetype: - rcgen.request["rctype"] = changetype - filters = {'minor': showMinor, - 'bot': showBot, - 'anon': showAnon, - 'redirect': showRedirects, - 'patrolled': showPatrolled} - rcshow = [] - for item in filters: - if filters[item] is not None: - rcshow.append(filters[item] and item or ("!"+item)) - if rcshow: - rcgen.request["rcshow"] = "|".join(rcshow) - return rcgen - - @deprecate_arg("number", "limit") - def search(self, searchstring, namespaces=None, where="text", - getredirects=False, limit=None): - """Iterate Pages that contain the searchstring. - - Note that this may include non-existing Pages if the wiki's database - table contains outdated entries. 
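The showMinor, showBot, showAnon, showRedirects and showPatrolled arguments above are tri-state: True selects only entries with the property, False selects only entries without it, and None (the default) applies no filter. For example, with the same assumed site object:

    # Anonymous, non-bot edits to existing articles; each iterated value is a
    # dict of the requested rc properties (title, user, comment, timestamp, ...).
    for change in site.recentchanges(namespaces=[0], changetype="edit",
                                     showBot=False, showAnon=True, limit=50):
        print change["title"], change["user"]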
- - @param searchstring: the text to search for - @type searchstring: unicode - @param where: Where to search; value must be "text" or "titles" (many - wikis do not support title search) - @param namespaces: search only in these namespaces (defaults to 0) - @type namespaces: list of ints - @param getredirects: if True, include redirects in results - @param limit: maximum number of results to iterate - - """ - if not searchstring: - raise Error("search: searchstring cannot be empty") - if where not in ("text", "titles"): - raise Error("search: unrecognized 'where' value: %s" % where) - srgen = api.PageGenerator("search", gsrsearch=searchstring, - gsrwhat=where, site=self) - if not namespaces: - logger.warning("search: namespaces cannot be empty; using [0].") - namespaces = [0] - if isinstance(namespaces, list): - srgen.request["gsrnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - else: - srgen.request["gsrnamespace"] = str(namespaces) - if getredirects: - srgen.request["gsrredirects"] = "" - if isinstance(limit, int): - srgen.limit = limit - return srgen - - def usercontribs(self, user=None, userprefix=None, start=None, end=None, - reverse=False, limit=None, namespaces=None, - showMinor=None): - """Iterate contributions by a particular user. - - Iterated values are in the same format as recentchanges. - - @param user: Iterate contributions by this user (name or IP) - @param userprefix: Iterate contributions by all users whose names - or IPs start with this substring - @param start: Iterate contributions starting at this timestamp - @param end: Iterate contributions ending at this timestamp - @param reverse: Iterate oldest contributions first (default: newest) - @param limit: Maximum number of contributions to iterate - @param namespaces: Only iterate contributions in these namespaces - @type namespaces: list of ints - @param showMinor: if True, iterate only minor edits; if False and - not None, iterate only non-minor edits (default: iterate both) - - """ - if not (user or userprefix): - raise Error( - "usercontribs: either user or userprefix must be non-empty") - if start and end: - if reverse: - if end < start: - raise Error( - "usercontribs: end must be later than start with reverse=True") - else: - if start < end: - raise Error( - "usercontribs: start must be later than end with reverse=False") - ucgen = api.ListGenerator("usercontribs", site=self, - ucprop="ids|title|timestamp|comment|flags") - if user: - ucgen.request["ucuser"] = user - if userprefix: - ucgen.request["ucuserprefix"] = userprefix - if start is not None: - ucgen.request["ucstart"] = start - if end is not None: - ucgen.request["ucend"] = end - if reverse: - ucgen.request["ucdir"] = "newer" - if isinstance(limit, int): - ucgen.limit = limit - if isinstance(namespaces, list): - ucgen.request["ucnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - ucgen.request["ucnamespace"] = str(namespaces) - if showMinor is not None: - ucgen.request["ucshow"] = showMinor and "minor" or "!minor" - return ucgen - - def watchlist_revs(self, start=None, end=None, reverse=False, - namespaces=None, showMinor=None, showBot=None, - showAnon=None, limit=None): - """Iterate revisions to pages on the bot user's watchlist. - - Iterated values will be in same format as recentchanges. 
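Note the ordering convention enforced by usercontribs() above (and by the other timestamp-bounded iterators): with reverse=False, the default newest-first order, start must be the later timestamp and end the earlier one; with reverse=True the relationship is inverted. A sketch, assuming ISO 8601 timestamp strings that are passed through to the API unchanged:

    # December 2008 main-namespace contributions of a hypothetical account,
    # oldest first, so start is the earlier timestamp and end the later one.
    for contrib in site.usercontribs(user=u"ExampleBot",
                                     start=u"2008-12-01T00:00:00Z",
                                     end=u"2008-12-16T00:00:00Z",
                                     reverse=True, namespaces=[0], limit=100):
        print contrib["title"], contrib["timestamp"]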
- - @param start: Iterate revisions starting at this timestamp - @param end: Iterate revisions ending at this timestamp - @param reverse: Iterate oldest revisions first (default: newest) - @param namespaces: only iterate revisions to pages in these - namespaces (default: all) - @type namespaces: list of ints - @param showMinor: if True, only list minor edits; if False (and not - None), only list non-minor edits - @param showBot: if True, only list bot edits; if False (and not - None), only list non-bot edits - @param showAnon: if True, only list anon edits; if False (and not - None), only list non-anon edits - @param limit: Maximum number of revisions to iterate - - """ - if start and end: - if reverse: - if end < start: - raise Error( - "watchlist_revs: end must be later than start with reverse=True") - else: - if start < end: - raise Error( - "watchlist_revs: start must be later than end with reverse=False") - wlgen = api.ListGenerator("watchlist", wlallrev="", site=self, - wlprop="user|comment|timestamp|title|ids|flags") - #TODO: allow users to ask for "patrol" as well? - if start is not None: - wlgen.request["wlstart"] = start - if end is not None: - wlgen.request["wlend"] = end - if reverse: - wlgen.request["wldir"] = "newer" - if isinstance(limit, int): - wlgen.limit = limit - if isinstance(namespaces, list): - wlgen.request["wlnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - wlgen.request["wlnamespace"] = str(namespaces) - filters = {'minor': showMinor, - 'bot': showBot, - 'anon': showAnon} - wlshow = [] - for item in filters: - if filters[item] is not None: - wlshow.append(filters[item] and item or ("!"+item)) - if wlshow: - wlgen.request["wlshow"] = "|".join(wlshow) - return wlgen - - def deletedrevs(self, page, start=None, end=None, reverse=None, limit=None, - get_text=False): - """Iterate deleted revisions. - - Each value returned by the iterator will be a dict containing the - 'title' and 'ns' keys for a particular Page and a 'revisions' key - whose value is a list of revisions in the same format as - recentchanges (plus a 'content' element if requested). If get_text - is true, the toplevel dict will contain a 'token' key as well. - - @param page: The page to check for deleted revisions - @param start: Iterate revisions starting at this timestamp - @param end: Iterate revisions ending at this timestamp - @param reverse: Iterate oldest revisions first (default: newest) - @param limit: Iterate no more than this number of revisions. - @param get_text: If True, retrieve the content of each revision and - an undelete token - - """ - if start and end: - if reverse: - if end < start: - raise Error( -"deletedrevs: end must be later than start with reverse=True") - else: - if start < end: - raise Error( -"deletedrevs: start must be later than end with reverse=False") - if not self.logged_in(): - self.login() - if "deletedhistory" not in self.userinfo['rights']: - try: - self.login(True) - except NoUsername: - pass - if "deletedhistory" not in self.userinfo['rights']: - raise Error( -"deletedrevs: User:%s not authorized to access deleted revisions." - % self.user()) - if get_text: - if "undelete" not in self.userinfo['rights']: - try: - self.login(True) - except NoUsername: - pass - if "undelete" not in self.userinfo['rights']: - raise Error( -"deletedrevs: User:%s not authorized to view deleted content." 
- % self.user()) - - drgen = api.ListGenerator("deletedrevs", site=self, - titles=page.title(withSection=False), - drprop="revid|user|comment|minor") - if get_text: - drgen.request['drprop'] = drgen.request['drprop'] + "|content|token" - if start is not None: - drgen.request["drstart"] = start - if end is not None: - drgen.request["drend"] = end - if reverse: - drgen.request["drdir"] = "newer" - if isinstance(limit, int): - drgen.limit = limit - return drgen - - def users(self, usernames): - """Iterate info about a list of users by name or IP. - - @param usernames: a list of user names - @type usernames: list, or other iterable, of unicodes - - """ - if not isinstance(usernames, basestring): - usernames = u"|".join(usernames) - usgen = api.ListGenerator("users", ususers=usernames, site=self, - usprop="blockinfo|groups|editcount|registration") - return usgen - - def randompages(self, limit=1, namespaces=None, redirects=False): - """Iterate a number of random pages. - - Pages are listed in a fixed sequence, only the starting point is - random. - - @param limit: the maximum number of pages to iterate (default: 1) - @param namespaces: only iterate pages in these namespaces. - @param redirects: if True, include only redirect pages in results - (default: include only non-redirects) - - """ - rngen = api.PageGenerator("random", site=self) - rngen.limit = limit - if isinstance(namespaces, list): - rngen.request["grnnamespace"] = u"|".join(unicode(ns) - for ns in namespaces) - elif namespaces is not None: - rngen.request["grnnamespace"] = str(namespaces) - if redirects: - rngen.request["grnredirect"] = "" - return rngen - - # catalog of editpage error codes, for use in generating messages - _ep_errors = { - "noapiwrite": "API editing not enabled on %(site)s wiki", - "writeapidenied": -"User %(user)s is not authorized to edit on %(site)s wiki", - "protectedtitle": -"Title %(title)s is protected against creation on %(site)s", - "cantcreate": -"User %(user)s not authorized to create new pages on %(site)s wiki", - "cantcreate-anon": -"""Bot is not logged in, and anon users are not authorized to create new pages -on %(site)s wiki""", - "articleexists": "Page %(title)s already exists on %(site)s wiki", - "noimageredirect-anon": -"""Bot is not logged in, and anon users are not authorized to create image -redirects on %(site)s wiki""", - "noimageredirect": -"User %(user)s not authorized to create image redirects on %(site)s wiki", - "spamdetected": -"Edit to page %(title)s rejected by spam filter due to content:\n", - "filtered": "%(info)s", - "contenttoobig": "%(info)s", - "noedit-anon": -"""Bot is not logged in, and anon users are not authorized to edit on -%(site)s wiki""", - "noedit": "User %(user)s not authorized to edit pages on %(site)s wiki", - "pagedeleted": -"Page %(title)s has been deleted since last retrieved from %(site)s wiki", - "editconflict": "Page %(title)s not saved due to edit conflict.", - } - - def editpage(self, page, summary, minor=True, notminor=False, - recreate=True, createonly=False, watch=False, unwatch=False): - """Submit an edited Page object to be saved to the wiki. - - @param page: The Page to be saved; its .text property will be used - as the new text to be saved to the wiki - @param token: the edit token retrieved using Site.token() - @param summary: the edit summary (required!) 
- @param minor: if True (default), mark edit as minor - @param notminor: if True, override account preferences to mark edit - as non-minor - @param recreate: if True (default), create new page even if this - title has previously been deleted - @param createonly: if True, raise an error if this title already - exists on the wiki - @param watch: if True, add this Page to bot's watchlist - @param unwatch: if True, remove this Page from bot's watchlist if - possible - @return: True if edit succeeded, False if it failed - - """ - text = page.text - if not text: - raise Error("editpage: no text to be saved") - try: - lastrev = page.latestRevision() - except NoPage: - lastrev = None - if not recreate: - raise Error("Page %s does not exist on %s wiki." - % (page.title(withSection=False), self)) - token = self.token(page, "edit") - self.lock_page(page) - if lastrev is not None and page.latestRevision() != lastrev: - raise Error("editpage: Edit conflict detected; saving aborted.") - req = api.Request(site=self, action="edit", - title=page.title(withSection=False), - text=text, token=token, summary=summary) -## if lastrev is not None: -## req["basetimestamp"] = page._revisions[lastrev].timestamp - if minor: - req['minor'] = "" - elif notminor: - req['notminor'] = "" - if 'bot' in self.userinfo['groups']: - req['bot'] = "" - if recreate: - req['recreate'] = "" - if createonly: - req['createonly'] = "" - if watch: - req['watch'] = "" - elif unwatch: - req['unwatch'] = "" -## FIXME: API gives 'badmd5' error -## md5hash = md5() -## md5hash.update(urllib.quote_plus(text.encode(self.encoding()))) -## req['md5'] = md5hash.digest() - while True: - try: - result = req.submit() - logger.debug("editpage response: %s" % result) - except api.APIError, err: - self.unlock_page(page) - if err.code.endswith("anon") and self.logged_in(): - logger.debug( -"editpage: received '%s' even though bot is logged in" % err.code) - errdata = { - 'site': self, - 'title': page.title(withSection=False), - 'user': self.user(), - 'info': err.info - } - if err.code == "spamdetected": - raise SpamfilterError(self._ep_errors[err.code] % errdata - + err.info[ err.info.index("fragment: ") + 9: ]) - - if err.code == "editconflict": - raise EditConflict(self._ep_errors[err.code] % errdata) - if err.code in self._ep_errors: - raise Error(self._ep_errors[err.code] % errdata) - logger.debug("editpage: Unexpected error code '%s' received." - % err.code) - raise - assert ("edit" in result and "result" in result["edit"]), result - if result["edit"]["result"] == "Success": - self.unlock_page(page) - if "nochange" in result["edit"]: - # null edit, page not changed - # TODO: do we want to notify the user of this? 
- return True - page._revid = result["edit"]["newrevid"] - # see http://www.mediawiki.org/wiki/API:Wikimania_2006_API_discussion#Notes - # not safe to assume that saved text is the same as sent - self.loadrevisions(page, getText=True) - return True - elif result["edit"]["result"] == "Failure": - if "captcha" in result["edit"]: - captcha = result["edit"]["captcha"] - req['captchaid'] = captcha['id'] - if captcha["type"] == "math": - req['captchaword'] = input(captcha["question"]) - continue - elif "url" in captcha: - webbrowser.open(url) - req['captchaword'] = cap_answerwikipedia.input( -"Please view CAPTCHA in your browser, then type answer here:") - continue - else: - self.unlock_page(page) - logger.error( -"editpage: unknown CAPTCHA response %s, page not saved" - % captcha) - return False - else: - self.unlock_page(page) - logger.error("editpage: unknown failure reason %s" - % str(result)) - return False - else: - self.unlock_page(page) - logger.error( -"editpage: Unknown result code '%s' received; page not saved" - % result["edit"]["result"]) - logger.error(str(result)) - return False - - # catalog of move errors for use in error messages - _mv_errors = { - "noapiwrite": "API editing not enabled on %(site)s wiki", - "writeapidenied": -"User %(user)s is not authorized to edit on %(site)s wiki", - "nosuppress": -"User %(user)s is not authorized to move pages without creating redirects", - "cantmove-anon": -"""Bot is not logged in, and anon users are not authorized to move pages on -%(site)s wiki""", - "cantmove": -"User %(user)s is not authorized to move pages on %(site)s wiki", - "immobilenamespace": -"Pages in %(oldnamespace)s namespace cannot be moved on %(site)s wiki", - "articleexists": -"Cannot move because page [[%(newtitle)s]] already exists on %(site)s wiki", - "protectedpage": -"Page [[%(oldtitle)s]] is protected against moving on %(site)s wiki", - "protectedtitle": -"Page [[%(newtitle)s]] is protected against creation on %(site)s wiki", - "nonfilenamespace": -"Cannot move a file to %(newnamespace)s namespace on %(site)s wiki", - "filetypemismatch": -"[[%(newtitle)s]] file extension does not match content of [[%(oldtitle)s]]" - } - - def movepage(self, page, newtitle, summary, movetalk=True, - noredirect=False): - """Move a Page to a new title. - - @param page: the Page to be moved (must exist) - @param newtitle: the new title for the Page - @type newtitle: unicode - @param summary: edit summary (required!) - @param movetalk: if True (default), also move the talk page if possible - @param noredirect: if True, suppress creation of a redirect from the - old title to the new one - @return: Page object with the new title - - """ - oldtitle = page.title(withSection=False) - newlink = pywikibot.Link(newtitle, self) - if newlink.namespace: - newtitle = self.namespace(newlink.namespace) + ":" + newlink.title - else: - newtitle = newlink.title - if oldtitle == newtitle: - raise Error("Cannot move page %s to its own title." - % oldtitle) - if not page.exists(): - raise Error("Cannot move page %s because it does not exist on %s." 
- % (oldtitle, self)) - token = self.token(page, "move") - self.lock_page(page) - req = api.Request(site=self, action="move", to=newtitle, - token=token, reason=summary) - req['from'] = oldtitle # "from" is a python keyword - if movetalk: - req['movetalk'] = "" - if noredirect: - req['noredirect'] = "" - try: - result = req.submit() - logger.debug("movepage response: %s" % result) - except api.APIError, err: - if err.code.endswith("anon") and self.logged_in(): - logger.debug( -"movepage: received '%s' even though bot is logged in" % err.code) - errdata = { - 'site': self, - 'oldtitle': oldtitle, - 'oldnamespace': self.namespace(page.namespace()), - 'newtitle': newtitle, - 'newnamespace': self.namespace(newlink.namespace), - 'user': self.user(), - } - if err.code in self._mv_errors: - raise Error(self._mv_errors[err.code] % errdata) - logger.debug("movepage: Unexpected error code '%s' received." - % err.code) - raise - finally: - self.unlock_page(page) - if "move" not in result: - logger.error("movepage: %s" % result) - raise Error("movepage: unexpected response") - # TODO: Check for talkmove-error messages - if "talkmove-error-code" in result["move"]: - logger.warning(u"movepage: Talk page %s not moved" - % (page.toggleTalkPage().title(asLink=True))) - return pywikibot.Page(page, newtitle) - - # catalog of rollback errors for use in error messages - _rb_errors = { - "noapiwrite": - "API editing not enabled on %(site)s wiki", - "writeapidenied": - "User %(user)s not allowed to edit through the API", - "alreadyrolled": - "Page [[%(title)s]] already rolled back; action aborted.", - } # other errors shouldn't arise because we check for those errors - - def rollbackpage(self, page, summary=u''): - """Roll back page to version before last user's edits. - - As a precaution against errors, this method will fail unless - the page history contains at least two revisions, and at least - one that is not by the same user who made the last edit. - - @param page: the Page to be rolled back (must exist) - @param summary: edit summary (defaults to a standardized message) - - """ - if len(page._revisions) < 2: - raise pywikibot.Error( - u"Rollback of %s aborted; load revision history first." - % page.title(asLink=True)) - last_rev = page._revisions[page.latestRevision()] - last_user = last_rev.user - for rev in sorted(page._revisions.keys(), reverse=True): - # start with most recent revision first - if rev.user != last_user: - prev_user = rev.user - break - else: - raise pywikibot.Error( - u"Rollback of %s aborted; only one user in revision history." - % page.title(asLink=True)) - summary = summary or ( -u"Reverted edits by [[Special:Contributions/%(last_user)s|%(last_user)s]] " -u"([[User talk:%(last_user)s|Talk]]) to last version by %(prev_user)s" - % locals()) - token = self.token(page, "rollback") - self.lock_page(page) - req = api.Request(site=self, action="rollback", - title=page.title(withSection=False), - user=last_user, - token=token) - try: - result = req.submit() - except api.APIError, err: - errdata = { - 'site': self, - 'title': page.title(withSection=False), - 'user': self.user(), - } - if err.code in self._rb_errors: - raise Error(self._rb_errors[err.code] % errdata) - logger.debug("rollback: Unexpected error code '%s' received." 
- % err.code) - raise - finally: - self.unlock_page(page) - - # catalog of delete errors for use in error messages - _dl_errors = { - "noapiwrite": - "API editing not enabled on %(site)s wiki", - "writeapidenied": - "User %(user)s not allowed to edit through the API", - "permissiondenied": - "User %(user)s not authorized to delete pages on %(site)s wiki.", - "cantdelete": - "Could not delete [[%(title)s]]. Maybe it was deleted already.", - } # other errors shouldn't occur because of pre-submission checks - - def deletepage(self, page, summary): - """Delete page from the wiki. Requires appropriate privilege level. - - @param page: Page to be deleted. - @param summary: Edit summary (required!). - - """ - try: - self.login(sysop=True) - except pywikibot.Error, e: - raise Error("delete: Unable to login as sysop (%s)" - % e.__class__.__name__) - if not self.logged_in(sysop=True): - raise Error("delete: Unable to login as sysop") - token = self.token("delete") - req = api.Request(site=self, action="delete", token=token, - title=page.title(withSection=False), - reason=summary) - try: - result = req.submit() - except api.APIError, err: - errdata = { - 'site': self, - 'title': page.title(withSection=False), - 'user': self.user(), - } - if err.code in self._dl_errors: - raise Error(self._dl_errors[err.code] % errdata) - logger.debug("delete: Unexpected error code '%s' received." - % err.code) - raise - finally: - self.unlock_page(page) - - # TODO: implement undelete - - # TODO: implement patrol - - def linksearch(self, siteurl, limit=500): - """Backwards-compatible interface to exturlusage()""" - return self.exturlusage(siteurl, limit=limit) - - @deprecate_arg("repeat", None) - def newimages(self, number=100, lestart=None, leend=None, leuser=None, - letitle=None): - """Yield ImagePages from most recent uploads""" - return self.logevents(logtype="upload", limit=number, start=lestart, - end=leend, user=leuser, title=letitle) - - def getImagesFromAnHash(self, hash_found=None): - """Return all images that have the same hash. - - Useful to find duplicates or nowcommons. - - NOTE: it returns also the image itself, if you don't want it, just - filter the list returned. - - NOTE 2: it returns the image title WITHOUT the image namespace. - - """ - if hash_found == None: # If the hash is none return None and not continue - return None - return [image.title(withNamespace=False) - for image in self.allimages(sha1=hash_found)] - - -#### METHODS NOT IMPLEMENTED YET #### -class NotImplementedYet: - - # TODO: is this needed any more? can it be obtained from the http module? - def cookies(self, sysop = False): - """Return a string containing the user's current cookies.""" - self._loadCookies(sysop = sysop) - index = self._userIndex(sysop) - return self._cookies[index] - - def _loadCookies(self, sysop = False): - """Retrieve session cookies for login""" - index = self._userIndex(sysop) - if self._cookies[index] is not None: - return - try: - if sysop: - try: - username = config.sysopnames[self.family.name - ][self.code] - except KeyError: - raise NoUsername("""\ -You tried to perform an action that requires admin privileges, but you haven't -entered your sysop name in your user-config.py. 
Please add -sysopnames['%s']['%s']='name' to your user-config.py""" - % (self.family.name, self.code)) - else: - username = pywikiobt.config2.usernames[self.family.name - ][self.code] - except KeyError: - self._cookies[index] = None - self._isLoggedIn[index] = False - else: - tmp = '%s-%s-%s-login.data' % ( - self.family.name, self.code, username) - fn = config.datafilepath('login-data', tmp) - if not os.path.exists(fn): - self._cookies[index] = None - self._isLoggedIn[index] = False - else: - f = open(fn) - self._cookies[index] = '; '.join([x.strip() for x in f.readlines()]) - f.close() - - # THESE ARE FUNCTIONS NOT YET IMPLEMENTED IN THE API - # TODO: avoid code duplication for the following methods - def newpages(self, number = 10, get_redirect = False, repeat = False): - """Yield new articles (as Page objects) from Special:Newpages. - - Starts with the newest article and fetches the number of articles - specified in the first argument. If repeat is True, it fetches - Newpages again. If there is no new page, it blocks until there is - one, sleeping between subsequent fetches of Newpages. - - The objects yielded are tuples composed of the Page object, - timestamp (unicode), length (int), an empty unicode string, username - or IP address (str), comment (unicode). - - """ - # TODO: in recent MW versions Special:Newpages takes a namespace parameter, - # and defaults to 0 if not specified. - # TODO: Detection of unregistered users is broken - # TODO: Repeat mechanism doesn't make much sense as implemented; - # should use both offset and limit parameters, and have an - # option to fetch older rather than newer pages - seen = set() - while True: - path = self.newpages_address(n=number) - # The throttling is important here, so always enabled. - get_throttle() - html = self.getUrl(path) - - entryR = re.compile( -'<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"' -' title="(?P<title>.+?)">.+?</a>.+?[([](?P<length>[\d,.]+)[^)]]*[)]]' -' .?<a href=".+?" title=".+?:(?P<username>.+?)">' - ) - for m in entryR.finditer(html): - date = m.group('date') - title = m.group('title') - title = title.replace('"', '"') - length = int(re.sub("[,.]", "", m.group('length'))) - loggedIn = u'' - username = m.group('username') - comment = u'' - - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page, date, length, loggedIn, username, comment - if not repeat: - break - - def longpages(self, number = 10, repeat = False): - """Yield Pages from Special:Longpages. - - Return values are a tuple of Page object, length(int). - - """ - #TODO: should use offset and limit parameters; 'repeat' as now - # implemented is fairly useless - # this comment applies to all the XXXXpages methods following, as well - seen = set() - while True: - path = self.longpages_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">hist</a>) <a href=".+?" title="(?P<title>.+?)">.+?</a> [(?P<length>\d+)(.+?)]</li>') - for m in entryR.finditer(html): - title = m.group('title') - length = int(m.group('length')) - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page, length - if not repeat: - break - - def shortpages(self, number = 10, repeat = False): - """Yield Pages and lengths from Special:Shortpages.""" - throttle = True - seen = set() - while True: - path = self.shortpages_address(n = number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">hist</a>) <a href=".+?" 
title="(?P<title>.+?)">.+?</a> [(?P<length>\d+)(.+?)]</li>') - for m in entryR.finditer(html): - title = m.group('title') - length = int(m.group('length')) - - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page, length - if not repeat: - break - - def deadendpages(self, number = 10, repeat = False): - """Yield Page objects retrieved from Special:Deadendpages.""" - seen = set() - while True: - path = self.deadendpages_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile( - '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') - for m in entryR.finditer(html): - title = m.group('title') - - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page - if not repeat: - break - - def ancientpages(self, number = 10, repeat = False): - """Yield Pages, datestamps from Special:Ancientpages.""" - seen = set() - while True: - path = self.ancientpages_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile( -'<li><a href=".+?" title="(?P<title>.+?)">.+?</a> (?P<date>.+?)</li>') - for m in entryR.finditer(html): - title = m.group('title') - date = m.group('date') - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page, date - if not repeat: - break - - def lonelypages(self, number = 10, repeat = False): - """Yield Pages retrieved from Special:Lonelypages.""" - throttle = True - seen = set() - while True: - path = self.lonelypages_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile( - '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') - for m in entryR.finditer(html): - title = m.group('title') - - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page - if not repeat: - break - - def unwatchedpages(self, number = 10, repeat = False): - """Yield Pages from Special:Unwatchedpages (requires Admin privileges).""" - seen = set() - while True: - path = self.unwatchedpages_address(n=number) - get_throttle() - html = self.getUrl(path, sysop = True) - entryR = re.compile( - '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.+?</li>') - for m in entryR.finditer(html): - title = m.group('title') - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page - if not repeat: - break - - def uncategorizedcategories(self, number = 10, repeat = False): - """Yield Categories from Special:Uncategorizedcategories.""" - import catlib - seen = set() - while True: - path = self.uncategorizedcategories_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile( - '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') - for m in entryR.finditer(html): - title = m.group('title') - if title not in seen: - seen.add(title) - page = catlib.Category(self, title) - yield page - if not repeat: - break - - def newimages(self, number = 10, repeat = False): - """Yield ImagePages from Special:Log&type=upload""" - - seen = set() - regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+(.+?</a>).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?', re.UNICODE) - - while True: - path = self.log_address(number, mode = 'upload') - get_throttle() - html = self.getUrl(path) - - for m in regexp.finditer(html): - image = m.group('image') - - if image not in seen: - seen.add(image) - - if m.group('new'): - output(u"Image '%s' has been deleted." 
% image) - continue - - date = m.group('date') - user = m.group('user') - comment = m.group('comment') or '' - - yield ImagePage(self, image), date, user, comment - if not repeat: - break - - def uncategorizedimages(self, number = 10, repeat = False): - """Yield ImagePages from Special:Uncategorizedimages.""" - seen = set() - ns = self.image_namespace() - entryR = re.compile( - '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns) - while True: - path = self.uncategorizedimages_address(n=number) - get_throttle() - html = self.getUrl(path) - for m in entryR.finditer(html): - title = m.group('title') - if title not in seen: - seen.add(title) - page = ImagePage(self, title) - yield page - if not repeat: - break - - def uncategorizedpages(self, number = 10, repeat = False): - """Yield Pages from Special:Uncategorizedpages.""" - seen = set() - while True: - path = self.uncategorizedpages_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile( - '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') - for m in entryR.finditer(html): - title = m.group('title') - - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page - if not repeat: - break - - def unusedcategories(self, number = 10, repeat = False): - """Yield Category objects from Special:Unusedcategories.""" - import catlib - seen = set() - while True: - path = self.unusedcategories_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') - for m in entryR.finditer(html): - title = m.group('title') - - if title not in seen: - seen.add(title) - page = catlib.Category(self, title) - yield page - if not repeat: - break - - def unusedfiles(self, number = 10, repeat = False, extension = None): - """Yield ImagePage objects from Special:Unusedimages.""" - seen = set() - ns = self.image_namespace() - entryR = re.compile( - '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns) - while True: - path = self.unusedfiles_address(n=number) - get_throttle() - html = self.getUrl(path) - for m in entryR.finditer(html): - fileext = None - title = m.group('title') - if extension: - fileext = title[len(title)-3:] - if title not in seen and fileext == extension: - ## Check whether the media is used in a Proofread page - # code disabled because it slows this method down, and - # because it is unclear what it's supposed to do. - #basename = title[6:] - #page = Page(self, 'Page:' + basename) - - #if not page.exists(): - seen.add(title) - image = ImagePage(self, title) - yield image - if not repeat: - break - - def withoutinterwiki(self, number=10, repeat=False): - """Yield Pages without language links from Special:Withoutinterwiki.""" - seen = set() - while True: - path = self.withoutinterwiki_address(n=number) - get_throttle() - html = self.getUrl(path) - entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') - for m in entryR.finditer(html): - title = m.group('title') - if title not in seen: - seen.add(title) - page = Page(self, title) - yield page - if not repeat: - break - - def linksearch(self, siteurl): - """Yield Pages from results of Special:Linksearch for 'siteurl'.""" - if siteurl.startswith('*.'): - siteurl = siteurl[2:] - output(u'Querying [[Special:Linksearch]]...') - cache = [] - for url in [siteurl, '*.' 
+ siteurl]: - path = self.linksearch_address(url) - get_throttle() - html = self.getUrl(path) - loc = html.find('<div class="mw-spcontent">') - if loc > -1: - html = html[loc:] - loc = html.find('<div class="printfooter">') - if loc > -1: - html = html[:loc] - R = re.compile('title ?="(.*?)"') - for title in R.findall(html): - if not siteurl in title: - # the links themselves have similar form - if title in cache: - continue - else: - cache.append(title) - yield Page(self, title) - + # -*- coding: utf-8 -*- +""" +Objects representing MediaWiki sites (wikis) and families (groups of wikis +on the same topic in different languages). +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import pywikibot +from pywikibot import deprecate_arg +from pywikibot import config +from pywikibot.throttle import Throttle +from pywikibot.data import api +from pywikibot.exceptions import * + +try: + from hashlib import md5 +except ImportError: + from md5 import md5 +import logging +import os +import re +import sys +import threading +import urllib + +logger = logging.getLogger("wiki") + +class PageInUse(pywikibot.Error): + """Page cannot be reserved for writing due to existing lock.""" + + +def Family(fam=None, fatal=True): + """Import the named family. + + @param fam: family name (if omitted, uses the configured default) + @type fam: str + @param fatal: if True, the bot will stop running if the given family is + unknown. If False, it will only raise a ValueError exception. + @param fatal: bool + @return: a Family instance configured for the named family. + + """ + if fam == None: + fam = config.family + try: + # first try the built-in families + exec "import pywikibot.families.%s_family as myfamily" % fam + except ImportError: + # next see if user has defined a local family module + try: + sys.path.append(config.datafilepath('families')) + exec "import %s_family as myfamily" % fam + except ImportError: + if fatal: + logger.exception(u"""\ +Error importing the %s family. This probably means the family +does not exist. Also check your configuration file.""" + % fam) + sys.exit(1) + else: + raise Error("Family %s does not exist" % fam) + return myfamily.Family() + + +class BaseSite(object): + """Site methods that are independent of the communication interface.""" + # to implement a specific interface, define a Site class that inherits + # from this + + def __init__(self, code, fam=None, user=None, sysop=None): + """ + @param code: the site's language code + @type code: str + @param fam: wiki family name (optional) + @type fam: str or Family + @param user: bot user name (optional) + @type user: str + @param sysop: sysop account user name (optional) + @type sysop: str + + """ + self.__code = code.lower() + if isinstance(fam, basestring) or fam is None: + self.__family = Family(fam, fatal=False) + else: + self.__family = fam + + # if we got an outdated language code, use the new one instead. + if self.__family.obsolete.has_key(self.__code): + if self.__family.obsolete[self.__code] is not None: + self.__code = self.__family.obsolete[self.__code] + else: + # no such language anymore + raise NoSuchSite("Language %s in family %s is obsolete" + % (self.__code, self.__family.name)) + if self.__code not in self.languages(): + if self.__code == 'zh-classic' and 'zh-classical' in self.languages(): + self.__code = 'zh-classical' + # database hack (database is varchar[10] -> zh-classical + # is cut to zh-classic. 
+ else: + raise NoSuchSite("Language %s does not exist in family %s" + % (self.__code, self.__family.name)) + + self._username = [user, sysop] + + # following are for use with lock_page and unlock_page methods + self._pagemutex = threading.Lock() + self._locked_pages = [] + + @property + def throttle(self): + """Return this Site's throttle. Initialize a new one if needed.""" + + if not hasattr(self, "_throttle"): + self._throttle = Throttle(self, multiplydelay=True, + verbosedelay=True) + try: + self.login(False) + except pywikibot.NoUsername: + pass + return self._throttle + + @property + def family(self): + """The Family object for this Site's wiki family.""" + + return self.__family + + @property + def code(self): + """The identifying code for this Site.""" + + return self.__code + + @property + def lang(self): + """The ISO language code for this Site. + + Presumed to be equal to the wiki prefix, but this can be overridden. + + """ + return self.__code + + def __cmp__(self, other): + """Perform equality and inequality tests on Site objects.""" + + if not isinstance(other, BaseSite): + return 1 + if self.family == other.family: + return cmp(self.code, other.code) + return cmp(self.family.name, other.family.name) + + def user(self): + """Return the currently-logged in bot user, or None.""" + + if self.logged_in(True): + return self._username[True] + elif self.logged_in(False): + return self._username[False] + return None + + def username(self, sysop = False): + return self._username[sysop] + + def __getattr__(self, attr): + """Calls to methods not defined in this object are passed to Family.""" + + if hasattr(self.__class__, attr): + return self.__class__.attr + try: + method = getattr(self.family, attr) + f = lambda *args, **kwargs: \ + method(self.code, *args, **kwargs) + if hasattr(method, "__doc__"): + f.__doc__ = method.__doc__ + return f + except AttributeError: + raise AttributeError("%s instance has no attribute '%s'" + % (self.__class__.__name__, attr) ) + + def sitename(self): + """Return string representing this Site's name and language.""" + + return self.family.name+':'+self.code + + __str__ = sitename + + def __repr__(self): + return 'Site("%s", "%s")' % (self.code, self.family.name) + + def __hash__(self): + return hash(repr(self)) + + def linktrail(self): + """Return regex for trailing chars displayed as part of a link. + + Returns a string, not a compiled regular expression object. + + This reads from the family file, and ''not'' from + [[MediaWiki:Linktrail]], because the MW software currently uses a + built-in linktrail from its message files and ignores the wiki + value. 
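+
+        Example of typical use (a sketch; assumes the pywikibot.Site()
+        factory is available and that the family file supplies a
+        linktrail pattern such as the default u'[a-z]*'):
+
+            import re
+            import pywikibot
+            site = pywikibot.Site(code='en', fam='wikipedia')
+            trail = site.linktrail()
+            # match a wikilink together with any trailing characters
+            # that are displayed as part of the link
+            linkR = re.compile(r'\[\[(.+?)(?:\|.*?)?\]\](%s)' % trail)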
+ + """ + return self.family.linktrail(self.code) + + def languages(self): + """Return list of all valid language codes for this site's Family.""" + + return self.family.langs.keys() + + def validLanguageLinks(self): + """Return list of language codes that can be used in interwiki links.""" + + nsnames = sum(self.namespaces().values(), []) + return [l for l in self.languages() + if l[:1].upper() + l[1:] not in self.namespaces()] + + def ns_index(self, namespace): + """Given a namespace name, return its int index, or None if invalid.""" + + for ns in self.namespaces(): + if namespace.lower() in [name.lower() + for name in self.namespaces()[ns]]: + return ns + return None + + getNamespaceIndex = ns_index # for backwards-compatibility + + def namespaces(self): + """Return dict of valid namespaces on this wiki.""" + + return self._namespaces + + def ns_normalize(self, value): + """Return canonical local form of namespace name. + + @param value: A namespace name + @type value: unicode + + """ + index = self.ns_index(value) + return self.namespace(index) + + normalizeNamespace = ns_normalize # for backwards-compatibility + + def redirect(self, default=True): + """Return the localized redirect tag for the site. + + If default is True, falls back to 'REDIRECT' if the site has no + special redirect tag. + + """ + if default: + return self.family.redirect.get(self.code, [u"REDIRECT"])[0] + else: + return self.family.redirect.get(self.code, None) + + def lock_page(self, page, block=True): + """Lock page for writing. Must be called before writing any page. + + We don't want different threads trying to write to the same page + at the same time, even to different sections. + + @param page: the page to be locked + @type page: pywikibot.Page + @param block: if true, wait until the page is available to be locked; + otherwise, raise an exception if page can't be locked + + """ + self._pagemutex.acquire() + try: + while page in self._locked_pages: + if not block: + raise PageInUse + time.sleep(.25) + self._locked_pages.append(page.title(withSection=False)) + finally: + self._pagemutex.release() + + def unlock_page(self, page): + """Unlock page. Call as soon as a write operation has completed. + + @param page: the page to be locked + @type page: pywikibot.Page + + """ + self._pagemutex.acquire() + try: + self._locked_pages.remove(page.title(withSection=False)) + finally: + self._pagemutex.release() + + def disambcategory(self): + """Return Category in which disambig pages are listed.""" + + try: + name = self.namespace(14)+':'+self.family.disambcatname[self.code] + except KeyError: + raise Error(u"No disambiguation category name found for %(site)s" + % {'site': self}) + return pywikibot.Category(pywikibot.Link(name, self)) + + def linkto(self, title, othersite = None): + """Return unicode string in the form of a wikilink to 'title' + + Use optional Site argument 'othersite' to generate an interwiki link. + + """ + logger.debug("Site.linkto() method is deprecated; use pywikibot.Link") + return pywikibot.Link(title, self).astext(othersite) + + def isInterwikiLink(self, s): + """Return True if s is in the form of an interwiki link. + + If a link object constructed using "s" as the link text parses as + belonging to a different site, this method returns True. + + """ + return (pywikibot.Link(s, self).site != self) + + def redirectRegex(self): + """Return a compiled regular expression matching on redirect pages. + + Group 1 in the regex match object will be the target title. 
+ + """ + #TODO: is this needed, since the API identifies redirects? + # (maybe, the API can give false positives) + default = 'REDIRECT' + try: + keywords = set(self.family.redirect[self.code]) + keywords.add(default) + pattern = r'(?:' + '|'.join(keywords) + ')' + except KeyError: + # no localized keyword for redirects + pattern = r'%s' % default + # A redirect starts with hash (#), followed by a keyword, then + # arbitrary stuff, then a wikilink. The wikilink may contain + # a label, although this is not useful. + return re.compile(r'\s*#%(pattern)s\s*:?\s*[[(.+?)(?:|.*?)?]]' + % locals(), + re.IGNORECASE | re.UNICODE | re.DOTALL) + + # namespace shortcuts for backwards-compatibility + + def special_namespace(self): + return self.namespace(-1) + + def image_namespace(self): + return self.namespace(6) + + def mediawiki_namespace(self): + return self.namespace(8) + + def template_namespace(self): + return self.namespace(10) + + def category_namespace(self): + return self.namespace(14) + + def category_namespaces(self): + return self.namespace(14, all=True) + + # site-specific formatting preferences + + def category_on_one_line(self): + """Return True if this site wants all category links on one line.""" + + return self.code in self.family.category_on_one_line + + def interwiki_putfirst(self): + """Return list of language codes for ordering of interwiki links.""" + + return self.family.interwiki_putfirst.get(self.code, None) + + def interwiki_putfirst_doubled(self, list_of_links): + # TODO: is this even needed? No family in the framework has this + # dictionary defined! + if self.lang in self.family.interwiki_putfirst_doubled: + if len(list_of_links) >= \ + self.family.interwiki_putfirst_doubled[self.lang][0]: + links2 = [lang.language() for lang in list_of_links] + result = [] + for lang in self.family.interwiki_putfirst_doubled[self.lang][1]: + try: + result.append(list_of_links[links2.index(lang)]) + except ValueError: + pass + return result + else: + return False + else: + return False + + def getSite(self, code): + """Return Site object for language 'code' in this Family.""" + + return pywikibot.Site(code=code, fam=self.family, user=self.user) + + # deprecated methods for backwards-compatibility + + def fam(self): + """Return Family object for this Site.""" + return self.family + + def urlEncode(self, query): + """DEPRECATED""" + return urllib.urlencode(query) + + def getUrl(self, path, retry=True, sysop=False, data=None, + compress=True, no_hostname=False, cookie_only=False): + """DEPRECATED. + + Retained for compatibility only. All arguments except path and data + are ignored. 
+ + """ + if data: + if not isinstance(data, basestring): + data = urllib.urlencode(data) + return pywikibot.comms.data.request(self, path, method="PUT", + body=data) + else: + return pywikibot.comms.data.request(self, path) + + def postForm(self, address, predata, sysop=False, cookies=None): + """DEPRECATED""" + return self.getUrl(address, data=predata) + + def postData(self, address, data, contentType=None, sysop=False, + compress=True, cookies=None): + """DEPRECATED""" + return self.getUrl(address, data=data) + + # unsupported methods from version 1 + + def checkCharset(self, charset): + raise NotImplementedError + def getToken(self, getalways=True, getagain=False, sysop=False): + raise NotImplementedError + def export_address(self): + raise NotImplementedError + def move_address(self): + raise NotImplementedError + def delete_address(self, s): + raise NotImplementedError + def undelete_view_address(self, s, ts=''): + raise NotImplementedError + def undelete_address(self): + raise NotImplementedError + def protect_address(self, s): + raise NotImplementedError + def unprotect_address(self, s): + raise NotImplementedError + def put_address(self, s): + raise NotImplementedError + def get_address(self, s): + raise NotImplementedError + def nice_get_address(self, s): + raise NotImplementedError + def edit_address(self, s): + raise NotImplementedError + def purge_address(self, s): + raise NotImplementedError + def block_address(self): + raise NotImplementedError + def unblock_address(self): + raise NotImplementedError + def blocksearch_address(self, s): + raise NotImplementedError + def linksearch_address(self, s, limit=500, offset=0): + raise NotImplementedError + def search_address(self, q, n=50, ns=0): + raise NotImplementedError + def allpages_address(self, s, ns = 0): + raise NotImplementedError + def log_address(self, n=50, mode = ''): + raise NotImplementedError + def newpages_address(self, n=50): + raise NotImplementedError + def longpages_address(self, n=500): + raise NotImplementedError + def shortpages_address(self, n=500): + raise NotImplementedError + def unusedfiles_address(self, n=500): + raise NotImplementedError + def categories_address(self, n=500): + raise NotImplementedError + def deadendpages_address(self, n=500): + raise NotImplementedError + def ancientpages_address(self, n=500): + raise NotImplementedError + def lonelypages_address(self, n=500): + raise NotImplementedError + def protectedpages_address(self, n=500): + raise NotImplementedError + def unwatchedpages_address(self, n=500): + raise NotImplementedError + def uncategorizedcategories_address(self, n=500): + raise NotImplementedError + def uncategorizedimages_address(self, n=500): + raise NotImplementedError + def uncategorizedpages_address(self, n=500): + raise NotImplementedError + def unusedcategories_address(self, n=500): + raise NotImplementedError + def withoutinterwiki_address(self, n=500): + raise NotImplementedError + def references_address(self, s): + raise NotImplementedError + def allmessages_address(self): + raise NotImplementedError + def upload_address(self): + raise NotImplementedError + def double_redirects_address(self, default_limit = True): + raise NotImplementedError + def broken_redirects_address(self, default_limit = True): + raise NotImplementedError + def login_address(self): + raise NotImplementedError + def captcha_image_address(self, id): + raise NotImplementedError + def watchlist_address(self): + raise NotImplementedError + def contribs_address(self, target, limit=500, offset=''): + 
raise NotImplementedError + + +class APISite(BaseSite): + """API interface to MediaWiki site. + + Do not use directly; use pywikibot.Site function. + + """ +## Site methods from version 1.0 (as these are implemented in this file, +## or declared deprecated/obsolete, they will be removed from this list) +########## +## cookies: return user's cookies as a string +## +## urlEncode: Encode a query to be sent using an http POST request. +## postForm: Post form data to an address at this site. +## postData: Post encoded form data to an http address at this site. +## +## shared_image_repository: Return tuple of image repositories used by this +## site. +## version: Return MediaWiki version string from Family file. +## versionnumber: Return int identifying the MediaWiki version. +## live_version: Return version number read from Special:Version. +## checkCharset(charset): Warn if charset doesn't match family file. +## +## linktrail: Return regex for trailing chars displayed as part of a link. +## disambcategory: Category in which disambiguation pages are listed. +## +## Methods that yield Page objects derived from a wiki's Special: pages +## (note, some methods yield other information in a tuple along with the +## Pages; see method docs for details) -- +## +## newpages(): Special:Newpages +## newimages(): Special:Log&type=upload +## longpages(): Special:Longpages +## shortpages(): Special:Shortpages +## deadendpages(): Special:Deadendpages +## ancientpages(): Special:Ancientpages +## lonelypages(): Special:Lonelypages +## unwatchedpages(): Special:Unwatchedpages (sysop accounts only) +## uncategorizedcategories(): Special:Uncategorizedcategories (yields +## Category objects) +## uncategorizedpages(): Special:Uncategorizedpages +## uncategorizedimages(): Special:Uncategorizedimages (yields +## ImagePage objects) +## unusedcategories(): Special:Unusuedcategories (yields Category) +## unusedfiles(): Special:Unusedimages (yields ImagePage) +## withoutinterwiki: Special:Withoutinterwiki +## linksearch: Special:Linksearch + + def __init__(self, code, fam=None, user=None, sysop=None): + BaseSite.__init__(self, code, fam, user, sysop) + self._namespaces = { + # these are the MediaWiki built-in names, which always work + # localized names are loaded later upon accessing the wiki + # namespace prefixes are always case-insensitive, but the + # canonical forms are capitalized + -2: [u"Media"], + -1: [u"Special"], + 0: [u""], + 1: [u"Talk"], + 2: [u"User"], + 3: [u"User talk"], + 4: [u"Project"], + 5: [u"Project talk"], + 6: [u"Image"], + 7: [u"Image talk"], + 8: [u"MediaWiki"], + 9: [u"MediaWiki talk"], + 10: [u"Template"], + 11: [u"Template talk"], + 12: [u"Help"], + 13: [u"Help talk"], + 14: [u"Category"], + 15: [u"Category talk"], + } + self.sitelock = threading.Lock() + self._msgcache = {} + return + +# ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__() + self.nocapitalize = self.__code in self.family.nocapitalize + # Calculating valid languages took quite long, so we calculate it once + # in initialization instead of each time it is used. + self._validlanguages = [] + for language in self.languages(): + if not language[:1].upper() + language[1:] in self.namespaces(): + self._validlanguages.append(language) + + def logged_in(self, sysop=False): + """Return True if logged in with specified privileges, otherwise False. + + @param sysop: if True, require sysop privileges. 
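+
+        Example (a sketch; 'site' stands for this APISite and assumes the
+        relevant account names are configured):
+
+            if not site.logged_in():
+                site.login()
+            if site.logged_in(sysop=True):
+                pass  # sysop credentials are active here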
+ + """ + if self.userinfo['name'] != self._username[sysop]: + return False + return (not sysop) or 'sysop' in self.userinfo['groups'] + + def loggedInAs(self, sysop = False): + """Return the current username if logged in, otherwise return None. + + DEPRECATED (use .user() method instead) + + """ + logger.debug("Site.loggedInAs() method is deprecated.") + return self.logged_in(sysop) and self.user() + + def login(self, sysop=False): + """Log the user in if not already logged in.""" + if not hasattr(self, "_siteinfo"): + self._getsiteinfo() + # check whether a login cookie already exists for this user + if hasattr(self, "_userinfo"): + if self.userinfo['name'] == self._username[sysop]: + return + if not self.logged_in(sysop): + loginMan = api.LoginManager(site=self, sysop=sysop, + user=self._username[sysop]) + if loginMan.login(retry = True): + self._username[sysop] = loginMan.username + if hasattr(self, "_userinfo"): + del self._userinfo + self.getuserinfo() + + forceLogin = login # alias for backward-compatibility + + def getuserinfo(self): + """Retrieve userinfo from site and store in _userinfo attribute. + + self._userinfo will be a dict with the following keys and values: + + - id: user id (numeric str) + - name: username (if user is logged in) + - anon: present if user is not logged in + - groups: list of groups (could be empty) + - rights: list of rights (could be empty) + - message: present if user has a new message on talk page + - blockinfo: present if user is blocked (dict) + + """ + if (not hasattr(self, "_userinfo") + or "rights" not in self._userinfo + or self._userinfo['name'] + != self._username["sysop" in self._userinfo["groups"]]): + uirequest = api.Request( + site=self, + action="query", + meta="userinfo", + uiprop="blockinfo|hasmsg|groups|rights" + ) + uidata = uirequest.submit() + assert 'query' in uidata, \ + "API userinfo response lacks 'query' key" + assert 'userinfo' in uidata['query'], \ + "API userinfo response lacks 'userinfo' key" + self._userinfo = uidata['query']['userinfo'] + return self._userinfo + + userinfo = property(fget=getuserinfo, doc=getuserinfo.__doc__) + + def is_blocked(self, sysop=False): + """Return true if and only if user is blocked. + + @param sysop: If true, log in to sysop account (if available) + + """ + if not self.logged_in(sysop): + self.login(sysop) + return 'blockinfo' in self._userinfo + + def isBlocked(self, sysop=False): + """Deprecated synonym for is_blocked""" + logger.debug( + "Site method 'isBlocked' should be changed to 'is_blocked'") + return self.is_blocked(sysop) + + def checkBlocks(self, sysop = False): + """Check if the user is blocked, and raise an exception if so.""" + if self.is_blocked(sysop): + # User blocked + raise UserBlocked('User is blocked in site %s' % self) + + def has_right(self, right, sysop=False): + """Return true if and only if the user has a specific right. + + Possible values of 'right' may vary depending on wiki settings, + but will usually include: + + * Actions: edit, move, delete, protect, upload + * User levels: autoconfirmed, sysop, bot + + """ + if not self.logged_in(sysop): + self.login(sysop) + return right.lower() in self._userinfo['rights'] + + def isAllowed(self, right, sysop=False): + """Deprecated; retained for backwards-compatibility""" + logger.debug("Site.isAllowed() method is deprecated; use has_right()") + return self.has_right(right, sysop) + + def has_group(self, group, sysop=False): + """Return true if and only if the user is a member of specified group. 
+ + Possible values of 'group' may vary depending on wiki settings, + but will usually include bot. + + """ + if not self.logged_in(sysop): + self.login(sysop) + return group.lower() in self._userinfo['groups'] + + def messages(self, sysop=False): + """Returns true if the user has new messages, and false otherwise.""" + if not self.logged_in(sysop): + self.login(sysop) + return 'hasmsg' in self._userinfo + + def mediawiki_message(self, key): + """Return the MediaWiki message text for key "key" """ + if not key in self._msgcache: + msg_query = api.QueryGenerator(site=self, meta="allmessages", + amfilter=key) + for msg in msg_query: + if msg['name'] == key and not 'missing' in msg: + self._msgcache[key] = msg['*'] + break + else: + raise KeyError("Site %(self)s has no message '%(key)s'" + % locals()) + return self._msgcache[key] + + def has_mediawiki_message(self, key): + """Return True iff this site defines a MediaWiki message for 'key'.""" + try: + v = self.mediawiki_message(key) + return True + except KeyError: + return False + + def getcurrenttimestamp(self): + """Return (Mediawiki) timestamp, {{CURRENTTIMESTAMP}}, the server time. + + Format is yyyymmddhhmmss + + """ + r = api.Request(site=self, + action="parse", + text="{{CURRENTTIMESTAMP}}") + result = r.submit() + return re.search('\d+', result['parse']['text']['*']).group() + + def _getsiteinfo(self): + """Retrieve siteinfo and namespaces from site.""" + sirequest = api.Request( + site=self, + action="query", + meta="siteinfo", + siprop="general|namespaces|namespacealiases" + ) + try: + sidata = sirequest.submit() + except api.APIError: + # hack for older sites that don't support 1.12 properties + # probably should delete if we're not going to support pre-1.12 + sirequest = api.Request( + site=self, + action="query", + meta="siteinfo", + siprop="general|namespaces" + ) + sidata = sirequest.submit() + + assert 'query' in sidata, \ + "API siteinfo response lacks 'query' key" + sidata = sidata['query'] + assert 'general' in sidata, \ + "API siteinfo response lacks 'general' key" + assert 'namespaces' in sidata, \ + "API siteinfo response lacks 'namespaces' key" + self._siteinfo = sidata['general'] + nsdata = sidata['namespaces'] + for nskey in nsdata: + ns = int(nskey) + if ns in self._namespaces: + if nsdata[nskey]["*"] in self._namespaces[ns]: + continue + # this is the preferred form so it goes at front of list + self._namespaces[ns].insert(0, nsdata[nskey]["*"]) + else: + self._namespaces[ns] = [nsdata[nskey]["*"]] + if 'namespacealiases' in sidata: + aliasdata = sidata['namespacealiases'] + for item in aliasdata: + if item["*"] in self._namespaces[int(item['id'])]: + continue + # this is a less preferred form so it goes at the end + self._namespaces[int(item['id'])].append(item["*"]) + + @property + def siteinfo(self): + """Site information dict.""" + + if not hasattr(self, "_siteinfo"): + self._getsiteinfo() + return self._siteinfo + + def case(self): + """Return this site's capitalization rule.""" + + return self.siteinfo['case'] + + def language(self): + """Return the code for the language of this Site.""" + + return self.siteinfo['lang'] + + lang = property(fget=language, doc=language.__doc__) + + def namespaces(self): + """Return dict of valid namespaces on this wiki.""" + + if not hasattr(self, "_siteinfo"): + self._getsiteinfo() + return self._namespaces + + def namespace(self, num, all=False): + """Return string containing local name of namespace 'num'. 
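+
+        For example (a sketch; the exact strings depend on the wiki's
+        content language):
+
+            site.namespace(10)   # u'Template' on an English-language wiki
+            site.namespace(14)   # u'Category'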
+ + If optional argument 'all' is true, return a list of all recognized + values for this namespace. + + """ + if all: + return self.namespaces()[num] + return self.namespaces()[num][0] + + def live_version(self): + """Return the 'real' version number found on [[Special:Version]] + + Return value is a tuple (int, int, str) of the major and minor + version numbers and any other text contained in the version. + + """ + versionstring = self.siteinfo['generator'] + m = re.match(r"^MediaWiki ([0-9]+).([0-9]+)(.*)$", versionstring) + if m: + return (int(m.group(1)), int(m.group(2)), m.group(3)) + else: + return None + + def loadpageinfo(self, page): + """Load page info from api and save in page attributes""" + title = page.title(withSection=False) + query = api.PropertyGenerator("info", site=self, + titles=title.encode(self.encoding()), + inprop="protection") + for pageitem in query: + if pageitem['title'] != title: + raise Error( + u"loadpageinfo: Query on %s returned data on '%s'" + % (page, pageitem['title'])) + api.update_page(page, pageitem) + + def loadimageinfo(self, page, history=False): + """Load image info from api and save in page attributes + + @param history: if true, return the image's version history + + """ + title = page.title(withSection=False) + query = api.PropertyGenerator("imageinfo", site=self, + titles=title.encode(self.encoding()), + iiprop=["timestamp", "user", "comment", + "url", "size", "sha1", "mime", + "metadata", "archivename"]) + if history: + query.request["iilimit"] = "max" + for pageitem in query: + if pageitem['title'] != title: + raise Error( + u"loadpageinfo: Query on %s returned data on '%s'" + % (page, pageitem['title'])) + api.update_page(page, pageitem) + if history: + return pageitem['imageinfo'] + + def page_exists(self, page): + """Return True if and only if page is an existing page on site.""" + if not hasattr(page, "_pageid"): + self.loadpageinfo(page) + return page._pageid > 0 + + def page_restrictions(self, page): + """Returns a dictionary reflecting page protections""" + if not self.page_exists(page): + raise NoPage(u'No page %s.' % page) + if not hasattr(page, "_protection"): + self.loadpageinfo(page) + return page._protection + + def page_can_be_edited(self, page): + """ + Returns True if and only if: + - page is unprotected, and bot has an account for this site, or + - page is protected, and bot has a sysop account for this site. + + """ + rest = self.page_restrictions(page) + sysop_protected = rest.has_key('edit') and rest['edit'][0] == 'sysop' + try: + api.LoginManager(site=self, sysop=sysop_protected) + except NoUsername: + return False + return True + + def page_isredirect(self, page): + """Return True if and only if page is a redirect.""" + if not hasattr(page, "_redir"): + self.loadpageinfo(page) + return bool(page._redir) + + def getredirtarget(self, page): + """Return Page object for the redirect target of page.""" + if not hasattr(page, "_redir"): + self.loadpageinfo(page) + if not page._redir: + raise pywikibot.IsNotRedirectPage(page.title()) + title = page.title(withSection=False) + query = api.Request(site=self, action="query", property="info", + inprop="protection|talkid|subjectid", + titles=title.encode(self.encoding()), + redirects="") + result = query.submit() + if "query" not in result or "redirects" not in result["query"]: + raise RuntimeError( + "getredirtarget: No 'redirects' found for page %s." 
+ % title) + redirmap = dict((item['from'], item['to']) + for item in result['query']['redirects']) + if title not in redirmap: + raise RuntimeError( + "getredirtarget: 'redirects' contains no key for page %s." + % title) + if "pages" not in result['query']: + # no "pages" element indicates a circular redirect + raise pywikibot.CircularRedirect(redirmap[title]) + for pagedata in result['query']['pages'].values(): + # there should be only one value in 'pages', and it is the target + if pagedata['title'] not in redirmap.values(): + raise RuntimeError( + "getredirtarget: target page '%s' not found in 'redirects'" + % pagedata['title']) + target = pywikibot.Page(self, pagedata['title'], pagedata['ns']) + api.update_page(target, pagedata) + page._redir = target + + def preloadpages(self, pagelist, groupsize=60): + """Return a generator to a list of preloaded pages. + + Note that [at least in current implementation] pages may be iterated + in a different order than in the underlying pagelist. + + @param pagelist: an iterable that returns Page objects + @param groupsize: how many Pages to query at a time + @type groupsize: int + + """ + from pywikibot.tools import itergroup + for sublist in itergroup(pagelist, groupsize): + pageids = [str(p._pageid) for p in sublist + if hasattr(p, "_pageid") + and p._pageid > 0] + cache = dict((p.title(withSection=False), p) for p in sublist) + rvgen = api.PropertyGenerator("revisions|info", site=self) + rvgen.limit = -1 + if len(pageids) == len(sublist): + # only use pageids if all pages have them + rvgen.request["pageids"] = "|".join(pageids) + else: + rvgen.request["titles"] = "|".join(cache.keys()) + rvgen.request[u"rvprop"] = \ + u"ids|flags|timestamp|user|comment|content" + logger.info(u"Retrieving %s pages from %s." + % (len(cache), self) + ) + for pagedata in rvgen: + logger.debug("Preloading %s" % pagedata) + try: + if pagedata['title'] not in cache: + raise Error( + u"preloadpages: Query returned unexpected title '%s'" + % pagedata['title'] + ) + except KeyError: + logger.debug("No 'title' in %s" % pagedata) + logger.debug("pageids=%s" % pageids) + logger.debug("titles=%s" % cache.keys()) + continue + page = cache[pagedata['title']] + api.update_page(page, pagedata) + yield page + + def token(self, page, tokentype): + """Return token retrieved from wiki to allow changing page content. + + @param page: the Page for which a token should be retrieved + @param tokentype: the type of token (e.g., "edit", "move", "delete"); + see API documentation for full list of types + + """ + query = api.PropertyGenerator("info|revisions", site=self, + titles=page.title(withSection=False), + intoken=tokentype) + for item in query: + if item['title'] != page.title(withSection=False): + raise Error( + u"token: Query on page %s returned data on page [[%s]]" + % (page.title(withSection=False, asLink=True), + item['title'])) + api.update_page(page, item) + logging.debug(str(item)) + return item[tokentype + "token"] + + # following group of methods map more-or-less directly to API queries + + def pagebacklinks(self, page, followRedirects=False, filterRedirects=None, + namespaces=None): + """Iterate all pages that link to the given page. + + @param page: The Page to get links to. + @param followRedirects: Also return links to redirects pointing to + the given page. + @param filterRedirects: If True, only return redirects to the given + page. If False, only return non-redirect links. If None, return + both (no filtering). 
+ @param namespaces: If present, only return links from the namespaces + in this list. + + """ + bltitle = page.title(withSection=False).encode(self.encoding()) + blgen = api.PageGenerator("backlinks", gbltitle=bltitle, site=self) + if isinstance(namespaces, list): + blgen.request["gblnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + blgen.request["gblnamespace"] = str(namespaces) + if filterRedirects is not None: + blgen.request["gblfilterredir"] = filterRedirects and "redirects"\ + or "nonredirects" + if followRedirects: + # bug: see http://bugzilla.wikimedia.org/show_bug.cgi?id=16218 + # links identified by MediaWiki as redirects may not really be, + # so we have to check each "redirect" page and see if it + # really redirects to this page + blgen.request["gblfilterredir"] = "nonredirects" + redirgen = api.PageGenerator("backlinks", gbltitle=bltitle, + site=self, gblfilterredir="redirects") + if "gblnamespace" in blgen.request: + redirgen.request["gblnamespace"] = blgen.request["gblnamespace"] + genlist = [blgen] + for redir in redirgen: + if redir.getRedirectTarget() == page: + genlist.append( + self.pagebacklinks( + redir, True, None, namespaces)) + import itertools + return itertools.chain(*genlist) + return blgen + + def page_embeddedin(self, page, filterRedirects=None, namespaces=None): + """Iterate all pages that embedded the given page as a template. + + @param page: The Page to get inclusions for. + @param filterRedirects: If True, only return redirects that embed + the given page. If False, only return non-redirect links. If + None, return both (no filtering). + @param namespaces: If present, only return links from the namespaces + in this list. + + """ + eititle = page.title(withSection=False).encode(self.encoding()) + eigen = api.PageGenerator("embeddedin", geititle=eititle, site=self) + if isinstance(namespaces, list): + eigen.request["geinamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + eigen.request["geinamespace"] = str(namespaces) + if filterRedirects is not None: + eigen.request["geifilterredir"] = filterRedirects and "redirects"\ + or "nonredirects" + return eigen + + def pagereferences(self, page, followRedirects=False, filterRedirects=None, + withTemplateInclusion=True, onlyTemplateInclusion=False, + namespaces=None): + """Convenience method combining pagebacklinks and page_embeddedin.""" + + if onlyTemplateInclusion: + return self.page_embeddedin(page, namespaces=namespaces) + if not withTemplateInclusion: + return self.pagebacklinks(page, followRedirects, + namespaces=namespaces) + import itertools + return itertools.chain( + self.pagebacklinks(page, followRedirects, + filterRedirects, namespaces=namespaces), + self.page_embeddedin(page, filterRedirects, + namespaces=namespaces) + ) + + def pagelinks(self, page, namespaces=None, follow_redirects=False, + limit=None): + """Iterate internal wikilinks contained (or transcluded) on page. 
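+
+        Example (a sketch; 'site' stands for this APISite and 'page' for
+        an existing pywikibot.Page on it):
+
+            for linked in site.pagelinks(page, namespaces=[0], limit=50):
+                print linked.title()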
+ + @param namespaces: Only iterate pages in these namespaces (default: all) + @type namespaces: list of ints + @param follow_redirects: if True, yields the target of any redirects, + rather than the redirect page + + """ + plgen = api.PageGenerator("links", site=self) + if isinstance(limit, int): + plgen.limit = limit + if hasattr(page, "_pageid"): + plgen.request['pageids'] = str(page._pageid) + else: + pltitle = page.title(withSection=False).encode(self.encoding()) + plgen.request['titles'] = pltitle + if follow_redirects: + plgen.request['redirects'] = '' + if isinstance(namespaces, list): + plgen.request["gplnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + plgen.request["gplnamespace"] = str(namespaces) + return plgen + + @deprecate_arg("withSortKey", None) # Sortkey doesn't work with generator + def pagecategories(self, page, withSortKey=None): + """Iterate categories to which page belongs.""" + + clgen = api.CategoryPageGenerator("categories", site=self) + if hasattr(page, "_pageid"): + clgen.request['pageids'] = str(page._pageid) + else: + cltitle = page.title(withSection=False).encode(self.encoding()) + clgen.request['titles'] = cltitle + return clgen + + def pageimages(self, page): + """Iterate images used (not just linked) on the page.""" + + imtitle = page.title(withSection=False).encode(self.encoding()) + imgen = api.ImagePageGenerator("images", titles=imtitle, site=self) + return imgen + + def pagetemplates(self, page, namespaces=None): + """Iterate templates transcluded (not just linked) on the page.""" + + tltitle = page.title(withSection=False).encode(self.encoding()) + tlgen = api.PageGenerator("templates", titles=tltitle, site=self) + if isinstance(namespaces, list): + tlgen.request["gtlnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + tlgen.request["gtlnamespace"] = str(namespaces) + return tlgen + + def categorymembers(self, category, namespaces=None, limit=None): + """Iterate members of specified category. + + @param category: The Category to iterate. + @param namespaces: If present, only return category members from + these namespaces. For example, use namespaces=[14] to yield + subcategories, use namespaces=[6] to yield image files, etc. Note, + however, that the iterated values are always Page objects, even + if in the Category or Image namespace. + @type namespaces: list of ints + @param limit: maximum number of pages to iterate (default: all) + @type limit: int + + """ + if category.namespace() != 14: + raise Error( + u"categorymembers: non-Category page '%s' specified" + % category.title()) + cmtitle = category.title(withSection=False).encode(self.encoding()) + cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle, + gcmprop="ids|title|sortkey", site=self) + if isinstance(namespaces, list): + cmgen.request["gcmnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + cmgen.request["gcmnamespace"] = str(namespaces) + if isinstance(limit, int): + cmgen.limit = limit + return cmgen + + def loadrevisions(self, page=None, getText=False, revids=None, + limit=None, startid=None, endid=None, starttime=None, + endtime=None, rvdir=None, user=None, excludeuser=None, + section=None, sysop=False): + """Retrieve and store revision information. + + By default, retrieves the last (current) revision of the page, + I{unless} any of the optional parameters revids, startid, endid, + starttime, endtime, rvdir, user, excludeuser, or limit are + specified. 
Unless noted below, all parameters not specified + default to False. + + If rvdir is False or not specified, startid must be greater than + endid if both are specified; likewise, starttime must be greater + than endtime. If rvdir is True, these relationships are reversed. + + @param page: retrieve revisions of this Page (required unless ids + is specified) + @param getText: if True, retrieve the wiki-text of each revision; + otherwise, only retrieve the revision metadata (default) + @param section: if specified, retrieve only this section of the text + (getText must be True); section must be given by number (top of + the article is section 0), not name + @type section: int + @param revids: retrieve only the specified revision ids (required + unless page is specified) + @type revids: list of ints + @param limit: Retrieve no more than this number of revisions + @type limit: int + @param startid: retrieve revisions starting with this revid + @param endid: stop upon retrieving this revid + @param starttime: retrieve revisions starting at this timestamp + @param endtime: stop upon reaching this timestamp + @param rvdir: if false, retrieve newest revisions first (default); + if true, retrieve earliest first + @param user: retrieve only revisions authored by this user + @param excludeuser: retrieve all revisions not authored by this user + @param sysop: if True, switch to sysop account (if available) to + retrieve this page + + """ + latest = (revids is None and + startid is None and + endid is None and + starttime is None and + endtime is None and + rvdir is None and + user is None and + excludeuser is None and + limit is None) # if True, we are retrieving current revision + + # check for invalid argument combinations + if page is None and revids is None: + raise ValueError( + "loadrevisions: either page or revids argument required") + if (startid is not None or endid is not None) and \ + (starttime is not None or endtime is not None): + raise ValueError( + "loadrevisions: startid/endid combined with starttime/endtime") + if starttime is not None and endtime is not None: + if rvdir and starttime >= endtime: + raise ValueError( + "loadrevisions: starttime > endtime with rvdir=True") + if (not rvdir) and endtime >= starttime: + raise ValueError( + "loadrevisions: endtime > starttime with rvdir=False") + if startid is not None and endid is not None: + if rvdir and startid >= endid: + raise ValueError( + "loadrevisions: startid > endid with rvdir=True") + if (not rvdir) and endid >= startid: + raise ValueError( + "loadrevisions: endid > startid with rvdir=False") + + # assemble API request + if revids is None: + rvtitle = page.title(withSection=False).encode(self.encoding()) + rvgen = api.PropertyGenerator(u"info|revisions", titles=rvtitle, + site=self) + else: + if isinstance(revids, (int, basestring)): + ids = unicode(revids) + else: + ids = u"|".join(unicode(r) for r in revids) + rvgen = api.PropertyGenerator(u"info|revisions", revids=ids, + site=self) + if getText: + rvgen.request[u"rvprop"] = \ + u"ids|flags|timestamp|user|comment|content" + if section is not None: + rvgen.request[u"rvsection"] = unicode(section) + if latest or "revids" in rvgen.request: + rvgen.limit = -1 # suppress use of rvlimit parameter + elif isinstance(limit, int): + rvgen.limit = limit + if rvdir: + rvgen.request[u"rvdir"] = u"newer" + elif rvdir is not None: + rvgen.request[u"rvdir"] = u"older" + if startid: + rvgen.request[u"rvstartid"] = startid + if endid: + rvgen.request[u"rvendid"] = endid + if starttime: + 
rvgen.request[u"rvstart"] = starttime + if endtime: + rvgen.request[u"rvend"] = endtime + if user: + rvgen.request[u"rvuser"] = user + elif excludeuser: + rvgen.request[u"rvexcludeuser"] = excludeuser + # TODO if sysop: something + rvgen.continuekey = "revisions" + for pagedata in rvgen: + if page is not None: + if pagedata['title'] != page.title(withSection=False): + raise Error( + u"loadrevisions: Query on %s returned data on '%s'" + % (page, pagedata['title'])) + if pagedata.has_key('missing'): + raise NoPage(u'Page %s does not exist' + % page.title(asLink=True)) + else: + page = Page(self, pagedata['title']) + api.update_page(page, pagedata) + + def pageinterwiki(self, page): + # No such function in the API (this method isn't called anywhere) + raise NotImplementedError + + def pagelanglinks(self, page): + """Iterate all interlanguage links on page, yielding Link objects.""" + lltitle = page.title(withSection=False) + llquery = api.PropertyGenerator("langlinks", + titles=lltitle.encode(self.encoding()), + site=self) + for pageitem in llquery: + if pageitem['title'] != lltitle: + raise Error( + u"getlanglinks: Query on %s returned data on '%s'" + % (page, pageitem['title'])) + if 'langlinks' not in pageitem: + continue + for linkdata in pageitem['langlinks']: + yield pywikibot.Link(linkdata['*'], + source=pywikibot.Site(linkdata['lang'])) + + def page_extlinks(self, page): + """Iterate all external links on page, yielding URL strings.""" + eltitle = page.title(withSection=False) + elquery = api.PropertyGenerator("extlinks", + titles=eltitle.encode(self.encoding()), + site=self) + for pageitem in elquery: + if pageitem['title'] != eltitle: + raise RuntimeError( + "getlanglinks: Query on %s returned data on '%s'" + % (page, pageitem['title'])) + if 'extlinks' not in pageitem: + continue + for linkdata in pageitem['extlinks']: + yield linkdata['*'] + + @deprecate_arg("throttle", None) + @deprecate_arg("includeredirects", "filterredir") + def allpages(self, start="!", prefix="", namespace=0, filterredir=None, + filterlanglinks=None, minsize=None, maxsize=None, + protect_type=None, protect_level=None, limit=None, + reverse=False, includeredirects=None): + """Iterate pages in a single namespace. + + Note: parameters includeRedirects and throttle are deprecated and + included only for backwards compatibility. + + @param start: Start at this title (page need not exist). + @param prefix: Only yield pages starting with this string. 
+ @param namespace: Iterate pages from this (single) namespace + (default: 0) + @param filterredir: if True, only yield redirects; if False (and not + None), only yield non-redirects (default: yield both) + @param filterlanglinks: if True, only yield pages with language links; + if False (and not None), only yield pages without language links + (default: yield both) + @param minsize: if present, only yield pages at least this many + bytes in size + @param maxsize: if present, only yield pages at most this many bytes + in size + @param protect_type: only yield pages that have a protection of the + specified type + @type protect_type: str + @param protect_level: only yield pages that have protection at this + level; can only be used if protect_type is specified + @param limit: maximum number of pages to iterate (default: iterate + all pages in namespace) + @param reverse: if True, iterate in reverse Unicode lexigraphic + order (default: iterate in forward order) + @param includeredirects: DEPRECATED, use filterredirs instead + + """ + if not isinstance(namespace, int): + raise Error("allpages: only one namespace permitted.") + if includeredirects is not None: + logger.debug( +"allpages: 'includeRedirects' argument is deprecated; use 'filterredirs'.") + if includeredirects: + if includeredirects == "only": + filterredirs = True + else: + filterredirs = None + else: + filterredirs = False + + apgen = api.PageGenerator("allpages", gapnamespace=str(namespace), + gapfrom=start, site=self) + if prefix: + apgen.request["gapprefix"] = prefix + if filterredir is not None: + apgen.request["gapfilterredir"] = (filterredir + and "redirects" + or "nonredirects") + if filterlanglinks is not None: + apgen.request["gapfilterlanglinks"] = (filterlanglinks + and "withlanglinks" + or "withoutlanglinks") + if isinstance(minsize, int): + apgen.request["gapminsize"] = str(minsize) + if isinstance(maxsize, int): + apgen.request["gapmaxsize"] = str(maxsize) + if isinstance(protect_type, basestring): + apgen.request["gapprtype"] = protect_type + if isinstance(protect_level, basestring): + apgen.request["gapprlevel"] = protect_level + if isinstance(limit, int): + apgen.limit = limit + if reverse: + apgen.request["gapdir"] = "descending" + return apgen + + def prefixindex(self, prefix, namespace=0, includeredirects=True): + """Yield all pages with a given prefix. Deprecated. + + Use allpages() with the prefix= parameter instead of this method. + + """ + logger.debug("Site.prefixindex() is deprecated; use allpages instead.") + return self.allpages(prefix=prefix, namespace=namespace, + includeredirects=includeredirects) + + + def alllinks(self, start="!", prefix="", namespace=0, unique=False, + limit=None, fromids=False): + """Iterate all links to pages (which need not exist) in one namespace. + + Note that, in practice, links that were found on pages that have + been deleted may not have been removed from the links table, so this + method can return false positives. + + @param start: Start at this title (page need not exist). + @param prefix: Only yield pages starting with this string. 
+ @param namespace: Iterate pages from this (single) namespace + (default: 0) + @param unique: If True, only iterate each link title once (default: + iterate once for each linking page) + @param limit: maximum number of pages to iterate (default: iterate + all pages in namespace) + @param fromids: if True, include the pageid of the page containing + each link (default: False) as the '_fromid' attribute of the Page; + cannot be combined with unique + + """ + if unique and fromids: + raise Error("alllinks: unique and fromids cannot both be True.") + if not isinstance(namespace, int): + raise Error("alllinks: only one namespace permitted.") + algen = api.ListGenerator("alllinks", alnamespace=str(namespace), + alfrom=start, site=self) + if prefix: + algen.request["alprefix"] = prefix + if isinstance(limit, int): + algen.limit = limit + if unique: + algen.request["alunique"] = "" + if fromids: + algen.request["alprop"] = "title|ids" + for link in algen: + p = pywikibot.Page(self, link['title'], link['ns']) + if fromids: + p._fromid = link['fromid'] + yield p + + def allcategories(self, start="!", prefix="", limit=None, + reverse=False): + """Iterate categories used (which need not have a Category page). + + Iterator yields Category objects. Note that, in practice, links that + were found on pages that have been deleted may not have been removed + from the database table, so this method can return false positives. + + @param start: Start at this category title (category need not exist). + @param prefix: Only yield categories starting with this string. + @param limit: maximum number of categories to iterate (default: + iterate all) + @param reverse: if True, iterate in reverse Unicode lexigraphic + order (default: iterate in forward order) + + """ + acgen = api.CategoryPageGenerator("allcategories", + gacfrom=start, site=self) + if prefix: + acgen.request["gacprefix"] = prefix + if isinstance(limit, int): + acgen.limit = limit + if reverse: + acgen.request["gacdir"] = "descending" + return acgen + + def categories(self, number=10, repeat=False): + """Deprecated; retained for backwards-compatibility""" + logger.debug( + "Site.categories() method is deprecated; use .allcategories()") + if repeat: + limit = None + else: + limit = number + return self.allcategories(limit=limit) + + def allusers(self, start="!", prefix="", limit=None, group=None): + """Iterate registered users, ordered by username. + + Iterated values are dicts containing 'name', 'editcount', + 'registration', and (sometimes) 'groups' keys. 'groups' will be + present only if the user is a member of at least 1 group, and will + be a list of unicodes; all the other values are unicodes and should + always be present. + + @param start: start at this username (name need not exist) + @param prefix: only iterate usernames starting with this substring + @param limit: maximum number of users to iterate (default: all) + @param group: only iterate users that are members of this group + @type group: str + + """ + augen = api.ListGenerator("allusers", aufrom=start, + auprop="editcount|groups|registration", + site=self) + if prefix: + augen.request["auprefix"] = prefix + if group: + augen.request["augroup"] = group + if isinstance(limit, int): + augen.limit = limit + return augen + + def allimages(self, start="!", prefix="", minsize=None, maxsize=None, + limit=None, reverse=False, sha1=None, sha1base36=None): + """Iterate all images, ordered by image title. + + Yields ImagePages, but these pages need not exist on the wiki. 
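+
+        Example (a sketch; the prefix is only an illustration):
+
+            for image in site.allimages(prefix=u"Map", limit=10):
+                print image.title()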
+ + @param start: start at this title (name need not exist) + @param prefix: only iterate titles starting with this substring + @param limit: maximum number of titles to iterate (default: all) + @param minsize: only iterate images of at least this many bytes + @param maxsize: only iterate images of no more than this many bytes + @param reverse: if True, iterate in reverse lexigraphic order + @param sha1: only iterate image (it is theoretically possible there + could be more than one) with this sha1 hash + @param sha1base36: same as sha1 but in base 36 + + """ + aigen = api.ImagePageGenerator("allimages", gaifrom=start, + site=self) + if prefix: + aigen.request["gaiprefix"] = prefix + if isinstance(limit, int): + aigen.limit = limit + if isinstance(minsize, int): + aigen.request["gaiminsize"] = str(minsize) + if isinstance(maxsize, int): + aigen.request["gaimaxsize"] = str(maxsize) + if reverse: + aigen.request["gaidir"] = "descending" + if sha1: + aigen.request["gaisha1"] = sha1 + if sha1base36: + aigen.request["gaisha1base36"] = sha1base36 + return aigen + + def blocks(self, starttime=None, endtime=None, reverse=False, + blockids=None, users=None, limit=None): + """Iterate all current blocks, in order of creation. + + Note that logevents only logs user blocks, while this method + iterates all blocks including IP ranges. The iterator yields dicts + containing keys corresponding to the block properties (see + http://www.mediawiki.org/wiki/API:Query_-_Lists for documentation). + + @param starttime: start iterating at this timestamp + @param endtime: stop iterating at this timestamp + @param reverse: if True, iterate oldest blocks first (default: newest) + @param blockids: only iterate blocks with these id numbers + @param users: only iterate blocks affecting these usernames or IPs + @param limit: maximum number of blocks to iterate (default: all) + + """ + if starttime and endtime: + if reverse: + if starttime > endtime: + raise pywikibot.Error( + "blocks: starttime must be before endtime with reverse=True") + else: + if endtime > starttime: + raise pywikibot.Error( + "blocks: endtime must be before starttime with reverse=False") + bkgen = api.ListGenerator("blocks", site=self) + bkgen.request["bkprop"] = \ + "id|user|by|timestamp|expiry|reason|range|flags" + if starttime: + bkgen.request["bkstart"] = starttime + if endtime: + bkgen.request["bkend"] = endtime + if reverse: + bkgen.request["bkdir"] = "newer" + if blockids: + bkgen.request["bkids"] = blockids + if users: + bkgen.request["bkusers"] = users + if isinstance(limit, int): + bkgen.limit = limit + return bkgen + + def exturlusage(self, url, protocol="http", namespaces=None, + limit=None): + """Iterate Pages that contain links to the given URL. 
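+
+        Example (a sketch; the hostname is only an illustration):
+
+            for linking_page in site.exturlusage(u"*.example.org", limit=20):
+                print linking_page.title()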
+ + @param url: The URL to search for (without the protocol prefix); + this many include a '*' as a wildcard, only at the start of the + hostname + @param protocol: The protocol prefix (default: "http") + @param namespaces: Only iterate pages in these namespaces (default: all) + @type namespaces: list of ints + @param limit: Only iterate this many linking pages (default: all) + + """ + eugen = api.PageGenerator("exturlusage", geuquery=url, + geuprotocol=protocol, site=self) + if isinstance(namespaces, list): + eugen.request["geunamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + eugen.request["geunamespace"] = str(namespaces) + if isinstance(limit, int): + eugen.limit = limit + return eugen + + def imageusage(self, image, namespaces=None, filterredir=None, + limit=None): + """Iterate Pages that contain links to the given ImagePage. + + @param image: the image to search for (ImagePage need not exist on the wiki) + @type image: ImagePage + @param namespaces: Only iterate pages in these namespaces (default: all) + @type namespaces: list of ints + @param filterredir: if True, only yield redirects; if False (and not + None), only yield non-redirects (default: yield both) + @param limit: Only iterate this many linking pages (default: all) + + """ + iugen = api.PageGenerator("imageusage", site=self, + giutitle=image.title(withSection=False)) + if isinstance(namespaces, list): + iugen.request["giunamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + iugen.request["giunamespace"] = str(namespaces) + if isinstance(limit, int): + iugen.limit = limit + if filterredir is not None: + iugen.request["giufilterredir"] = (filterredir and "redirects" + or "nonredirects") + return iugen + + def logevents(self, logtype=None, user=None, page=None, + start=None, end=None, reverse=False, limit=None): + """Iterate all log entries. + + @param logtype: only iterate entries of this type (see wiki + documentation for available types, which will include "block", + "protect", "rights", "delete", "upload", "move", "import", + "patrol", "merge") + @param user: only iterate entries that match this user name + @param page: only iterate entries affecting this page + @param start: only iterate entries from and after this timestamp + @param end: only iterate entries up to and through this timestamp + @param reverse: if True, iterate oldest entries first (default: newest) + @param limit: only iterate up to this many entries + + """ + if start and end: + if reverse: + if end < start: + raise Error( + "logevents: end must be later than start with reverse=True") + else: + if start < end: + raise Error( + "logevents: start must be later than end with reverse=False") + legen = api.ListGenerator("logevents", site=self) + if logtype is not None: + legen.request["letype"] = logtype + if user is not None: + legen.request["leuser"] = user + if page is not None: + legen.request["letitle"] = page.title(withSection=False) + if start is not None: + legen.request["lestart"] = start + if end is not None: + legen.request["leend"] = end + if reverse: + legen.request["ledir"] = "newer" + if isinstance(limit, int): + legen.limit = limit + return legen + + def recentchanges(self, start=None, end=None, reverse=False, limit=None, + namespaces=None, pagelist=None, changetype=None, + showMinor=None, showBot=None, showAnon=None, + showRedirects=None, showPatrolled=None): + """Iterate recent changes. 
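+
+        Example (a sketch; iterated values are dicts of change properties):
+
+            for change in site.recentchanges(limit=10, showBot=False):
+                print change['title'], change['timestamp']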
+ + @param start: timestamp to start listing from + @param end: timestamp to end listing at + @param reverse: if True, start with oldest changes (default: newest) + @param limit: iterate no more than this number of entries + @param namespaces: iterate changes to pages in these namespaces only + @type namespaces: list of ints + @param pagelist: iterate changes to pages in this list only + @param pagelist: list of Pages + @param changetype: only iterate changes of this type ("edit" for + edits to existing pages, "new" for new pages, "log" for log + entries) + @param showMinor: if True, only list minor edits; if False (and not + None), only list non-minor edits + @param showBot: if True, only list bot edits; if False (and not + None), only list non-bot edits + @param showAnon: if True, only list anon edits; if False (and not + None), only list non-anon edits + @param showRedirects: if True, only list edits to redirect pages; if + False (and not None), only list edits to non-redirect pages + @param showPatrolled: if True, only list patrolled edits; if False + (and not None), only list non-patrolled edits + + """ + if start and end: + if reverse: + if end < start: + raise Error( + "recentchanges: end must be later than start with reverse=True") + else: + if start < end: + raise Error( + "recentchanges: start must be later than end with reverse=False") + rcgen = api.ListGenerator("recentchanges", site=self, + rcprop="user|comment|timestamp|title|ids" + "|redirect|patrolled|loginfo|flags") + if start is not None: + rcgen.request["rcstart"] = start + if end is not None: + rcgen.request["rcend"] = end + if reverse: + rcgen.request["rcdir"] = "newer" + if isinstance(limit, int): + rcgen.limit = limit + if isinstance(namespaces, list): + rcgen.request["rcnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + rcgen.request["rcnamespace"] = str(namespaces) + if pagelist: + rcgen.request["rctitles"] = u"|".join(p.title(withSection=False) + for p in pagelist) + if changetype: + rcgen.request["rctype"] = changetype + filters = {'minor': showMinor, + 'bot': showBot, + 'anon': showAnon, + 'redirect': showRedirects, + 'patrolled': showPatrolled} + rcshow = [] + for item in filters: + if filters[item] is not None: + rcshow.append(filters[item] and item or ("!"+item)) + if rcshow: + rcgen.request["rcshow"] = "|".join(rcshow) + return rcgen + + @deprecate_arg("number", "limit") + def search(self, searchstring, namespaces=None, where="text", + getredirects=False, limit=None): + """Iterate Pages that contain the searchstring. + + Note that this may include non-existing Pages if the wiki's database + table contains outdated entries. 
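+
+        Example (a sketch; the search term is only an illustration):
+
+            for found in site.search(u"linktrail", namespaces=[0], limit=10):
+                print found.title()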
+ + @param searchstring: the text to search for + @type searchstring: unicode + @param where: Where to search; value must be "text" or "titles" (many + wikis do not support title search) + @param namespaces: search only in these namespaces (defaults to 0) + @type namespaces: list of ints + @param getredirects: if True, include redirects in results + @param limit: maximum number of results to iterate + + """ + if not searchstring: + raise Error("search: searchstring cannot be empty") + if where not in ("text", "titles"): + raise Error("search: unrecognized 'where' value: %s" % where) + srgen = api.PageGenerator("search", gsrsearch=searchstring, + gsrwhat=where, site=self) + if not namespaces: + logger.warning("search: namespaces cannot be empty; using [0].") + namespaces = [0] + if isinstance(namespaces, list): + srgen.request["gsrnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + else: + srgen.request["gsrnamespace"] = str(namespaces) + if getredirects: + srgen.request["gsrredirects"] = "" + if isinstance(limit, int): + srgen.limit = limit + return srgen + + def usercontribs(self, user=None, userprefix=None, start=None, end=None, + reverse=False, limit=None, namespaces=None, + showMinor=None): + """Iterate contributions by a particular user. + + Iterated values are in the same format as recentchanges. + + @param user: Iterate contributions by this user (name or IP) + @param userprefix: Iterate contributions by all users whose names + or IPs start with this substring + @param start: Iterate contributions starting at this timestamp + @param end: Iterate contributions ending at this timestamp + @param reverse: Iterate oldest contributions first (default: newest) + @param limit: Maximum number of contributions to iterate + @param namespaces: Only iterate contributions in these namespaces + @type namespaces: list of ints + @param showMinor: if True, iterate only minor edits; if False and + not None, iterate only non-minor edits (default: iterate both) + + """ + if not (user or userprefix): + raise Error( + "usercontribs: either user or userprefix must be non-empty") + if start and end: + if reverse: + if end < start: + raise Error( + "usercontribs: end must be later than start with reverse=True") + else: + if start < end: + raise Error( + "usercontribs: start must be later than end with reverse=False") + ucgen = api.ListGenerator("usercontribs", site=self, + ucprop="ids|title|timestamp|comment|flags") + if user: + ucgen.request["ucuser"] = user + if userprefix: + ucgen.request["ucuserprefix"] = userprefix + if start is not None: + ucgen.request["ucstart"] = start + if end is not None: + ucgen.request["ucend"] = end + if reverse: + ucgen.request["ucdir"] = "newer" + if isinstance(limit, int): + ucgen.limit = limit + if isinstance(namespaces, list): + ucgen.request["ucnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + ucgen.request["ucnamespace"] = str(namespaces) + if showMinor is not None: + ucgen.request["ucshow"] = showMinor and "minor" or "!minor" + return ucgen + + def watchlist_revs(self, start=None, end=None, reverse=False, + namespaces=None, showMinor=None, showBot=None, + showAnon=None, limit=None): + """Iterate revisions to pages on the bot user's watchlist. + + Iterated values will be in same format as recentchanges. 
+ + @param start: Iterate revisions starting at this timestamp + @param end: Iterate revisions ending at this timestamp + @param reverse: Iterate oldest revisions first (default: newest) + @param namespaces: only iterate revisions to pages in these + namespaces (default: all) + @type namespaces: list of ints + @param showMinor: if True, only list minor edits; if False (and not + None), only list non-minor edits + @param showBot: if True, only list bot edits; if False (and not + None), only list non-bot edits + @param showAnon: if True, only list anon edits; if False (and not + None), only list non-anon edits + @param limit: Maximum number of revisions to iterate + + """ + if start and end: + if reverse: + if end < start: + raise Error( + "watchlist_revs: end must be later than start with reverse=True") + else: + if start < end: + raise Error( + "watchlist_revs: start must be later than end with reverse=False") + wlgen = api.ListGenerator("watchlist", wlallrev="", site=self, + wlprop="user|comment|timestamp|title|ids|flags") + #TODO: allow users to ask for "patrol" as well? + if start is not None: + wlgen.request["wlstart"] = start + if end is not None: + wlgen.request["wlend"] = end + if reverse: + wlgen.request["wldir"] = "newer" + if isinstance(limit, int): + wlgen.limit = limit + if isinstance(namespaces, list): + wlgen.request["wlnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + wlgen.request["wlnamespace"] = str(namespaces) + filters = {'minor': showMinor, + 'bot': showBot, + 'anon': showAnon} + wlshow = [] + for item in filters: + if filters[item] is not None: + wlshow.append(filters[item] and item or ("!"+item)) + if wlshow: + wlgen.request["wlshow"] = "|".join(wlshow) + return wlgen + + def deletedrevs(self, page, start=None, end=None, reverse=None, limit=None, + get_text=False): + """Iterate deleted revisions. + + Each value returned by the iterator will be a dict containing the + 'title' and 'ns' keys for a particular Page and a 'revisions' key + whose value is a list of revisions in the same format as + recentchanges (plus a 'content' element if requested). If get_text + is true, the toplevel dict will contain a 'token' key as well. + + @param page: The page to check for deleted revisions + @param start: Iterate revisions starting at this timestamp + @param end: Iterate revisions ending at this timestamp + @param reverse: Iterate oldest revisions first (default: newest) + @param limit: Iterate no more than this number of revisions. + @param get_text: If True, retrieve the content of each revision and + an undelete token + + """ + if start and end: + if reverse: + if end < start: + raise Error( +"deletedrevs: end must be later than start with reverse=True") + else: + if start < end: + raise Error( +"deletedrevs: start must be later than end with reverse=False") + if not self.logged_in(): + self.login() + if "deletedhistory" not in self.userinfo['rights']: + try: + self.login(True) + except NoUsername: + pass + if "deletedhistory" not in self.userinfo['rights']: + raise Error( +"deletedrevs: User:%s not authorized to access deleted revisions." + % self.user()) + if get_text: + if "undelete" not in self.userinfo['rights']: + try: + self.login(True) + except NoUsername: + pass + if "undelete" not in self.userinfo['rights']: + raise Error( +"deletedrevs: User:%s not authorized to view deleted content." 
+ % self.user()) + + drgen = api.ListGenerator("deletedrevs", site=self, + titles=page.title(withSection=False), + drprop="revid|user|comment|minor") + if get_text: + drgen.request['drprop'] = drgen.request['drprop'] + "|content|token" + if start is not None: + drgen.request["drstart"] = start + if end is not None: + drgen.request["drend"] = end + if reverse: + drgen.request["drdir"] = "newer" + if isinstance(limit, int): + drgen.limit = limit + return drgen + + def users(self, usernames): + """Iterate info about a list of users by name or IP. + + @param usernames: a list of user names + @type usernames: list, or other iterable, of unicodes + + """ + if not isinstance(usernames, basestring): + usernames = u"|".join(usernames) + usgen = api.ListGenerator("users", ususers=usernames, site=self, + usprop="blockinfo|groups|editcount|registration") + return usgen + + def randompages(self, limit=1, namespaces=None, redirects=False): + """Iterate a number of random pages. + + Pages are listed in a fixed sequence, only the starting point is + random. + + @param limit: the maximum number of pages to iterate (default: 1) + @param namespaces: only iterate pages in these namespaces. + @param redirects: if True, include only redirect pages in results + (default: include only non-redirects) + + """ + rngen = api.PageGenerator("random", site=self) + rngen.limit = limit + if isinstance(namespaces, list): + rngen.request["grnnamespace"] = u"|".join(unicode(ns) + for ns in namespaces) + elif namespaces is not None: + rngen.request["grnnamespace"] = str(namespaces) + if redirects: + rngen.request["grnredirect"] = "" + return rngen + + # catalog of editpage error codes, for use in generating messages + _ep_errors = { + "noapiwrite": "API editing not enabled on %(site)s wiki", + "writeapidenied": +"User %(user)s is not authorized to edit on %(site)s wiki", + "protectedtitle": +"Title %(title)s is protected against creation on %(site)s", + "cantcreate": +"User %(user)s not authorized to create new pages on %(site)s wiki", + "cantcreate-anon": +"""Bot is not logged in, and anon users are not authorized to create new pages +on %(site)s wiki""", + "articleexists": "Page %(title)s already exists on %(site)s wiki", + "noimageredirect-anon": +"""Bot is not logged in, and anon users are not authorized to create image +redirects on %(site)s wiki""", + "noimageredirect": +"User %(user)s not authorized to create image redirects on %(site)s wiki", + "spamdetected": +"Edit to page %(title)s rejected by spam filter due to content:\n", + "filtered": "%(info)s", + "contenttoobig": "%(info)s", + "noedit-anon": +"""Bot is not logged in, and anon users are not authorized to edit on +%(site)s wiki""", + "noedit": "User %(user)s not authorized to edit pages on %(site)s wiki", + "pagedeleted": +"Page %(title)s has been deleted since last retrieved from %(site)s wiki", + "editconflict": "Page %(title)s not saved due to edit conflict.", + } + + def editpage(self, page, summary, minor=True, notminor=False, + recreate=True, createonly=False, watch=False, unwatch=False): + """Submit an edited Page object to be saved to the wiki. + + @param page: The Page to be saved; its .text property will be used + as the new text to be saved to the wiki + @param token: the edit token retrieved using Site.token() + @param summary: the edit summary (required!) 
+ @param minor: if True (default), mark edit as minor + @param notminor: if True, override account preferences to mark edit + as non-minor + @param recreate: if True (default), create new page even if this + title has previously been deleted + @param createonly: if True, raise an error if this title already + exists on the wiki + @param watch: if True, add this Page to bot's watchlist + @param unwatch: if True, remove this Page from bot's watchlist if + possible + @return: True if edit succeeded, False if it failed + + """ + text = page.text + if not text: + raise Error("editpage: no text to be saved") + try: + lastrev = page.latestRevision() + except NoPage: + lastrev = None + if not recreate: + raise Error("Page %s does not exist on %s wiki." + % (page.title(withSection=False), self)) + token = self.token(page, "edit") + self.lock_page(page) + if lastrev is not None and page.latestRevision() != lastrev: + raise Error("editpage: Edit conflict detected; saving aborted.") + req = api.Request(site=self, action="edit", + title=page.title(withSection=False), + text=text, token=token, summary=summary) +## if lastrev is not None: +## req["basetimestamp"] = page._revisions[lastrev].timestamp + if minor: + req['minor'] = "" + elif notminor: + req['notminor'] = "" + if 'bot' in self.userinfo['groups']: + req['bot'] = "" + if recreate: + req['recreate'] = "" + if createonly: + req['createonly'] = "" + if watch: + req['watch'] = "" + elif unwatch: + req['unwatch'] = "" +## FIXME: API gives 'badmd5' error +## md5hash = md5() +## md5hash.update(urllib.quote_plus(text.encode(self.encoding()))) +## req['md5'] = md5hash.digest() + while True: + try: + result = req.submit() + logger.debug("editpage response: %s" % result) + except api.APIError, err: + self.unlock_page(page) + if err.code.endswith("anon") and self.logged_in(): + logger.debug( +"editpage: received '%s' even though bot is logged in" % err.code) + errdata = { + 'site': self, + 'title': page.title(withSection=False), + 'user': self.user(), + 'info': err.info + } + if err.code == "spamdetected": + raise SpamfilterError(self._ep_errors[err.code] % errdata + + err.info[ err.info.index("fragment: ") + 9: ]) + + if err.code == "editconflict": + raise EditConflict(self._ep_errors[err.code] % errdata) + if err.code in self._ep_errors: + raise Error(self._ep_errors[err.code] % errdata) + logger.debug("editpage: Unexpected error code '%s' received." + % err.code) + raise + assert ("edit" in result and "result" in result["edit"]), result + if result["edit"]["result"] == "Success": + self.unlock_page(page) + if "nochange" in result["edit"]: + # null edit, page not changed + # TODO: do we want to notify the user of this? 
+ return True + page._revid = result["edit"]["newrevid"] + # see http://www.mediawiki.org/wiki/API:Wikimania_2006_API_discussion#Notes + # not safe to assume that saved text is the same as sent + self.loadrevisions(page, getText=True) + return True + elif result["edit"]["result"] == "Failure": + if "captcha" in result["edit"]: + captcha = result["edit"]["captcha"] + req['captchaid'] = captcha['id'] + if captcha["type"] == "math": + req['captchaword'] = input(captcha["question"]) + continue + elif "url" in captcha: + webbrowser.open(url) + req['captchaword'] = cap_answerwikipedia.input( +"Please view CAPTCHA in your browser, then type answer here:") + continue + else: + self.unlock_page(page) + logger.error( +"editpage: unknown CAPTCHA response %s, page not saved" + % captcha) + return False + else: + self.unlock_page(page) + logger.error("editpage: unknown failure reason %s" + % str(result)) + return False + else: + self.unlock_page(page) + logger.error( +"editpage: Unknown result code '%s' received; page not saved" + % result["edit"]["result"]) + logger.error(str(result)) + return False + + # catalog of move errors for use in error messages + _mv_errors = { + "noapiwrite": "API editing not enabled on %(site)s wiki", + "writeapidenied": +"User %(user)s is not authorized to edit on %(site)s wiki", + "nosuppress": +"User %(user)s is not authorized to move pages without creating redirects", + "cantmove-anon": +"""Bot is not logged in, and anon users are not authorized to move pages on +%(site)s wiki""", + "cantmove": +"User %(user)s is not authorized to move pages on %(site)s wiki", + "immobilenamespace": +"Pages in %(oldnamespace)s namespace cannot be moved on %(site)s wiki", + "articleexists": +"Cannot move because page [[%(newtitle)s]] already exists on %(site)s wiki", + "protectedpage": +"Page [[%(oldtitle)s]] is protected against moving on %(site)s wiki", + "protectedtitle": +"Page [[%(newtitle)s]] is protected against creation on %(site)s wiki", + "nonfilenamespace": +"Cannot move a file to %(newnamespace)s namespace on %(site)s wiki", + "filetypemismatch": +"[[%(newtitle)s]] file extension does not match content of [[%(oldtitle)s]]" + } + + def movepage(self, page, newtitle, summary, movetalk=True, + noredirect=False): + """Move a Page to a new title. + + @param page: the Page to be moved (must exist) + @param newtitle: the new title for the Page + @type newtitle: unicode + @param summary: edit summary (required!) + @param movetalk: if True (default), also move the talk page if possible + @param noredirect: if True, suppress creation of a redirect from the + old title to the new one + @return: Page object with the new title + + """ + oldtitle = page.title(withSection=False) + newlink = pywikibot.Link(newtitle, self) + if newlink.namespace: + newtitle = self.namespace(newlink.namespace) + ":" + newlink.title + else: + newtitle = newlink.title + if oldtitle == newtitle: + raise Error("Cannot move page %s to its own title." + % oldtitle) + if not page.exists(): + raise Error("Cannot move page %s because it does not exist on %s." 
+ % (oldtitle, self)) + token = self.token(page, "move") + self.lock_page(page) + req = api.Request(site=self, action="move", to=newtitle, + token=token, reason=summary) + req['from'] = oldtitle # "from" is a python keyword + if movetalk: + req['movetalk'] = "" + if noredirect: + req['noredirect'] = "" + try: + result = req.submit() + logger.debug("movepage response: %s" % result) + except api.APIError, err: + if err.code.endswith("anon") and self.logged_in(): + logger.debug( +"movepage: received '%s' even though bot is logged in" % err.code) + errdata = { + 'site': self, + 'oldtitle': oldtitle, + 'oldnamespace': self.namespace(page.namespace()), + 'newtitle': newtitle, + 'newnamespace': self.namespace(newlink.namespace), + 'user': self.user(), + } + if err.code in self._mv_errors: + raise Error(self._mv_errors[err.code] % errdata) + logger.debug("movepage: Unexpected error code '%s' received." + % err.code) + raise + finally: + self.unlock_page(page) + if "move" not in result: + logger.error("movepage: %s" % result) + raise Error("movepage: unexpected response") + # TODO: Check for talkmove-error messages + if "talkmove-error-code" in result["move"]: + logger.warning(u"movepage: Talk page %s not moved" + % (page.toggleTalkPage().title(asLink=True))) + return pywikibot.Page(page, newtitle) + + # catalog of rollback errors for use in error messages + _rb_errors = { + "noapiwrite": + "API editing not enabled on %(site)s wiki", + "writeapidenied": + "User %(user)s not allowed to edit through the API", + "alreadyrolled": + "Page [[%(title)s]] already rolled back; action aborted.", + } # other errors shouldn't arise because we check for those errors + + def rollbackpage(self, page, summary=u''): + """Roll back page to version before last user's edits. + + As a precaution against errors, this method will fail unless + the page history contains at least two revisions, and at least + one that is not by the same user who made the last edit. + + @param page: the Page to be rolled back (must exist) + @param summary: edit summary (defaults to a standardized message) + + """ + if len(page._revisions) < 2: + raise pywikibot.Error( + u"Rollback of %s aborted; load revision history first." + % page.title(asLink=True)) + last_rev = page._revisions[page.latestRevision()] + last_user = last_rev.user + for rev in sorted(page._revisions.keys(), reverse=True): + # start with most recent revision first + if rev.user != last_user: + prev_user = rev.user + break + else: + raise pywikibot.Error( + u"Rollback of %s aborted; only one user in revision history." + % page.title(asLink=True)) + summary = summary or ( +u"Reverted edits by [[Special:Contributions/%(last_user)s|%(last_user)s]] " +u"([[User talk:%(last_user)s|Talk]]) to last version by %(prev_user)s" + % locals()) + token = self.token(page, "rollback") + self.lock_page(page) + req = api.Request(site=self, action="rollback", + title=page.title(withSection=False), + user=last_user, + token=token) + try: + result = req.submit() + except api.APIError, err: + errdata = { + 'site': self, + 'title': page.title(withSection=False), + 'user': self.user(), + } + if err.code in self._rb_errors: + raise Error(self._rb_errors[err.code] % errdata) + logger.debug("rollback: Unexpected error code '%s' received." 
+ % err.code) + raise + finally: + self.unlock_page(page) + + # catalog of delete errors for use in error messages + _dl_errors = { + "noapiwrite": + "API editing not enabled on %(site)s wiki", + "writeapidenied": + "User %(user)s not allowed to edit through the API", + "permissiondenied": + "User %(user)s not authorized to delete pages on %(site)s wiki.", + "cantdelete": + "Could not delete [[%(title)s]]. Maybe it was deleted already.", + } # other errors shouldn't occur because of pre-submission checks + + def deletepage(self, page, summary): + """Delete page from the wiki. Requires appropriate privilege level. + + @param page: Page to be deleted. + @param summary: Edit summary (required!). + + """ + try: + self.login(sysop=True) + except pywikibot.Error, e: + raise Error("delete: Unable to login as sysop (%s)" + % e.__class__.__name__) + if not self.logged_in(sysop=True): + raise Error("delete: Unable to login as sysop") + token = self.token("delete") + req = api.Request(site=self, action="delete", token=token, + title=page.title(withSection=False), + reason=summary) + try: + result = req.submit() + except api.APIError, err: + errdata = { + 'site': self, + 'title': page.title(withSection=False), + 'user': self.user(), + } + if err.code in self._dl_errors: + raise Error(self._dl_errors[err.code] % errdata) + logger.debug("delete: Unexpected error code '%s' received." + % err.code) + raise + finally: + self.unlock_page(page) + + # TODO: implement undelete + + # TODO: implement patrol + + def linksearch(self, siteurl, limit=500): + """Backwards-compatible interface to exturlusage()""" + return self.exturlusage(siteurl, limit=limit) + + @deprecate_arg("repeat", None) + def newimages(self, number=100, lestart=None, leend=None, leuser=None, + letitle=None): + """Yield ImagePages from most recent uploads""" + return self.logevents(logtype="upload", limit=number, start=lestart, + end=leend, user=leuser, title=letitle) + + def getImagesFromAnHash(self, hash_found=None): + """Return all images that have the same hash. + + Useful to find duplicates or nowcommons. + + NOTE: it returns also the image itself, if you don't want it, just + filter the list returned. + + NOTE 2: it returns the image title WITHOUT the image namespace. + + """ + if hash_found == None: # If the hash is none return None and not continue + return None + return [image.title(withNamespace=False) + for image in self.allimages(sha1=hash_found)] + + +#### METHODS NOT IMPLEMENTED YET #### +class NotImplementedYet: + + # TODO: is this needed any more? can it be obtained from the http module? + def cookies(self, sysop = False): + """Return a string containing the user's current cookies.""" + self._loadCookies(sysop = sysop) + index = self._userIndex(sysop) + return self._cookies[index] + + def _loadCookies(self, sysop = False): + """Retrieve session cookies for login""" + index = self._userIndex(sysop) + if self._cookies[index] is not None: + return + try: + if sysop: + try: + username = config.sysopnames[self.family.name + ][self.code] + except KeyError: + raise NoUsername("""\ +You tried to perform an action that requires admin privileges, but you haven't +entered your sysop name in your user-config.py. 
Please add +sysopnames['%s']['%s']='name' to your user-config.py""" + % (self.family.name, self.code)) + else: + username = pywikiobt.config2.usernames[self.family.name + ][self.code] + except KeyError: + self._cookies[index] = None + self._isLoggedIn[index] = False + else: + tmp = '%s-%s-%s-login.data' % ( + self.family.name, self.code, username) + fn = config.datafilepath('login-data', tmp) + if not os.path.exists(fn): + self._cookies[index] = None + self._isLoggedIn[index] = False + else: + f = open(fn) + self._cookies[index] = '; '.join([x.strip() for x in f.readlines()]) + f.close() + + # THESE ARE FUNCTIONS NOT YET IMPLEMENTED IN THE API + # TODO: avoid code duplication for the following methods + def newpages(self, number = 10, get_redirect = False, repeat = False): + """Yield new articles (as Page objects) from Special:Newpages. + + Starts with the newest article and fetches the number of articles + specified in the first argument. If repeat is True, it fetches + Newpages again. If there is no new page, it blocks until there is + one, sleeping between subsequent fetches of Newpages. + + The objects yielded are tuples composed of the Page object, + timestamp (unicode), length (int), an empty unicode string, username + or IP address (str), comment (unicode). + + """ + # TODO: in recent MW versions Special:Newpages takes a namespace parameter, + # and defaults to 0 if not specified. + # TODO: Detection of unregistered users is broken + # TODO: Repeat mechanism doesn't make much sense as implemented; + # should use both offset and limit parameters, and have an + # option to fetch older rather than newer pages + seen = set() + while True: + path = self.newpages_address(n=number) + # The throttling is important here, so always enabled. + get_throttle() + html = self.getUrl(path) + + entryR = re.compile( +'<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"' +' title="(?P<title>.+?)">.+?</a>.+?[([](?P<length>[\d,.]+)[^)]]*[)]]' +' .?<a href=".+?" title=".+?:(?P<username>.+?)">' + ) + for m in entryR.finditer(html): + date = m.group('date') + title = m.group('title') + title = title.replace('"', '"') + length = int(re.sub("[,.]", "", m.group('length'))) + loggedIn = u'' + username = m.group('username') + comment = u'' + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, date, length, loggedIn, username, comment + if not repeat: + break + + def longpages(self, number = 10, repeat = False): + """Yield Pages from Special:Longpages. + + Return values are a tuple of Page object, length(int). + + """ + #TODO: should use offset and limit parameters; 'repeat' as now + # implemented is fairly useless + # this comment applies to all the XXXXpages methods following, as well + seen = set() + while True: + path = self.longpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">hist</a>) <a href=".+?" title="(?P<title>.+?)">.+?</a> [(?P<length>\d+)(.+?)]</li>') + for m in entryR.finditer(html): + title = m.group('title') + length = int(m.group('length')) + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, length + if not repeat: + break + + def shortpages(self, number = 10, repeat = False): + """Yield Pages and lengths from Special:Shortpages.""" + throttle = True + seen = set() + while True: + path = self.shortpages_address(n = number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">hist</a>) <a href=".+?" 
title="(?P<title>.+?)">.+?</a> [(?P<length>\d+)(.+?)]</li>') + for m in entryR.finditer(html): + title = m.group('title') + length = int(m.group('length')) + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, length + if not repeat: + break + + def deadendpages(self, number = 10, repeat = False): + """Yield Page objects retrieved from Special:Deadendpages.""" + seen = set() + while True: + path = self.deadendpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def ancientpages(self, number = 10, repeat = False): + """Yield Pages, datestamps from Special:Ancientpages.""" + seen = set() + while True: + path = self.ancientpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( +'<li><a href=".+?" title="(?P<title>.+?)">.+?</a> (?P<date>.+?)</li>') + for m in entryR.finditer(html): + title = m.group('title') + date = m.group('date') + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, date + if not repeat: + break + + def lonelypages(self, number = 10, repeat = False): + """Yield Pages retrieved from Special:Lonelypages.""" + throttle = True + seen = set() + while True: + path = self.lonelypages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def unwatchedpages(self, number = 10, repeat = False): + """Yield Pages from Special:Unwatchedpages (requires Admin privileges).""" + seen = set() + while True: + path = self.unwatchedpages_address(n=number) + get_throttle() + html = self.getUrl(path, sysop = True) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.+?</li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def uncategorizedcategories(self, number = 10, repeat = False): + """Yield Categories from Special:Uncategorizedcategories.""" + import catlib + seen = set() + while True: + path = self.uncategorizedcategories_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = catlib.Category(self, title) + yield page + if not repeat: + break + + def newimages(self, number = 10, repeat = False): + """Yield ImagePages from Special:Log&type=upload""" + + seen = set() + regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+(.+?</a>).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?', re.UNICODE) + + while True: + path = self.log_address(number, mode = 'upload') + get_throttle() + html = self.getUrl(path) + + for m in regexp.finditer(html): + image = m.group('image') + + if image not in seen: + seen.add(image) + + if m.group('new'): + output(u"Image '%s' has been deleted." 
% image) + continue + + date = m.group('date') + user = m.group('user') + comment = m.group('comment') or '' + + yield ImagePage(self, image), date, user, comment + if not repeat: + break + + def uncategorizedimages(self, number = 10, repeat = False): + """Yield ImagePages from Special:Uncategorizedimages.""" + seen = set() + ns = self.image_namespace() + entryR = re.compile( + '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns) + while True: + path = self.uncategorizedimages_address(n=number) + get_throttle() + html = self.getUrl(path) + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = ImagePage(self, title) + yield page + if not repeat: + break + + def uncategorizedpages(self, number = 10, repeat = False): + """Yield Pages from Special:Uncategorizedpages.""" + seen = set() + while True: + path = self.uncategorizedpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def unusedcategories(self, number = 10, repeat = False): + """Yield Category objects from Special:Unusedcategories.""" + import catlib + seen = set() + while True: + path = self.unusedcategories_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = catlib.Category(self, title) + yield page + if not repeat: + break + + def unusedfiles(self, number = 10, repeat = False, extension = None): + """Yield ImagePage objects from Special:Unusedimages.""" + seen = set() + ns = self.image_namespace() + entryR = re.compile( + '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns) + while True: + path = self.unusedfiles_address(n=number) + get_throttle() + html = self.getUrl(path) + for m in entryR.finditer(html): + fileext = None + title = m.group('title') + if extension: + fileext = title[len(title)-3:] + if title not in seen and fileext == extension: + ## Check whether the media is used in a Proofread page + # code disabled because it slows this method down, and + # because it is unclear what it's supposed to do. + #basename = title[6:] + #page = Page(self, 'Page:' + basename) + + #if not page.exists(): + seen.add(title) + image = ImagePage(self, title) + yield image + if not repeat: + break + + def withoutinterwiki(self, number=10, repeat=False): + """Yield Pages without language links from Special:Withoutinterwiki.""" + seen = set() + while True: + path = self.withoutinterwiki_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def linksearch(self, siteurl): + """Yield Pages from results of Special:Linksearch for 'siteurl'.""" + if siteurl.startswith('*.'): + siteurl = siteurl[2:] + output(u'Querying [[Special:Linksearch]]...') + cache = [] + for url in [siteurl, '*.' 
+ siteurl]: + path = self.linksearch_address(url) + get_throttle() + html = self.getUrl(path) + loc = html.find('<div class="mw-spcontent">') + if loc > -1: + html = html[loc:] + loc = html.find('<div class="printfooter">') + if loc > -1: + html = html[:loc] + R = re.compile('title ?="(.*?)"') + for title in R.findall(html): + if not siteurl in title: + # the links themselves have similar form + if title in cache: + continue + else: + cache.append(title) + yield Page(self, title) +
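The query methods above (exturlusage, imageusage, logevents, recentchanges, search, usercontribs, watchlist_revs, deletedrevs, users and randompages) all follow the same pattern: construct an api.PageGenerator or api.ListGenerator, copy the caller's filters into the generator's request parameters, and return the generator so results are fetched lazily as the caller iterates. A minimal usage sketch, assuming a configured user-config.py and that pywikibot.getSite() returns a Site instance exposing these methods (getSite() and Page are used elsewhere in this revision; only the host name and search phrase below are hypothetical):

    # -*- coding: utf-8 -*-
    import pywikibot

    site = pywikibot.getSite()

    # Pages linking to an external host; PageGenerators yield Page objects.
    for page in site.exturlusage(u"www.example.org", namespaces=[0], limit=10):
        print page.title()

    # Recent non-bot edits in the main namespace; ListGenerators yield the
    # raw API records (user, comment, timestamp, title, ids, ...).
    for change in site.recentchanges(limit=20, namespaces=[0], showBot=False):
        print change

    # Full-text search restricted to namespace 0.
    for hit in site.search(u"example phrase", namespaces=[0], limit=5):
        print hit.title()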
Property changes on: branches/rewrite/pywikibot/site.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native
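The write methods in site.py (editpage, movepage, rollbackpage, deletepage) share a second pattern: obtain a token via Site.token(), lock the page, submit an api.Request, and map API error codes onto framework exceptions through the _ep_errors/_mv_errors/_rb_errors/_dl_errors catalogs. A hedged sketch of the calling convention, assuming the same getSite()/Page setup as above and that Page.text is writable as the editpage docstring implies; the page titles used here are placeholders only:

    import pywikibot

    site = pywikibot.getSite()
    page = pywikibot.Page(site, u"Project:Sandbox")

    # editpage() saves page.text and returns True on success, False on failure.
    page.text = u"{{test}} edit made with the rewrite branch"
    if site.editpage(page, summary=u"testing the rewrite edit API", minor=True):
        # movepage() returns a Page object for the new title.
        newpage = site.movepage(page, u"Project:Sandbox/Archive",
                                summary=u"archiving test edits")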
Modified: branches/rewrite/pywikibot/textlib.py =================================================================== --- branches/rewrite/pywikibot/textlib.py 2008-12-16 19:34:48 UTC (rev 6155) +++ branches/rewrite/pywikibot/textlib.py 2008-12-16 19:40:20 UTC (rev 6156) @@ -1,675 +1,675 @@ -# -*- coding: utf-8 -*- -""" -Functions for manipulating wiki-text. - -Unless otherwise noted, all functions take a unicode string as the argument -and return a unicode string. - -""" -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id: $' - - -import pywikibot -import re - - -def unescape(s): - """Replace escaped HTML-special characters by their originals""" - if '&' not in s: - return s - s = s.replace("<", "<") - s = s.replace(">", ">") - s = s.replace("'", "'") - s = s.replace(""", '"') - s = s.replace("&", "&") # Must be last - return s - - -def replaceExcept(text, old, new, exceptions, caseInsensitive=False, - allowoverlap=False, marker = '', site = None): - """ - Return text with 'old' replaced by 'new', ignoring specified types of text. - - Skips occurences of 'old' within exceptions; e.g., within nowiki tags or - HTML comments. If caseInsensitive is true, then use case insensitive - regex matching. If allowoverlap is true, overlapping occurences are all - replaced (watch out when using this, it might lead to infinite loops!). - - Parameters: - text - a unicode string - old - a compiled regular expression - new - a unicode string (which can contain regular - expression references), or a function which takes - a match object as parameter. See parameter repl of - re.sub(). - exceptions - a list of strings which signal what to leave out, - e.g. ['math', 'table', 'template'] - caseInsensitive - a boolean - marker - a string that will be added to the last replacement; - if nothing is changed, it is added at the end - - """ - if site is None: - site = pywikibot.getSite() - - exceptionRegexes = { - 'comment': re.compile(r'(?s)<!--.*?-->'), - # section headers - 'header': re.compile(r'\r\n=+.+=+ *\r\n'), - 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'), - 'math': re.compile(r'(?is)<math>.*?</math>'), - 'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'), - # wiki tags are ignored inside nowiki tags. - 'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'), - # preformatted text - 'pre': re.compile(r'(?ism)<pre>.*?</pre>'), - 'source': re.compile(r'(?is)<source .*?</source>'), - # inline references - 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'), - 'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'), - # lines that start with a space are shown in a monospace font and - # have whitespace preserved. - 'startspace': re.compile(r'(?m)^ (.*?)$'), - # tables often have whitespace that is used to improve wiki - # source code readability. - # TODO: handle nested tables. - 'table': re.compile(r'(?ims)^{|.*?^|}|<table>.*?</table>'), - # templates with parameters often have whitespace that is used to - # improve wiki source code readability. - # 'template': re.compile(r'(?s){{.*?}}'), - # The regex above fails on nested templates. This regex can handle - # templates cascaded up to level 3, but no deeper. For arbitrary - # depth, we'd need recursion which can't be done in Python's re. - # After all, the language of correct parenthesis words is not regular. 
- 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'), - 'hyperlink': compileLinkR(), - 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), - # this matches internal wikilinks, but also interwiki, categories, and - # images. - 'link': re.compile(r'[[[^]|]*(|[^]]*)?]]'), - 'interwiki': re.compile(r'(?i)[[(%s)\s?:[^]]*]][\s]*' - % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())), - - } - - # if we got a string, compile it as a regular expression - if type(old) is str or type(old) is unicode: - if caseInsensitive: - old = re.compile(old, re.IGNORECASE | re.UNICODE) - else: - old = re.compile(old) - - dontTouchRegexes = [] - for exc in exceptions: - if isinstance(exc, str) or isinstance(exc, unicode): - # assume it's a reference to the exceptionRegexes dictionary - # defined above. - if not exceptionRegexes.has_key(exc): - raise ValueError("Unknown tag type: " + exc) - dontTouchRegexes.append(exceptionRegexes[exc]) - else: - # assume it's a regular expression - dontTouchRegexes.append(exc) - index = 0 - markerpos = len(text) - while True: - match = old.search(text, index) - if not match: - # nothing left to replace - break - - # check which exception will occur next. - nextExceptionMatch = None - for dontTouchR in dontTouchRegexes: - excMatch = dontTouchR.search(text, index) - if excMatch and ( - nextExceptionMatch is None or - excMatch.start() < nextExceptionMatch.start()): - nextExceptionMatch = excMatch - - if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start(): - # an HTML comment or text in nowiki tags stands before the next valid match. Skip. - index = nextExceptionMatch.end() - else: - # We found a valid match. Replace it. - if callable(new): - # the parameter new can be a function which takes the match as a parameter. - replacement = new(match) - else: - # it is not a function, but a string. - - # it is a little hack to make \n work. It would be better to fix it - # previously, but better than nothing. - new = new.replace('\n', '\n') - - # We cannot just insert the new string, as it may contain regex - # group references such as \2 or \g<name>. - # On the other hand, this approach does not work because it can't - # handle lookahead or lookbehind (see bug #1731008): - #replacement = old.sub(new, text[match.start():match.end()]) - #text = text[:match.start()] + replacement + text[match.end():] - - # So we have to process the group references manually. - replacement = new - - groupR = re.compile(r'\(?P<number>\d+)|\g<(?P<name>.+?)>') - while True: - groupMatch = groupR.search(replacement) - if not groupMatch: - break - groupID = groupMatch.group('name') or int(groupMatch.group('number')) - replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():] - text = text[:match.start()] + replacement + text[match.end():] - - # continue the search on the remaining text - if allowoverlap: - index = match.start() + 1 - else: - index = match.start() + len(replacement) - markerpos = match.start() + len(replacement) - text = text[:markerpos] + marker + text[markerpos:] - return text - - -def removeDisabledParts(text, tags = ['*']): - """ - Return text without portions where wiki markup is disabled - - Parts that can/will be removed are -- - * HTML comments - * nowiki tags - * pre tags - * includeonly tags - - The exact set of parts which should be removed can be passed as the - 'parts' parameter, which defaults to all. 
- """ - regexes = { - 'comments' : r'<!--.*?-->', - 'includeonly': r'<includeonly>.*?</includeonly>', - 'nowiki': r'<nowiki>.*?</nowiki>', - 'pre': r'<pre>.*?</pre>', - 'source': r'<source .*?</source>', - } - if '*' in tags: - tags = regexes.keys() - toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]), - re.IGNORECASE | re.DOTALL) - return toRemoveR.sub('', text) - - -def isDisabled(text, index, tags = ['*']): - """ - Return True if text[index] is disabled, e.g. by a comment or by nowiki tags. - - For the tags parameter, see removeDisabledParts() above. - """ - # Find a marker that is not already in the text. - marker = '@@' - while marker in text: - marker += '@' - text = text[:index] + marker + text[index:] - text = removeDisabledParts(text, tags) - return (marker not in text) - - -# Functions dealing with interwiki language links - -# Note - MediaWiki supports two kinds of interwiki links; interlanguage and -# interproject. These functions only deal with links to a -# corresponding page in another language on the same project (e.g., -# Wikipedia, Wiktionary, etc.) in another language. They do not find -# or change links to a different project, or any that are formatted -# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM) - -def getLanguageLinks(text, insite = None, pageLink = "[[]]"): - """ - Return a dict of interlanguage links found in text. - - Dict uses language codes as keys and Page objects as values. - Do not call this routine directly, use Page.interwiki() method - instead. - - """ - if insite == None: - insite = pywikibot.getSite() - result = {} - # Ignore interwiki links within nowiki tags, includeonly tags, pre tags, - # and HTML comments - text = removeDisabledParts(text) - - # This regular expression will find every link that is possibly an - # interwiki link. - # NOTE: language codes are case-insensitive and only consist of basic latin - # letters and hyphens. - interwikiR = re.compile(r'[[([a-zA-Z-]+)\s?:([^[]\n]*)]]') - for lang, pagetitle in interwikiR.findall(text): - lang = lang.lower() - # Check if it really is in fact an interwiki link to a known - # language, or if it's e.g. a category tag or an internal link - if lang in insite.family.obsolete: - lang = insite.family.obsolete[lang] - if lang in insite.validLanguageLinks(): - if '|' in pagetitle: - # ignore text after the pipe - pagetitle = pagetitle[:pagetitle.index('|')] - # we want the actual page objects rather than the titles - site = insite.getSite(code = lang) - try: - result[site] = pywikibot.Page(site, pagetitle, insite = insite) - except InvalidTitle: - output( - u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]." - % (lang, pagetitle)) - continue - return result - - -def removeLanguageLinks(text, site = None, marker = ''): - """Return text with all interlanguage links removed. - - If a link to an unknown language is encountered, a warning is printed. - If a marker is defined, that string is placed at the location of the - last occurence of an interwiki link (at the end if there are no - interwiki links). - - """ - if site == None: - site = pywikibot.getSite() - if not site.validLanguageLinks(): - return text - # This regular expression will find every interwiki link, plus trailing - # whitespace. 
- languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys()) - interwikiR = re.compile(r'[[(%s)\s?:[^]]*]][\s]*' - % languages, re.IGNORECASE) - text = replaceExcept(text, interwikiR, '', - ['nowiki', 'comment', 'math', 'pre', 'source'], marker=marker) - return text.strip() - - -def replaceLanguageLinks(oldtext, new, site = None): - """Replace interlanguage links in the text with a new set of links. - - 'new' should be a dict with the Site objects as keys, and Page objects - as values (i.e., just like the dict returned by getLanguageLinks - function). - - """ - # Find a marker that is not already in the text. - marker = '@@' - while marker in oldtext: - marker += '@' - if site == None: - site = pywikibot.getSite() - s = interwikiFormat(new, insite = site) - s2 = removeLanguageLinks(oldtext, site = site, marker = marker) - if s: - if site.language() in site.family.interwiki_attop: - newtext = s + site.family.interwiki_text_separator + s2.replace(marker,'').strip() - else: - # calculate what was after the language links on the page - firstafter = s2.find(marker) + len(marker) - # Is there any text in the 'after' part that means we should keep it after? - if "</noinclude>" in s2[firstafter:]: - newtext = s2[:firstafter] + s + s2[firstafter:] - elif site.language() in site.family.categories_last: - cats = getCategoryLinks(s2, site = site) - s2 = removeCategoryLinks(s2.replace(marker,'').strip(), site) + site.family.interwiki_text_separator + s - newtext = replaceCategoryLinks(s2, cats, site=site) - else: - newtext = s2.replace(marker,'').strip() + site.family.interwiki_text_separator + s - newtext = newtext.replace(marker,'') - else: - newtext = s2.replace(marker,'') - return newtext - - -def interwikiFormat(links, insite = None): - """Convert interwiki link dict into a wikitext string. - - 'links' should be a dict with the Site objects as keys, and Page - objects as values. - - Return a unicode string that is formatted for inclusion in insite - (defaulting to the current site). - """ - if insite is None: - insite = pywikibot.getSite() - if not links: - return '' - - ar = interwikiSort(links.keys(), insite) - s = [] - for site in ar: - try: - link = links[site].aslink(forceInterwiki=True) - s.append(link) - except AttributeError: - s.append(pywikibot.getSite(site).linkto(links[site], - othersite=insite)) - if insite.lang in insite.family.interwiki_on_one_line: - sep = u' ' - else: - sep = u'\r\n' - s=sep.join(s) + u'\r\n' - return s - - -# Sort sites according to local interwiki sort logic -def interwikiSort(sites, insite = None): - if insite is None: - insite = pywikibot.getSite() - if not sites: - return [] - - sites.sort() - putfirst = insite.interwiki_putfirst() - if putfirst: - #In this case I might have to change the order - firstsites = [] - for code in putfirst: - # The code may not exist in this family? - if code in insite.family.obsolete: - code = insite.family.obsolete[code] - if code in insite.validLanguageLinks(): - site = insite.getSite(code = code) - if site in sites: - del sites[sites.index(site)] - firstsites = firstsites + [site] - sites = firstsites + sites - if insite.interwiki_putfirst_doubled(sites): #some implementations return False - sites = insite.interwiki_putfirst_doubled(sites) + sites - return sites - - -# Functions dealing with category links - -def getCategoryLinks(text, site): - """Return a list of category links found in text. - - List contains Category objects. - Do not call this routine directly, use Page.categories() instead. 
- - """ - result = [] - # Ignore category links within nowiki tags, pre tags, includeonly tags, - # and HTML comments - text = removeDisabledParts(text) - catNamespace = '|'.join(site.category_namespaces()) - R = re.compile(r'[[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)' - r'(?:|(?P<sortKey>.+?))?\s*]]' - % catNamespace, re.I) - for match in R.finditer(text): - cat = pywikibot.Category(site, - '%s:%s' % (match.group('namespace'), - match.group('catName')), - sortKey = match.group('sortKey')) - result.append(cat) - return result - - -def removeCategoryLinks(text, site, marker = ''): - """Return text with all category links removed. - - Put the string marker after the last replacement (at the end of the text - if there is no replacement). - - """ - # This regular expression will find every link that is possibly an - # interwiki link, plus trailing whitespace. The language code is grouped. - # NOTE: This assumes that language codes only consist of non-capital - # ASCII letters and hyphens. - catNamespace = '|'.join(site.category_namespaces()) - categoryR = re.compile(r'[[\s*(%s)\s*:.*?]]\s*' % catNamespace, re.I) - text = replaceExcept(text, categoryR, '', ['nowiki', 'comment', 'math', 'pre', 'source'], marker = marker) - if marker: - #avoid having multiple linefeeds at the end of the text - text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, text.strip()) - return text.strip() - - -def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None): - """Replace the category oldcat with the category newcat and return - the modified text. - - """ - if site is None: - site = pywikibot.getSite() - - catNamespace = '|'.join(site.category_namespaces()) - title = oldcat.titleWithoutNamespace() - if not title: - return - # title might contain regex special characters - title = re.escape(title) - # title might not be capitalized correctly on the wiki - if title[0].isalpha() and not site.nocapitalize: - title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:] - # spaces and underscores in page titles are interchangeable, and collapsible - title = title.replace(r"\ ", "[ _]+").replace(r"_", "[ _]+") - categoryR = re.compile(r'[[\s*(%s)\s*:\s*%s\s*((?:|[^]]+)?]])' - % (catNamespace, title), re.I) - if newcat is None: - text = replaceExcept(oldtext, categoryR, '', - ['nowiki', 'comment', 'math', 'pre', 'source']) - else: - text = replaceExcept(oldtext, categoryR, - '[[%s:%s\2' % (site.namespace(14), - newcat.titleWithoutNamespace()), - ['nowiki', 'comment', 'math', 'pre', 'source']) - return text - - -def replaceCategoryLinks(oldtext, new, site = None, addOnly = False): - """Replace the category links given in the wikitext given - in oldtext by the new links given in new. - - 'new' should be a list of Category objects. - - If addOnly is True, the old category won't be deleted and - the category(s) given will be added - (and so they won't replace anything). - """ - - # Find a marker that is not already in the text. - marker = '@@' - while marker in oldtext: - marker += '@' - - if site is None: - site = pywikibot.getSite() - if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext: - raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. 
See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#...') - - s = categoryFormat(new, insite = site) - if addOnly: - s2 = oldtext - else: - s2 = removeCategoryLinks(oldtext, site = site, marker = marker) - - if s: - if site.language() in site.family.category_attop: - newtext = s + site.family.category_text_separator + s2 - else: - # calculate what was after the categories links on the page - firstafter = s2.find(marker) - # Is there any text in the 'after' part that means we should keep it after? - if "</noinclude>" in s2[firstafter:]: - newtext = s2[:firstafter] + s + s2[firstafter:] - elif site.language() in site.family.categories_last: - newtext = s2.replace(marker,'').strip() + site.family.category_text_separator + s - else: - interwiki = getLanguageLinks(s2) - s2 = removeLanguageLinks(s2.replace(marker,''), site) + site.family.category_text_separator + s - newtext = replaceLanguageLinks(s2, interwiki, site) - newtext = newtext.replace(marker,'') - else: - s2 = s2.replace(marker,'') - return s2 - return newtext.strip() - - -def categoryFormat(categories, insite = None): - """Return a string containing links to all categories in a list. - - 'categories' should be a list of Category objects. - - The string is formatted for inclusion in insite. - - """ - if not categories: - return '' - if insite is None: - insite = pywikibot.getSite() - catLinks = [category.aslink(noInterwiki = True) for category in categories] - if insite.category_on_one_line(): - sep = ' ' - else: - sep = '\r\n' - # Some people don't like the categories sorted - #catLinks.sort() - return sep.join(catLinks) + '\r\n' - - -def compileLinkR(withoutBracketed=False, onlyBracketed=False): - """Return a regex that matches external links.""" - # RFC 2396 says that URLs may only contain certain characters. - # For this regex we also accept non-allowed characters, so that the bot - # will later show these links as broken ('Non-ASCII Characters in URL'). - # Note: While allowing parenthesis inside URLs, MediaWiki will regard - # right parenthesis at the end of the URL as not part of that URL. - # The same applies to dot, comma, colon and some other characters. - notAtEnd = ']\s).:;,<>"' - # So characters inside the URL can be anything except whitespace, - # closing squared brackets, quotation marks, greater than and less - # than, and the last character also can't be parenthesis or another - # character disallowed by MediaWiki. - notInside = ']\s<>"' - # The first half of this regular expression is required because '' is - # not allowed inside links. For example, in this wiki text: - # ''Please see http://www.example.org.'' - # .'' shouldn't be considered as part of the link. - regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*'')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])' - - if withoutBracketed: - regex = r'(?<![)' + regex - elif onlyBracketed: - regex = r'[' + regex - linkR = re.compile(regex) - return linkR - -def extract_templates_and_params(text, get_redirect=False): - """Return list of template calls found in text. - - Return value is a list of tuples. There is one tuple for each use of a - template in the page, with the template title as the first entry and a - dict of parameters as the second entry. 
Parameters are indexed by - strings; as in MediaWiki, an unnamed parameter is given a parameter name - with an integer value corresponding to its position among the unnnamed - parameters, and if this results multiple parameters with the same name - only the last value provided will be returned. - - """ - # remove commented-out stuff etc. - thistxt = removeDisabledParts(text) - - # marker for inside templates or parameters - marker = u'@@' - while marker in thistxt: - marker += u'@' - - # marker for links - marker2 = u'##' - while marker2 in thistxt: - marker2 += u'#' - - # marker for math - marker3 = u'%%' - while marker2 in thistxt: - marker3 += u'%' - - result = [] - inside = {} - count = 0 - Rtemplate = re.compile( - ur'{{(msg:)?(?P<name>[^{|]+?)(|(?P<params>[^{]+?))?}}') - Rmath = re.compile(ur'<math>[^<]+</math>') - Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker)) - Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2)) - Rmarker3 = re.compile(ur'%s(\d+)%s' % (marker3, marker3)) - - # Replace math with markers - maths = {} - count = 0 - for m in Rmath.finditer(thistxt): - count += 1 - text = m.group() - thistxt = thistxt.replace(text, '%s%d%s' % (marker3, count, marker3)) - maths[count] = text - - while Rtemplate.search(thistxt) is not None: - for m in Rtemplate.finditer(thistxt): - # Make sure it is not detected again - count += 1 - text = m.group() - thistxt = thistxt.replace(text, - '%s%d%s' % (marker, count, marker)) - # Make sure stored templates don't contain markers - for m2 in Rmarker.finditer(text): - text = text.replace(m2.group(), inside[int(m2.group(1))]) - for m2 in Rmarker3.finditer(text): - text = text.replace(m2.group(), maths[int(m2.group(1))]) - inside[count] = text - - # Name - name = m.group('name').strip() - m2 = Rmarker.search(name) or Rmath.search(name) - if m2 is not None: - # Doesn't detect templates whose name changes, - # or templates whose name contains math tags - continue - # Parameters - paramString = m.group('params') - params = {} - numbered_param = 1 - if paramString: - # Replace wikilinks with markers - links = {} - count2 = 0 - for m2 in pywikibot.link_regex.finditer(paramString): - count2 += 1 - text = m2.group(0) - paramString = paramString.replace(text, - '%s%d%s' % (marker2, count2, marker2)) - links[count2] = text - # Parse string - markedParams = paramString.split('|') - # Replace markers - for param in markedParams: - if "=" in param: - param_name, param_val = param.split("=", 1) - else: - param_name = unicode(numbered_param) - param_val = param - numbered_param += 1 - for m2 in Rmarker.finditer(param_val): - param_val = param_val.replace(m2.group(), - inside[int(m2.group(1))]) - for m2 in Rmarker2.finditer(param_val): - param_val = param_val.replace(m2.group(), - links[int(m2.group(1))]) - for m2 in Rmarker3.finditer(param_val): - param_val = param_val.replace(m2.group(), - maths[int(m2.group(1))]) - params[param_name] = param_val - - # Add it to the result - result.append((name, params)) - return result - +# -*- coding: utf-8 -*- +""" +Functions for manipulating wiki-text. + +Unless otherwise noted, all functions take a unicode string as the argument +and return a unicode string. + +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. 
+# +__version__ = '$Id$' + + +import pywikibot +import re + + +def unescape(s): + """Replace escaped HTML-special characters by their originals""" + if '&' not in s: + return s + s = s.replace("<", "<") + s = s.replace(">", ">") + s = s.replace("'", "'") + s = s.replace(""", '"') + s = s.replace("&", "&") # Must be last + return s + + +def replaceExcept(text, old, new, exceptions, caseInsensitive=False, + allowoverlap=False, marker = '', site = None): + """ + Return text with 'old' replaced by 'new', ignoring specified types of text. + + Skips occurences of 'old' within exceptions; e.g., within nowiki tags or + HTML comments. If caseInsensitive is true, then use case insensitive + regex matching. If allowoverlap is true, overlapping occurences are all + replaced (watch out when using this, it might lead to infinite loops!). + + Parameters: + text - a unicode string + old - a compiled regular expression + new - a unicode string (which can contain regular + expression references), or a function which takes + a match object as parameter. See parameter repl of + re.sub(). + exceptions - a list of strings which signal what to leave out, + e.g. ['math', 'table', 'template'] + caseInsensitive - a boolean + marker - a string that will be added to the last replacement; + if nothing is changed, it is added at the end + + """ + if site is None: + site = pywikibot.getSite() + + exceptionRegexes = { + 'comment': re.compile(r'(?s)<!--.*?-->'), + # section headers + 'header': re.compile(r'\r\n=+.+=+ *\r\n'), + 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'), + 'math': re.compile(r'(?is)<math>.*?</math>'), + 'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'), + # wiki tags are ignored inside nowiki tags. + 'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'), + # preformatted text + 'pre': re.compile(r'(?ism)<pre>.*?</pre>'), + 'source': re.compile(r'(?is)<source .*?</source>'), + # inline references + 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'), + 'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'), + # lines that start with a space are shown in a monospace font and + # have whitespace preserved. + 'startspace': re.compile(r'(?m)^ (.*?)$'), + # tables often have whitespace that is used to improve wiki + # source code readability. + # TODO: handle nested tables. + 'table': re.compile(r'(?ims)^{|.*?^|}|<table>.*?</table>'), + # templates with parameters often have whitespace that is used to + # improve wiki source code readability. + # 'template': re.compile(r'(?s){{.*?}}'), + # The regex above fails on nested templates. This regex can handle + # templates cascaded up to level 3, but no deeper. For arbitrary + # depth, we'd need recursion which can't be done in Python's re. + # After all, the language of correct parenthesis words is not regular. + 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'), + 'hyperlink': compileLinkR(), + 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), + # this matches internal wikilinks, but also interwiki, categories, and + # images. 
+ 'link': re.compile(r'[[[^]|]*(|[^]]*)?]]'), + 'interwiki': re.compile(r'(?i)[[(%s)\s?:[^]]*]][\s]*' + % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())), + + } + + # if we got a string, compile it as a regular expression + if type(old) is str or type(old) is unicode: + if caseInsensitive: + old = re.compile(old, re.IGNORECASE | re.UNICODE) + else: + old = re.compile(old) + + dontTouchRegexes = [] + for exc in exceptions: + if isinstance(exc, str) or isinstance(exc, unicode): + # assume it's a reference to the exceptionRegexes dictionary + # defined above. + if not exceptionRegexes.has_key(exc): + raise ValueError("Unknown tag type: " + exc) + dontTouchRegexes.append(exceptionRegexes[exc]) + else: + # assume it's a regular expression + dontTouchRegexes.append(exc) + index = 0 + markerpos = len(text) + while True: + match = old.search(text, index) + if not match: + # nothing left to replace + break + + # check which exception will occur next. + nextExceptionMatch = None + for dontTouchR in dontTouchRegexes: + excMatch = dontTouchR.search(text, index) + if excMatch and ( + nextExceptionMatch is None or + excMatch.start() < nextExceptionMatch.start()): + nextExceptionMatch = excMatch + + if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start(): + # an HTML comment or text in nowiki tags stands before the next valid match. Skip. + index = nextExceptionMatch.end() + else: + # We found a valid match. Replace it. + if callable(new): + # the parameter new can be a function which takes the match as a parameter. + replacement = new(match) + else: + # it is not a function, but a string. + + # it is a little hack to make \n work. It would be better to fix it + # previously, but better than nothing. + new = new.replace('\n', '\n') + + # We cannot just insert the new string, as it may contain regex + # group references such as \2 or \g<name>. + # On the other hand, this approach does not work because it can't + # handle lookahead or lookbehind (see bug #1731008): + #replacement = old.sub(new, text[match.start():match.end()]) + #text = text[:match.start()] + replacement + text[match.end():] + + # So we have to process the group references manually. + replacement = new + + groupR = re.compile(r'\(?P<number>\d+)|\g<(?P<name>.+?)>') + while True: + groupMatch = groupR.search(replacement) + if not groupMatch: + break + groupID = groupMatch.group('name') or int(groupMatch.group('number')) + replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():] + text = text[:match.start()] + replacement + text[match.end():] + + # continue the search on the remaining text + if allowoverlap: + index = match.start() + 1 + else: + index = match.start() + len(replacement) + markerpos = match.start() + len(replacement) + text = text[:markerpos] + marker + text[markerpos:] + return text + + +def removeDisabledParts(text, tags = ['*']): + """ + Return text without portions where wiki markup is disabled + + Parts that can/will be removed are -- + * HTML comments + * nowiki tags + * pre tags + * includeonly tags + + The exact set of parts which should be removed can be passed as the + 'parts' parameter, which defaults to all. 
+ """ + regexes = { + 'comments' : r'<!--.*?-->', + 'includeonly': r'<includeonly>.*?</includeonly>', + 'nowiki': r'<nowiki>.*?</nowiki>', + 'pre': r'<pre>.*?</pre>', + 'source': r'<source .*?</source>', + } + if '*' in tags: + tags = regexes.keys() + toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]), + re.IGNORECASE | re.DOTALL) + return toRemoveR.sub('', text) + + +def isDisabled(text, index, tags = ['*']): + """ + Return True if text[index] is disabled, e.g. by a comment or by nowiki tags. + + For the tags parameter, see removeDisabledParts() above. + """ + # Find a marker that is not already in the text. + marker = '@@' + while marker in text: + marker += '@' + text = text[:index] + marker + text[index:] + text = removeDisabledParts(text, tags) + return (marker not in text) + + +# Functions dealing with interwiki language links + +# Note - MediaWiki supports two kinds of interwiki links; interlanguage and +# interproject. These functions only deal with links to a +# corresponding page in another language on the same project (e.g., +# Wikipedia, Wiktionary, etc.) in another language. They do not find +# or change links to a different project, or any that are formatted +# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM) + +def getLanguageLinks(text, insite = None, pageLink = "[[]]"): + """ + Return a dict of interlanguage links found in text. + + Dict uses language codes as keys and Page objects as values. + Do not call this routine directly, use Page.interwiki() method + instead. + + """ + if insite == None: + insite = pywikibot.getSite() + result = {} + # Ignore interwiki links within nowiki tags, includeonly tags, pre tags, + # and HTML comments + text = removeDisabledParts(text) + + # This regular expression will find every link that is possibly an + # interwiki link. + # NOTE: language codes are case-insensitive and only consist of basic latin + # letters and hyphens. + interwikiR = re.compile(r'[[([a-zA-Z-]+)\s?:([^[]\n]*)]]') + for lang, pagetitle in interwikiR.findall(text): + lang = lang.lower() + # Check if it really is in fact an interwiki link to a known + # language, or if it's e.g. a category tag or an internal link + if lang in insite.family.obsolete: + lang = insite.family.obsolete[lang] + if lang in insite.validLanguageLinks(): + if '|' in pagetitle: + # ignore text after the pipe + pagetitle = pagetitle[:pagetitle.index('|')] + # we want the actual page objects rather than the titles + site = insite.getSite(code = lang) + try: + result[site] = pywikibot.Page(site, pagetitle, insite = insite) + except InvalidTitle: + output( + u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]." + % (lang, pagetitle)) + continue + return result + + +def removeLanguageLinks(text, site = None, marker = ''): + """Return text with all interlanguage links removed. + + If a link to an unknown language is encountered, a warning is printed. + If a marker is defined, that string is placed at the location of the + last occurence of an interwiki link (at the end if there are no + interwiki links). + + """ + if site == None: + site = pywikibot.getSite() + if not site.validLanguageLinks(): + return text + # This regular expression will find every interwiki link, plus trailing + # whitespace. 
+ languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys()) + interwikiR = re.compile(r'[[(%s)\s?:[^]]*]][\s]*' + % languages, re.IGNORECASE) + text = replaceExcept(text, interwikiR, '', + ['nowiki', 'comment', 'math', 'pre', 'source'], marker=marker) + return text.strip() + + +def replaceLanguageLinks(oldtext, new, site = None): + """Replace interlanguage links in the text with a new set of links. + + 'new' should be a dict with the Site objects as keys, and Page objects + as values (i.e., just like the dict returned by getLanguageLinks + function). + + """ + # Find a marker that is not already in the text. + marker = '@@' + while marker in oldtext: + marker += '@' + if site == None: + site = pywikibot.getSite() + s = interwikiFormat(new, insite = site) + s2 = removeLanguageLinks(oldtext, site = site, marker = marker) + if s: + if site.language() in site.family.interwiki_attop: + newtext = s + site.family.interwiki_text_separator + s2.replace(marker,'').strip() + else: + # calculate what was after the language links on the page + firstafter = s2.find(marker) + len(marker) + # Is there any text in the 'after' part that means we should keep it after? + if "</noinclude>" in s2[firstafter:]: + newtext = s2[:firstafter] + s + s2[firstafter:] + elif site.language() in site.family.categories_last: + cats = getCategoryLinks(s2, site = site) + s2 = removeCategoryLinks(s2.replace(marker,'').strip(), site) + site.family.interwiki_text_separator + s + newtext = replaceCategoryLinks(s2, cats, site=site) + else: + newtext = s2.replace(marker,'').strip() + site.family.interwiki_text_separator + s + newtext = newtext.replace(marker,'') + else: + newtext = s2.replace(marker,'') + return newtext + + +def interwikiFormat(links, insite = None): + """Convert interwiki link dict into a wikitext string. + + 'links' should be a dict with the Site objects as keys, and Page + objects as values. + + Return a unicode string that is formatted for inclusion in insite + (defaulting to the current site). + """ + if insite is None: + insite = pywikibot.getSite() + if not links: + return '' + + ar = interwikiSort(links.keys(), insite) + s = [] + for site in ar: + try: + link = links[site].aslink(forceInterwiki=True) + s.append(link) + except AttributeError: + s.append(pywikibot.getSite(site).linkto(links[site], + othersite=insite)) + if insite.lang in insite.family.interwiki_on_one_line: + sep = u' ' + else: + sep = u'\r\n' + s=sep.join(s) + u'\r\n' + return s + + +# Sort sites according to local interwiki sort logic +def interwikiSort(sites, insite = None): + if insite is None: + insite = pywikibot.getSite() + if not sites: + return [] + + sites.sort() + putfirst = insite.interwiki_putfirst() + if putfirst: + #In this case I might have to change the order + firstsites = [] + for code in putfirst: + # The code may not exist in this family? + if code in insite.family.obsolete: + code = insite.family.obsolete[code] + if code in insite.validLanguageLinks(): + site = insite.getSite(code = code) + if site in sites: + del sites[sites.index(site)] + firstsites = firstsites + [site] + sites = firstsites + sites + if insite.interwiki_putfirst_doubled(sites): #some implementations return False + sites = insite.interwiki_putfirst_doubled(sites) + sites + return sites + + +# Functions dealing with category links + +def getCategoryLinks(text, site): + """Return a list of category links found in text. + + List contains Category objects. + Do not call this routine directly, use Page.categories() instead. 
+ + """ + result = [] + # Ignore category links within nowiki tags, pre tags, includeonly tags, + # and HTML comments + text = removeDisabledParts(text) + catNamespace = '|'.join(site.category_namespaces()) + R = re.compile(r'[[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)' + r'(?:|(?P<sortKey>.+?))?\s*]]' + % catNamespace, re.I) + for match in R.finditer(text): + cat = pywikibot.Category(site, + '%s:%s' % (match.group('namespace'), + match.group('catName')), + sortKey = match.group('sortKey')) + result.append(cat) + return result + + +def removeCategoryLinks(text, site, marker = ''): + """Return text with all category links removed. + + Put the string marker after the last replacement (at the end of the text + if there is no replacement). + + """ + # This regular expression will find every link that is possibly an + # interwiki link, plus trailing whitespace. The language code is grouped. + # NOTE: This assumes that language codes only consist of non-capital + # ASCII letters and hyphens. + catNamespace = '|'.join(site.category_namespaces()) + categoryR = re.compile(r'[[\s*(%s)\s*:.*?]]\s*' % catNamespace, re.I) + text = replaceExcept(text, categoryR, '', ['nowiki', 'comment', 'math', 'pre', 'source'], marker = marker) + if marker: + #avoid having multiple linefeeds at the end of the text + text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, text.strip()) + return text.strip() + + +def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None): + """Replace the category oldcat with the category newcat and return + the modified text. + + """ + if site is None: + site = pywikibot.getSite() + + catNamespace = '|'.join(site.category_namespaces()) + title = oldcat.titleWithoutNamespace() + if not title: + return + # title might contain regex special characters + title = re.escape(title) + # title might not be capitalized correctly on the wiki + if title[0].isalpha() and not site.nocapitalize: + title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:] + # spaces and underscores in page titles are interchangeable, and collapsible + title = title.replace(r"\ ", "[ _]+").replace(r"_", "[ _]+") + categoryR = re.compile(r'[[\s*(%s)\s*:\s*%s\s*((?:|[^]]+)?]])' + % (catNamespace, title), re.I) + if newcat is None: + text = replaceExcept(oldtext, categoryR, '', + ['nowiki', 'comment', 'math', 'pre', 'source']) + else: + text = replaceExcept(oldtext, categoryR, + '[[%s:%s\2' % (site.namespace(14), + newcat.titleWithoutNamespace()), + ['nowiki', 'comment', 'math', 'pre', 'source']) + return text + + +def replaceCategoryLinks(oldtext, new, site = None, addOnly = False): + """Replace the category links given in the wikitext given + in oldtext by the new links given in new. + + 'new' should be a list of Category objects. + + If addOnly is True, the old category won't be deleted and + the category(s) given will be added + (and so they won't replace anything). + """ + + # Find a marker that is not already in the text. + marker = '@@' + while marker in oldtext: + marker += '@' + + if site is None: + site = pywikibot.getSite() + if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext: + raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. 
See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#...') + + s = categoryFormat(new, insite = site) + if addOnly: + s2 = oldtext + else: + s2 = removeCategoryLinks(oldtext, site = site, marker = marker) + + if s: + if site.language() in site.family.category_attop: + newtext = s + site.family.category_text_separator + s2 + else: + # calculate what was after the categories links on the page + firstafter = s2.find(marker) + # Is there any text in the 'after' part that means we should keep it after? + if "</noinclude>" in s2[firstafter:]: + newtext = s2[:firstafter] + s + s2[firstafter:] + elif site.language() in site.family.categories_last: + newtext = s2.replace(marker,'').strip() + site.family.category_text_separator + s + else: + interwiki = getLanguageLinks(s2) + s2 = removeLanguageLinks(s2.replace(marker,''), site) + site.family.category_text_separator + s + newtext = replaceLanguageLinks(s2, interwiki, site) + newtext = newtext.replace(marker,'') + else: + s2 = s2.replace(marker,'') + return s2 + return newtext.strip() + + +def categoryFormat(categories, insite = None): + """Return a string containing links to all categories in a list. + + 'categories' should be a list of Category objects. + + The string is formatted for inclusion in insite. + + """ + if not categories: + return '' + if insite is None: + insite = pywikibot.getSite() + catLinks = [category.aslink(noInterwiki = True) for category in categories] + if insite.category_on_one_line(): + sep = ' ' + else: + sep = '\r\n' + # Some people don't like the categories sorted + #catLinks.sort() + return sep.join(catLinks) + '\r\n' + + +def compileLinkR(withoutBracketed=False, onlyBracketed=False): + """Return a regex that matches external links.""" + # RFC 2396 says that URLs may only contain certain characters. + # For this regex we also accept non-allowed characters, so that the bot + # will later show these links as broken ('Non-ASCII Characters in URL'). + # Note: While allowing parenthesis inside URLs, MediaWiki will regard + # right parenthesis at the end of the URL as not part of that URL. + # The same applies to dot, comma, colon and some other characters. + notAtEnd = ']\s).:;,<>"' + # So characters inside the URL can be anything except whitespace, + # closing squared brackets, quotation marks, greater than and less + # than, and the last character also can't be parenthesis or another + # character disallowed by MediaWiki. + notInside = ']\s<>"' + # The first half of this regular expression is required because '' is + # not allowed inside links. For example, in this wiki text: + # ''Please see http://www.example.org.'' + # .'' shouldn't be considered as part of the link. + regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*'')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])' + + if withoutBracketed: + regex = r'(?<![)' + regex + elif onlyBracketed: + regex = r'[' + regex + linkR = re.compile(regex) + return linkR + +def extract_templates_and_params(text, get_redirect=False): + """Return list of template calls found in text. + + Return value is a list of tuples. There is one tuple for each use of a + template in the page, with the template title as the first entry and a + dict of parameters as the second entry. 
Parameters are indexed by + strings; as in MediaWiki, an unnamed parameter is given a parameter name + with an integer value corresponding to its position among the unnnamed + parameters, and if this results multiple parameters with the same name + only the last value provided will be returned. + + """ + # remove commented-out stuff etc. + thistxt = removeDisabledParts(text) + + # marker for inside templates or parameters + marker = u'@@' + while marker in thistxt: + marker += u'@' + + # marker for links + marker2 = u'##' + while marker2 in thistxt: + marker2 += u'#' + + # marker for math + marker3 = u'%%' + while marker2 in thistxt: + marker3 += u'%' + + result = [] + inside = {} + count = 0 + Rtemplate = re.compile( + ur'{{(msg:)?(?P<name>[^{|]+?)(|(?P<params>[^{]+?))?}}') + Rmath = re.compile(ur'<math>[^<]+</math>') + Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker)) + Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2)) + Rmarker3 = re.compile(ur'%s(\d+)%s' % (marker3, marker3)) + + # Replace math with markers + maths = {} + count = 0 + for m in Rmath.finditer(thistxt): + count += 1 + text = m.group() + thistxt = thistxt.replace(text, '%s%d%s' % (marker3, count, marker3)) + maths[count] = text + + while Rtemplate.search(thistxt) is not None: + for m in Rtemplate.finditer(thistxt): + # Make sure it is not detected again + count += 1 + text = m.group() + thistxt = thistxt.replace(text, + '%s%d%s' % (marker, count, marker)) + # Make sure stored templates don't contain markers + for m2 in Rmarker.finditer(text): + text = text.replace(m2.group(), inside[int(m2.group(1))]) + for m2 in Rmarker3.finditer(text): + text = text.replace(m2.group(), maths[int(m2.group(1))]) + inside[count] = text + + # Name + name = m.group('name').strip() + m2 = Rmarker.search(name) or Rmath.search(name) + if m2 is not None: + # Doesn't detect templates whose name changes, + # or templates whose name contains math tags + continue + # Parameters + paramString = m.group('params') + params = {} + numbered_param = 1 + if paramString: + # Replace wikilinks with markers + links = {} + count2 = 0 + for m2 in pywikibot.link_regex.finditer(paramString): + count2 += 1 + text = m2.group(0) + paramString = paramString.replace(text, + '%s%d%s' % (marker2, count2, marker2)) + links[count2] = text + # Parse string + markedParams = paramString.split('|') + # Replace markers + for param in markedParams: + if "=" in param: + param_name, param_val = param.split("=", 1) + else: + param_name = unicode(numbered_param) + param_val = param + numbered_param += 1 + for m2 in Rmarker.finditer(param_val): + param_val = param_val.replace(m2.group(), + inside[int(m2.group(1))]) + for m2 in Rmarker2.finditer(param_val): + param_val = param_val.replace(m2.group(), + links[int(m2.group(1))]) + for m2 in Rmarker3.finditer(param_val): + param_val = param_val.replace(m2.group(), + maths[int(m2.group(1))]) + params[param_name] = param_val + + # Add it to the result + result.append((name, params)) + return result +
Property changes on: branches/rewrite/pywikibot/textlib.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native
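As a rough illustration of how the textlib helpers above might be called from a bot script (the page title, the page.get() call, and the choice of exception tags are placeholder assumptions, not part of this change set):

    import pywikibot
    from pywikibot import textlib

    site = pywikibot.getSite()              # default site from user-config
    page = pywikibot.Page(site, 'Sandbox')  # placeholder title
    text = page.get()

    # Replace 'foo' with 'bar' everywhere except inside nowiki tags,
    # HTML comments, and template calls.
    newtext = textlib.replaceExcept(text, r'foo', 'bar',
                                    ['nowiki', 'comment', 'template'],
                                    site=site)

    # List every template call on the page with its parameter dict.
    for name, params in textlib.extract_templates_and_params(text):
        print name, params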
Modified: branches/rewrite/pywikibot/throttle.py =================================================================== --- branches/rewrite/pywikibot/throttle.py 2008-12-16 19:34:48 UTC (rev 6155) +++ branches/rewrite/pywikibot/throttle.py 2008-12-16 19:40:20 UTC (rev 6156) @@ -1,275 +1,275 @@ -# -*- coding: utf-8 -*- -""" -Mechanics to slow down wiki read and/or write rate. -""" -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id: $' - -import pywikibot -from pywikibot import config2 as config - -import logging -import math -import threading -import time - -logger = logging.getLogger("wiki.throttle") - -pid = False # global process identifier - # when the first Throttle is instantiated, it will set this - # variable to a positive integer, which will apply to all - # throttle objects created by this process. - - -class Throttle(object): - """Control rate of access to wiki server - - Calling this object blocks the calling thread until at least 'delay' - seconds have passed since the previous call. - - Each Site initiates one Throttle object (site.throttle) to control the - rate of access. - - """ - def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None, - multiplydelay=True, verbosedelay=False): - self.lock = threading.RLock() - self.mysite = str(site) - self.logfn = config.datafilepath('throttle.log') - self.mindelay = mindelay - if self.mindelay is None: - self.mindelay = config.minthrottle - self.maxdelay = maxdelay - if self.maxdelay is None: - self.maxdelay = config.maxthrottle - self.writedelay = writedelay - self.last_read = 0 - self.last_write = 0 - self.next_multiplicity = 1.0 - self.checkdelay = 300 # Check logfile again after this many seconds - self.dropdelay = 600 # Ignore processes that have not made - # a check in this many seconds - self.releasepid = 1200 # Free the process id after this many seconds - self.lastwait = 0.0 - self.delay = 0 - self.verbosedelay = verbosedelay - if multiplydelay: - self.checkMultiplicity() - self.setDelays() - - def checkMultiplicity(self): - """Count running processes for site and set process_multiplicity.""" - global pid - self.lock.acquire() - mysite = self.mysite - logger.debug("Checking multiplicity: pid = %(pid)s" % globals()) - try: - processes = [] - my_pid = pid or 1 # start at 1 if global pid not yet set - count = 1 - # open throttle.log - try: - f = open(self.logfn, 'r') - except IOError: - if not pid: - pass - else: - raise - else: - now = time.time() - for line in f.readlines(): - # parse line; format is "pid timestamp site" - try: - line = line.split(' ') - this_pid = int(line[0]) - ptime = int(line[1].split('.')[0]) - this_site = line[2].rstrip() - except (IndexError, ValueError): - continue # Sometimes the file gets corrupted - # ignore that line - if now - ptime > self.releasepid: - continue # process has expired, drop from file - if now - ptime <= self.dropdelay \ - and this_site == mysite \ - and this_pid != pid: - count += 1 - if this_site != self.mysite or this_pid != pid: - processes.append({'pid': this_pid, - 'time': ptime, - 'site': this_site}) - if not pid and this_pid >= my_pid: - my_pid = this_pid+1 # next unused process id - - if not pid: - pid = my_pid - self.checktime = time.time() - processes.append({'pid': pid, - 'time': self.checktime, - 'site': mysite}) - f = open(self.logfn, 'w') - processes.sort(key=lambda p:(p['pid'], p['site'])) - for p in processes: - f.write("%(pid)s %(time)s %(site)s\n" % p) - f.close() - self.process_multiplicity 
= count - if self.verbosedelay: - logger.info( -u"Found %(count)s %(mysite)s processes running, including this one." - % locals()) - finally: - self.lock.release() - - def setDelays(self, delay=None, writedelay=None, absolute=False): - """Set the nominal delays in seconds. Defaults to config values.""" - self.lock.acquire() - try: - maxdelay = self.maxdelay - if delay is None: - delay = self.mindelay - if writedelay is None: - writedelay = config.put_throttle - if absolute: - self.maxdelay = delay - self.mindelay = delay - self.delay = delay - self.writedelay = min(max(self.mindelay, writedelay), - self.maxdelay) - # Start the delay count now, not at the next check - self.last_read = self.last_write = time.time() - finally: - self.lock.release() - - def getDelay(self, write=False): - """Return the actual delay, accounting for multiple processes. - - This value is the maximum wait between reads/writes, not taking - account of how much time has elapsed since the last access. - - """ - global pid - if write: - thisdelay = self.writedelay - else: - thisdelay = self.delay - if pid: # If set, we're checking for multiple processes - if time.time() > self.checktime + self.checkdelay: - self.checkMultiplicity() - if thisdelay < (self.mindelay * self.next_multiplicity): - thisdelay = self.mindelay * self.next_multiplicity - elif thisdelay > self.maxdelay: - thisdelay = self.maxdelay - thisdelay *= self.process_multiplicity - return thisdelay - - def waittime(self, write=False): - """Return waiting time in seconds if a query would be made right now""" - # Take the previous requestsize in account calculating the desired - # delay this time - thisdelay = self.getDelay(write=write) - now = time.time() - if write: - ago = now - self.last_write - else: - ago = now - self.last_read - if ago < thisdelay: - delta = thisdelay - ago - return delta - else: - return 0.0 - - def drop(self): - """Remove me from the list of running bot processes.""" - # drop all throttles with this process's pid, regardless of site - self.checktime = 0 - processes = [] - try: - f = open(self.logfn, 'r') - except IOError: - return - else: - now = time.time() - for line in f.readlines(): - try: - line = line.split(' ') - this_pid = int(line[0]) - ptime = int(line[1].split('.')[0]) - this_site = line[2].rstrip() - except (IndexError,ValueError): - continue # Sometimes the file gets corrupted - # ignore that line - if now - ptime <= self.releasepid \ - and this_pid != pid: - processes.append({'pid': this_pid, - 'time': ptime, - 'site': this_site}) - f = open(self.logfn, 'w') - processes.sort(key=lambda p:p['pid']) - for p in processes: - f.write("%(pid)s %(time)s %(site)s\n" % p) - f.close() - - def __call__(self, requestsize=1, write=False): - """ - Block the calling program if the throttle time has not expired. - - Parameter requestsize is the number of Pages to be read/written; - multiply delay time by an appropriate factor. - - Because this seizes the throttle lock, it will prevent any other - thread from writing to the same site until the wait expires. - - """ - self.lock.acquire() - try: - wait = self.waittime(write=write) - # Calculate the multiplicity of the next delay based on how - # big the request is that is being posted now. - # We want to add "one delay" for each factor of two in the - # size of the request. Getting 64 pages at once allows 6 times - # the delay time for the server. 
- self.next_multiplicity = math.log(1+requestsize)/math.log(2.0) - # Announce the delay if it exceeds a preset limit - if wait > config.noisysleep: - logger.info(u"Sleeping for %(wait).1f seconds, %(now)s" - % {'wait': wait, - 'now': time.strftime("%Y-%m-%d %H:%M:%S", - time.localtime()) - } ) - time.sleep(wait) - if write: - self.last_write = time.time() - else: - self.last_read = time.time() - finally: - self.lock.release() - - def lag(self, lagtime): - """ - Seize the throttle lock due to server lag. - - This will prevent any thread from accessing this site. - - """ - started = time.time() - self.lock.acquire() - try: - # start at 1/2 the current server lag time - # wait at least 5 seconds but not more than 120 seconds - delay = min(max(5, lagtime//2), 120) - # account for any time we waited while acquiring the lock - wait = delay - (time.time() - started) - if wait > 0: - if wait > config.noisysleep: - logger.info(u"Sleeping for %(wait).1f seconds, %(now)s" - % {'wait': wait, - 'now': time.strftime("%Y-%m-%d %H:%M:%S", - time.localtime()) - } ) - time.sleep(wait) - finally: - self.lock.release() - +# -*- coding: utf-8 -*- +""" +Mechanics to slow down wiki read and/or write rate. +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import pywikibot +from pywikibot import config2 as config + +import logging +import math +import threading +import time + +logger = logging.getLogger("wiki.throttle") + +pid = False # global process identifier + # when the first Throttle is instantiated, it will set this + # variable to a positive integer, which will apply to all + # throttle objects created by this process. + + +class Throttle(object): + """Control rate of access to wiki server + + Calling this object blocks the calling thread until at least 'delay' + seconds have passed since the previous call. + + Each Site initiates one Throttle object (site.throttle) to control the + rate of access. 
+ + """ + def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None, + multiplydelay=True, verbosedelay=False): + self.lock = threading.RLock() + self.mysite = str(site) + self.logfn = config.datafilepath('throttle.log') + self.mindelay = mindelay + if self.mindelay is None: + self.mindelay = config.minthrottle + self.maxdelay = maxdelay + if self.maxdelay is None: + self.maxdelay = config.maxthrottle + self.writedelay = writedelay + self.last_read = 0 + self.last_write = 0 + self.next_multiplicity = 1.0 + self.checkdelay = 300 # Check logfile again after this many seconds + self.dropdelay = 600 # Ignore processes that have not made + # a check in this many seconds + self.releasepid = 1200 # Free the process id after this many seconds + self.lastwait = 0.0 + self.delay = 0 + self.verbosedelay = verbosedelay + if multiplydelay: + self.checkMultiplicity() + self.setDelays() + + def checkMultiplicity(self): + """Count running processes for site and set process_multiplicity.""" + global pid + self.lock.acquire() + mysite = self.mysite + logger.debug("Checking multiplicity: pid = %(pid)s" % globals()) + try: + processes = [] + my_pid = pid or 1 # start at 1 if global pid not yet set + count = 1 + # open throttle.log + try: + f = open(self.logfn, 'r') + except IOError: + if not pid: + pass + else: + raise + else: + now = time.time() + for line in f.readlines(): + # parse line; format is "pid timestamp site" + try: + line = line.split(' ') + this_pid = int(line[0]) + ptime = int(line[1].split('.')[0]) + this_site = line[2].rstrip() + except (IndexError, ValueError): + continue # Sometimes the file gets corrupted + # ignore that line + if now - ptime > self.releasepid: + continue # process has expired, drop from file + if now - ptime <= self.dropdelay \ + and this_site == mysite \ + and this_pid != pid: + count += 1 + if this_site != self.mysite or this_pid != pid: + processes.append({'pid': this_pid, + 'time': ptime, + 'site': this_site}) + if not pid and this_pid >= my_pid: + my_pid = this_pid+1 # next unused process id + + if not pid: + pid = my_pid + self.checktime = time.time() + processes.append({'pid': pid, + 'time': self.checktime, + 'site': mysite}) + f = open(self.logfn, 'w') + processes.sort(key=lambda p:(p['pid'], p['site'])) + for p in processes: + f.write("%(pid)s %(time)s %(site)s\n" % p) + f.close() + self.process_multiplicity = count + if self.verbosedelay: + logger.info( +u"Found %(count)s %(mysite)s processes running, including this one." + % locals()) + finally: + self.lock.release() + + def setDelays(self, delay=None, writedelay=None, absolute=False): + """Set the nominal delays in seconds. Defaults to config values.""" + self.lock.acquire() + try: + maxdelay = self.maxdelay + if delay is None: + delay = self.mindelay + if writedelay is None: + writedelay = config.put_throttle + if absolute: + self.maxdelay = delay + self.mindelay = delay + self.delay = delay + self.writedelay = min(max(self.mindelay, writedelay), + self.maxdelay) + # Start the delay count now, not at the next check + self.last_read = self.last_write = time.time() + finally: + self.lock.release() + + def getDelay(self, write=False): + """Return the actual delay, accounting for multiple processes. + + This value is the maximum wait between reads/writes, not taking + account of how much time has elapsed since the last access. 
+ + """ + global pid + if write: + thisdelay = self.writedelay + else: + thisdelay = self.delay + if pid: # If set, we're checking for multiple processes + if time.time() > self.checktime + self.checkdelay: + self.checkMultiplicity() + if thisdelay < (self.mindelay * self.next_multiplicity): + thisdelay = self.mindelay * self.next_multiplicity + elif thisdelay > self.maxdelay: + thisdelay = self.maxdelay + thisdelay *= self.process_multiplicity + return thisdelay + + def waittime(self, write=False): + """Return waiting time in seconds if a query would be made right now""" + # Take the previous requestsize in account calculating the desired + # delay this time + thisdelay = self.getDelay(write=write) + now = time.time() + if write: + ago = now - self.last_write + else: + ago = now - self.last_read + if ago < thisdelay: + delta = thisdelay - ago + return delta + else: + return 0.0 + + def drop(self): + """Remove me from the list of running bot processes.""" + # drop all throttles with this process's pid, regardless of site + self.checktime = 0 + processes = [] + try: + f = open(self.logfn, 'r') + except IOError: + return + else: + now = time.time() + for line in f.readlines(): + try: + line = line.split(' ') + this_pid = int(line[0]) + ptime = int(line[1].split('.')[0]) + this_site = line[2].rstrip() + except (IndexError,ValueError): + continue # Sometimes the file gets corrupted + # ignore that line + if now - ptime <= self.releasepid \ + and this_pid != pid: + processes.append({'pid': this_pid, + 'time': ptime, + 'site': this_site}) + f = open(self.logfn, 'w') + processes.sort(key=lambda p:p['pid']) + for p in processes: + f.write("%(pid)s %(time)s %(site)s\n" % p) + f.close() + + def __call__(self, requestsize=1, write=False): + """ + Block the calling program if the throttle time has not expired. + + Parameter requestsize is the number of Pages to be read/written; + multiply delay time by an appropriate factor. + + Because this seizes the throttle lock, it will prevent any other + thread from writing to the same site until the wait expires. + + """ + self.lock.acquire() + try: + wait = self.waittime(write=write) + # Calculate the multiplicity of the next delay based on how + # big the request is that is being posted now. + # We want to add "one delay" for each factor of two in the + # size of the request. Getting 64 pages at once allows 6 times + # the delay time for the server. + self.next_multiplicity = math.log(1+requestsize)/math.log(2.0) + # Announce the delay if it exceeds a preset limit + if wait > config.noisysleep: + logger.info(u"Sleeping for %(wait).1f seconds, %(now)s" + % {'wait': wait, + 'now': time.strftime("%Y-%m-%d %H:%M:%S", + time.localtime()) + } ) + time.sleep(wait) + if write: + self.last_write = time.time() + else: + self.last_read = time.time() + finally: + self.lock.release() + + def lag(self, lagtime): + """ + Seize the throttle lock due to server lag. + + This will prevent any thread from accessing this site. + + """ + started = time.time() + self.lock.acquire() + try: + # start at 1/2 the current server lag time + # wait at least 5 seconds but not more than 120 seconds + delay = min(max(5, lagtime//2), 120) + # account for any time we waited while acquiring the lock + wait = delay - (time.time() - started) + if wait > 0: + if wait > config.noisysleep: + logger.info(u"Sleeping for %(wait).1f seconds, %(now)s" + % {'wait': wait, + 'now': time.strftime("%Y-%m-%d %H:%M:%S", + time.localtime()) + } ) + time.sleep(wait) + finally: + self.lock.release() +
Property changes on: branches/rewrite/pywikibot/throttle.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native
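A small sketch of how the Throttle above might be exercised on its own; in normal operation each Site object constructs its instance internally, so the explicit construction and the delay values chosen here are illustrative assumptions only:

    import pywikibot
    from pywikibot.throttle import Throttle

    site = pywikibot.getSite()
    throttle = Throttle(site, mindelay=1, maxdelay=10, writedelay=5)

    # Block until the read delay has elapsed; the next delay is scaled to
    # the size of this request (10 pages).
    throttle(requestsize=10, write=False)

    # Back off after the server reports 20 seconds of replication lag
    # (sleeps between 5 and 120 seconds).
    throttle.lag(20)

    # Remove this process from throttle.log when the bot is done.
    throttle.drop()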
Modified: branches/rewrite/pywikibot/tools.py =================================================================== --- branches/rewrite/pywikibot/tools.py 2008-12-16 19:34:48 UTC (rev 6155) +++ branches/rewrite/pywikibot/tools.py 2008-12-16 19:40:20 UTC (rev 6156) @@ -1,174 +1,174 @@ -# -*- coding: utf-8 -*- -"""Miscellaneous helper functions (not wiki-dependent)""" -# -# (C) Pywikipedia bot team, 2008 -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id: $' - -import sys -import threading -import time -import Queue - - -class ThreadedGenerator(threading.Thread): - """Look-ahead generator class. - - Runs a generator in a separate thread and queues the results; can - be called like a regular generator. - - Subclasses should override self.generator, I{not} self.run - - Important: the generator thread will stop itself if the generator's - internal queue is exhausted; but, if the calling program does not use - all the generated values, it must call the generator's stop() method to - stop the background thread. Example usage: - - >>> gen = ThreadedGenerator(target=xrange, args=(20,)) - >>> try: - ... for data in gen: - ... print data, - ... finally: - ... gen.stop() - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 - - """ - - def __init__(self, group=None, target=None, name="GeneratorThread", - args=(), kwargs=None, qsize=65536): - """Constructor. Takes same keyword arguments as threading.Thread. - - target must be a generator function (or other callable that returns - an iterable object). - - @param qsize: The size of the lookahead queue. The larger the qsize, - the more values will be computed in advance of use (which can eat - up memory and processor time). - @type qsize: int - - """ - if kwargs is None: - kwargs = {} - if target: - self.generator = target - if not hasattr(self, "generator"): - raise RuntimeError("No generator for ThreadedGenerator to run.") - self.args, self.kwargs = args, kwargs - threading.Thread.__init__(self, group=group, name=name) - self.queue = Queue.Queue(qsize) - self.finished = threading.Event() - - def __iter__(self): - """Iterate results from the queue.""" - if not self.isAlive() and not self.finished.isSet(): - self.start() - # if there is an item in the queue, yield it, otherwise wait - while not self.finished.isSet(): - try: - yield self.queue.get(True, 0.25) - except Queue.Empty: - pass - except KeyboardInterrupt: - self.stop() - - def stop(self): - """Stop the background thread.""" - self.finished.set() - - def run(self): - """Run the generator and store the results on the queue.""" - self.__gen = self.generator(*self.args, **self.kwargs) - for result in self.__gen: - while True: - if self.finished.isSet(): - return - try: - self.queue.put_nowait(result) - except Queue.Full: - time.sleep(0.25) - continue - break - # wait for queue to be emptied, then kill the thread - while not self.finished.isSet() and not self.queue.empty(): - time.sleep(0.25) - self.stop() - - -def itergroup(iterable, size): - """Make an iterator that returns lists of (up to) size items from iterable. - - Example: - - >>> i = itergroup(xrange(25), 10) - >>> print i.next() - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - >>> print i.next() - [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] - >>> print i.next() - [20, 21, 22, 23, 24] - >>> print i.next() - Traceback (most recent call last): - ... 
- StopIteration - - """ - group = [] - for item in iterable: - group.append(item) - if len(group) == size: - yield group - group = [] - if group: - yield group - - -class ThreadList(list): - """A simple threadpool class to limit the number of simultaneous threads. - - Any threading.Thread object can be added to the pool using the append() - method. If the maximum number of simultaneous threads has not been reached, - the Thread object will be started immediately; if not, the append() call - will block until the thread is able to start. - - >>> pool = ThreadList(limit=10) - >>> def work(): - ... time.sleep(1) - ... - >>> for x in xrange(20): - ... pool.append(threading.Thread(target=work)) - ... - - """ - def __init__(self, limit=sys.maxint, *args): - self.limit = limit - list.__init__(self, *args) - for item in list(self): - if not isinstance(threading.Thread, item): - raise TypeError("Cannot add '%s' to ThreadList" % type(item)) - - def active_count(self): - """Return the number of alive threads, and delete all non-alive ones.""" - count = 0 - for item in list(self): - if item.isAlive(): - count += 1 - else: - self.remove(item) - return count - - def append(self, thd): - if not isinstance(thd, threading.Thread): - raise TypeError("Cannot append '%s' to ThreadList" % type(thd)) - while self.active_count() >= self.limit: - time.sleep(2) - list.append(self, thd) - thd.start() - - -if __name__ == "__main__": - def _test(): - import doctest - doctest.testmod() - _test() +# -*- coding: utf-8 -*- +"""Miscellaneous helper functions (not wiki-dependent)""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id$' + +import sys +import threading +import time +import Queue + + +class ThreadedGenerator(threading.Thread): + """Look-ahead generator class. + + Runs a generator in a separate thread and queues the results; can + be called like a regular generator. + + Subclasses should override self.generator, I{not} self.run + + Important: the generator thread will stop itself if the generator's + internal queue is exhausted; but, if the calling program does not use + all the generated values, it must call the generator's stop() method to + stop the background thread. Example usage: + + >>> gen = ThreadedGenerator(target=xrange, args=(20,)) + >>> try: + ... for data in gen: + ... print data, + ... finally: + ... gen.stop() + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + + """ + + def __init__(self, group=None, target=None, name="GeneratorThread", + args=(), kwargs=None, qsize=65536): + """Constructor. Takes same keyword arguments as threading.Thread. + + target must be a generator function (or other callable that returns + an iterable object). + + @param qsize: The size of the lookahead queue. The larger the qsize, + the more values will be computed in advance of use (which can eat + up memory and processor time). 
+ @type qsize: int + + """ + if kwargs is None: + kwargs = {} + if target: + self.generator = target + if not hasattr(self, "generator"): + raise RuntimeError("No generator for ThreadedGenerator to run.") + self.args, self.kwargs = args, kwargs + threading.Thread.__init__(self, group=group, name=name) + self.queue = Queue.Queue(qsize) + self.finished = threading.Event() + + def __iter__(self): + """Iterate results from the queue.""" + if not self.isAlive() and not self.finished.isSet(): + self.start() + # if there is an item in the queue, yield it, otherwise wait + while not self.finished.isSet(): + try: + yield self.queue.get(True, 0.25) + except Queue.Empty: + pass + except KeyboardInterrupt: + self.stop() + + def stop(self): + """Stop the background thread.""" + self.finished.set() + + def run(self): + """Run the generator and store the results on the queue.""" + self.__gen = self.generator(*self.args, **self.kwargs) + for result in self.__gen: + while True: + if self.finished.isSet(): + return + try: + self.queue.put_nowait(result) + except Queue.Full: + time.sleep(0.25) + continue + break + # wait for queue to be emptied, then kill the thread + while not self.finished.isSet() and not self.queue.empty(): + time.sleep(0.25) + self.stop() + + +def itergroup(iterable, size): + """Make an iterator that returns lists of (up to) size items from iterable. + + Example: + + >>> i = itergroup(xrange(25), 10) + >>> print i.next() + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + >>> print i.next() + [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] + >>> print i.next() + [20, 21, 22, 23, 24] + >>> print i.next() + Traceback (most recent call last): + ... + StopIteration + + """ + group = [] + for item in iterable: + group.append(item) + if len(group) == size: + yield group + group = [] + if group: + yield group + + +class ThreadList(list): + """A simple threadpool class to limit the number of simultaneous threads. + + Any threading.Thread object can be added to the pool using the append() + method. If the maximum number of simultaneous threads has not been reached, + the Thread object will be started immediately; if not, the append() call + will block until the thread is able to start. + + >>> pool = ThreadList(limit=10) + >>> def work(): + ... time.sleep(1) + ... + >>> for x in xrange(20): + ... pool.append(threading.Thread(target=work)) + ... + + """ + def __init__(self, limit=sys.maxint, *args): + self.limit = limit + list.__init__(self, *args) + for item in list(self): + if not isinstance(threading.Thread, item): + raise TypeError("Cannot add '%s' to ThreadList" % type(item)) + + def active_count(self): + """Return the number of alive threads, and delete all non-alive ones.""" + count = 0 + for item in list(self): + if item.isAlive(): + count += 1 + else: + self.remove(item) + return count + + def append(self, thd): + if not isinstance(thd, threading.Thread): + raise TypeError("Cannot append '%s' to ThreadList" % type(thd)) + while self.active_count() >= self.limit: + time.sleep(2) + list.append(self, thd) + thd.start() + + +if __name__ == "__main__": + def _test(): + import doctest + doctest.testmod() + _test()
Property changes on: branches/rewrite/pywikibot/tools.py ___________________________________________________________________ Added: svn:keywords + Author Date Id Revision Added: svn:eol-style + native
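Finally, a short sketch of the tools helpers in use, mirroring the doctest examples embedded above; nothing here needs a wiki connection, only that the rewrite package is importable as pywikibot:

    import threading
    import time
    from pywikibot.tools import ThreadedGenerator, itergroup, ThreadList

    # Consume a look-ahead generator, always stopping its background thread.
    gen = ThreadedGenerator(target=xrange, args=(20,))
    try:
        for value in gen:
            print value,
    finally:
        gen.stop()

    # Group an iterable into batches of at most 10 items.
    for batch in itergroup(xrange(25), 10):
        print batch

    # Run at most 5 worker threads at the same time.
    pool = ThreadList(limit=5)
    for _ in xrange(20):
        pool.append(threading.Thread(target=time.sleep, args=(1,)))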