Revision: 6156
Author: russblau
Date: 2008-12-16 19:40:20 +0000 (Tue, 16 Dec 2008)
Log Message:
-----------
update properties
Modified Paths:
--------------
branches/rewrite/pywikibot/catlib.py
branches/rewrite/pywikibot/exceptions.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/pagegenerators.py
branches/rewrite/pywikibot/site.py
branches/rewrite/pywikibot/textlib.py
branches/rewrite/pywikibot/throttle.py
branches/rewrite/pywikibot/tools.py
Property Changed:
----------------
branches/rewrite/pywikibot/__init__.py
branches/rewrite/pywikibot/bot.py
branches/rewrite/pywikibot/catlib.py
branches/rewrite/pywikibot/exceptions.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/pagegenerators.py
branches/rewrite/pywikibot/site.py
branches/rewrite/pywikibot/textlib.py
branches/rewrite/pywikibot/throttle.py
branches/rewrite/pywikibot/tools.py
Property changes on: branches/rewrite/pywikibot/__init__.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Property changes on: branches/rewrite/pywikibot/bot.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Modified: branches/rewrite/pywikibot/catlib.py
===================================================================
--- branches/rewrite/pywikibot/catlib.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/catlib.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,22 +1,22 @@
-# -*- coding: utf-8 -*-
-"""
-WARNING: THIS MODULE EXISTS SOLELY TO PROVIDE BACKWARDS-COMPATIBILITY.
-
-Do not use in new scripts; use the source to find the appropriate
-function/method instead.
-
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-
-from pywikibot import Category
-
-
-def change_category(article, oldCat, newCat, comment=None, sortKey=None,
- inPlace=True):
- return article.change_category(oldCat, newCat, comment, sortKey, inPlace)
+# -*- coding: utf-8 -*-
+"""
+WARNING: THIS MODULE EXISTS SOLELY TO PROVIDE BACKWARDS-COMPATIBILITY.
+
+Do not use in new scripts; use the source to find the appropriate
+function/method instead.
+
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+
+from pywikibot import Category
+
+
+def change_category(article, oldCat, newCat, comment=None, sortKey=None,
+ inPlace=True):
+ return article.change_category(oldCat, newCat, comment, sortKey, inPlace)
Property changes on: branches/rewrite/pywikibot/catlib.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
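
The re-added catlib.py above is only a backwards-compatibility shim that forwards to Page.change_category(). A minimal usage sketch (the site comes from user-config.py; the page and category titles are placeholders, not part of this commit, and assume an English-language wiki):

    import pywikibot
    from pywikibot import catlib, Category

    site = pywikibot.Site()                        # default site from user-config.py
    article = pywikibot.Page(site, u"Example article")
    old_cat = Category(site, u"Category:Old name")
    new_cat = Category(site, u"Category:New name")

    # The old-style call simply delegates to the Page method, as the shim shows:
    catlib.change_category(article, old_cat, new_cat,
                           comment=u"Recategorizing", inPlace=True)
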
Modified: branches/rewrite/pywikibot/exceptions.py
===================================================================
--- branches/rewrite/pywikibot/exceptions.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/exceptions.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,87 +1,87 @@
-# -*- coding: utf-8 -*-
-"""
-Exception classes used throughout the framework.
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-
-import sys
-
-# TODO: These are copied from wikipedia.py; not certain that all of them
-# will be needed in the rewrite.
-
-class Error(Exception):
- """Wikipedia error"""
- def __init__(self, arg):
- try:
- self.string = arg.encode(sys.stderr.encoding, "xmlcharrefreplace")
- except (AttributeError, TypeError):
- self.string = arg.encode("ascii", "xmlcharrefreplace")
- def __str__(self):
- return self.string
-
-class NoUsername(Error):
- """Username is not in user-config.py"""
-
-class NoPage(Error):
- """Page does not exist"""
-
-class NoSuchSite(Error):
- """Site does not exist"""
-
-class IsRedirectPage(Error):
- """Page is a redirect page"""
-
-class IsNotRedirectPage(Error):
- """Page is not a redirect page"""
-
-class CircularRedirect(Error):
- """Page is a circular redirect
-
- Exception argument is the redirect target; this may be the same title
- as this page or a different title (in which case the target page directly
- or indirectly redirects back to this one)
-
- """
-
-class LockedPage(Error):
- """Page is locked"""
-
-class SectionError(Error):
- """The section specified by # does not exist"""
-
-class PageNotSaved(Error):
- """Saving the page has failed"""
-
-class EditConflict(PageNotSaved):
- """There has been an edit conflict while uploading the
page"""
-
-class SpamfilterError(PageNotSaved):
- """Saving the page has failed because the MediaWiki spam filter
detected a blacklisted URL."""
- def __init__(self, arg):
- self.url = arg
- self.args = arg,
-
-class ServerError(Error):
- """Got unexpected server response"""
-
-class BadTitle(Error):
- """Server responded with BadTitle."""
-
-# UserBlocked exceptions should in general not be caught. If the bot has
-# been blocked, the bot operator should address the reason for the block
-# before continuing.
-class UserBlocked(Error):
- """Your username or IP has been blocked"""
-
-class PageNotFound(Error):
- """Page not found in list"""
-
-class CaptchaError(Error):
- """Captcha is asked and config.solve_captcha ==
False."""
-
+# -*- coding: utf-8 -*-
+"""
+Exception classes used throughout the framework.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+
+import sys
+
+# TODO: These are copied from wikipedia.py; not certain that all of them
+# will be needed in the rewrite.
+
+class Error(Exception):
+ """Wikipedia error"""
+ def __init__(self, arg):
+ try:
+ self.string = arg.encode(sys.stderr.encoding, "xmlcharrefreplace")
+ except (AttributeError, TypeError):
+ self.string = arg.encode("ascii", "xmlcharrefreplace")
+ def __str__(self):
+ return self.string
+
+class NoUsername(Error):
+ """Username is not in user-config.py"""
+
+class NoPage(Error):
+ """Page does not exist"""
+
+class NoSuchSite(Error):
+ """Site does not exist"""
+
+class IsRedirectPage(Error):
+ """Page is a redirect page"""
+
+class IsNotRedirectPage(Error):
+ """Page is not a redirect page"""
+
+class CircularRedirect(Error):
+ """Page is a circular redirect
+
+ Exception argument is the redirect target; this may be the same title
+ as this page or a different title (in which case the target page directly
+ or indirectly redirects back to this one)
+
+ """
+
+class LockedPage(Error):
+ """Page is locked"""
+
+class SectionError(Error):
+ """The section specified by # does not exist"""
+
+class PageNotSaved(Error):
+ """Saving the page has failed"""
+
+class EditConflict(PageNotSaved):
+ """There has been an edit conflict while uploading the
page"""
+
+class SpamfilterError(PageNotSaved):
+ """Saving the page has failed because the MediaWiki spam filter
detected a blacklisted URL."""
+ def __init__(self, arg):
+ self.url = arg
+ self.args = arg,
+
+class ServerError(Error):
+ """Got unexpected server response"""
+
+class BadTitle(Error):
+ """Server responded with BadTitle."""
+
+# UserBlocked exceptions should in general not be caught. If the bot has
+# been blocked, the bot operator should address the reason for the block
+# before continuing.
+class UserBlocked(Error):
+ """Your username or IP has been blocked"""
+
+class PageNotFound(Error):
+ """Page not found in list"""
+
+class CaptchaError(Error):
+ """Captcha is asked and config.solve_captcha ==
False."""
+
Property changes on: branches/rewrite/pywikibot/exceptions.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
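
exceptions.py defines the error hierarchy that calling code is expected to catch; a minimal sketch of the pattern used throughout the framework (the page title is a placeholder):

    import pywikibot

    site = pywikibot.Site()
    page = pywikibot.Page(site, u"Some example title")
    try:
        text = page.get()
    except pywikibot.NoPage:
        text = u""                               # page does not exist
    except pywikibot.IsRedirectPage:
        text = page.getRedirectTarget().get()    # follow the redirect explicitly
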
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/page.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,1886 +1,1886 @@
-# -*- coding: utf-8 -*-
-"""
-Objects representing various types of MediaWiki pages.
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-import pywikibot
-from pywikibot import deprecate_arg
-from pywikibot import config
-import pywikibot.site
-import pywikibot.textlib
-
-import htmlentitydefs
-import logging
-import re
-import sys
-import threading
-import unicodedata
-import urllib
-
-logger = logging.getLogger("wiki")
-
-reNamespace = re.compile("^(.+?) *: *(.*)$")
-
-
-class Page(object):
- """Page: A MediaWiki page
-
- This object only implements internally methods that do not require
- reading from or writing to the wiki. All other methods are delegated
- to the Site object.
-
- """
-
- @deprecate_arg("insite", None)
- @deprecate_arg("defaultNamespace", None)
- def __init__(self, source, title=u"", ns=0):
- """Instantiate a Page object.
-
- Three calling formats are supported:
-
- - If the first argument is a Page, create a copy of that object.
- This can be used to convert an existing Page into a subclass
- object, such as Category or ImagePage. (If the title is also
- given as the second argument, creates a copy with that title;
- this is used when pages are moved.)
- - If the first argument is a Site, create a Page on that Site
- using the second argument as the title (may include a section),
- and the third as the namespace number. The namespace number is
- mandatory, even if the title includes the namespace prefix. This
- is the preferred syntax when using an already-normalized title
- obtained from api.php or a database dump. WARNING: may produce
- invalid objects if page title isn't in normal form!
- - If the first argument is a Link, create a Page from that link.
- This is the preferred syntax when using a title scraped from
- wikitext, URLs, or another non-normalized source.
-
- @param source: the source of the page
- @type source: Link, Page (or subclass), or Site
- @param title: normalized title of the page; required if source is a
- Site, ignored otherwise
- @type title: unicode
- @param ns: namespace number; required if source is a Site, ignored
- otherwise
- @type ns: int
-
- """
- if isinstance(source, pywikibot.site.BaseSite):
- self._site = source
- if ns not in source.namespaces():
- raise pywikibot.Error(
- "Invalid namespace '%i' for site %s."
- % (ns, source.sitename()))
- self._ns = ns
- if ns and not title.startswith(source.namespace(ns)+u":"):
- title = source.namespace(ns) + u":" + title
- elif not ns and u":" in title:
- pos = title.index(u':')
- nsindex = source.ns_index(title[ :pos])
- if nsindex:
- self._ns = nsindex
- if u"#" in title:
- title, self._section = title.split(u"#", 1)
- else:
- self._section = None
- if not title:
- raise pywikibot.Error(
- "Page object cannot be created from Site without
title.")
- self._title = title
- elif isinstance(source, Page):
- # copy all of source's attributes to this object
- self.__dict__ = source.__dict__
- if title:
- # overwrite title
- if ":" in title:
- prefix = title[ :title.index(":")]
- self._ns = site.ns_index(prefix)
- if self._ns is None:
- self._ns = 0
- else:
- title = title[title.index(":")+1 : ].strip("
_")
- self._title = "%s:%s" % (
- self.site().namespace(self._ns),
- self._title)
- else:
- self._ns = 0
- if "#" in title:
- self._section = title[title.index("#") + 1 : ].strip("
_")
- title = title[ : title.index("#")].strip(" _")
- self._title = title
- elif isinstance(source, Link):
- self._site = source.site
- self._section = source.section
- self._ns = source.namespace
- self._title = source.title
- # reassemble the canonical title from components
- if self._ns:
- self._title = "%s:%s" % (self.site().namespace(self._ns),
- self._title)
- else:
- raise pywikibot.Error(
- "Invalid argument type '%s' in Page constructor: %s"
- % (type(source), source))
- if self._section is not None:
- self._title = self._title + "#" + self._section
- self._revisions = {}
-
- def site(self):
- """Return the Site object for the wiki on which this Page
resides."""
- return self._site
-
- def namespace(self):
- """Return the number of the namespace of the
page."""
- return self._ns
-
- @deprecate_arg("decode", None)
- @deprecate_arg("savetitle", "asUrl")
- def title(self, underscore=False, savetitle=False, withNamespace=True,
- withSection=True, asUrl=False, asLink=False,
- allowInterwiki=True, forceInterwiki=False, textlink=False,
- as_filename=False):
- """Return the title of this Page, as a Unicode string.
-
- @param underscore: if true, replace all ' ' characters with '_'
- @param withNamespace: if false, omit the namespace prefix
- @param withSection: if false, omit the section
- @param asUrl: if true, quote title as if in an URL
- @param asLink: if true, return the title in the form of a wikilink
- @param allowInterwiki: (only used if asLink is true) if true, format
- the link as an interwiki link if necessary
- @param forceInterwiki: (only used if asLink is true) if true, always
- format the link as an interwiki link
- @param textlink: (only used if asLink is true) if true, place a ':'
- before Category: and Image: links
- @param as_filename: if true, replace any characters that are unsafe
- in filenames
-
- """
- title = self._title
- if not withNamespace and self._ns != 0:
- title = title.split(u':', 1)[1]
- if not withSection and self._section:
- title = title.split(u'#', 1)[0]
- if underscore or asUrl:
- title = title.replace(u' ', u'_')
- if asUrl:
- encodedTitle = title.encode(self.site().encoding())
- title = urllib.quote(encodedTitle)
- if asLink:
- if forceInterwiki or (allowInterwiki and
- (self.site().family.name != config.family
- or self.site().code != config.mylang)):
- if self.site().family.name != config.family \
- and self.site().family.name != self.site().code:
- return u'[[%s:%s:%s]]' % (self.site().family.name,
- self.site().code,
- self._title)
- else:
- # use this form for sites like commons, where the
- # code is the same as the family name
- return u'[[%s:%s]]' % (self.site().code,
- self._title)
- elif textlink and (self.isImage() or self.isCategory()):
- return u'[[:%s]]' % title
- else:
- return u'[[%s]]' % title
- if as_filename:
- # Replace characters that are not possible in file names on some
- # systems.
- # Spaces are possible on most systems, but are bad for URLs.
- for forbidden in ':*?/\\ ':
- title = title.replace(forbidden, '_')
- return title
-
- @deprecate_arg("decode", None)
- @deprecate_arg("underscore", None)
- def section(self):
- """Return the name of the section this Page refers to.
-
- The section is the part of the title following a '#' character, if
- any. If no section is present, return None.
-
- """
- if self._section:
- return self._section
- else:
- return None
-
- def __str__(self):
- """Return a console representation of the
pagelink."""
- return self.title(asLink=True, forceInterwiki=True
- ).encode(sys.stderr.encoding)
-
- def __unicode__(self):
- return self.title(asLink=True, forceInterwiki=True)
-
- def __repr__(self):
- """Return a more complete string
representation."""
- return u"%s(%s)" % (self.__class__.__name__,
- self.title().encode(sys.stderr.encoding))
-
- def __cmp__(self, other):
- """Test for equality and inequality of Page objects.
-
- Page objects are "equal" if and only if they are on the same site
- and have the same normalized title, including section if any.
-
- Page objects are sortable by namespace first, then by title.
-
- """
- if not isinstance(other, Page):
- # especially, return -1 if other is None
- return -1
- if not self.site() == other.site():
- return cmp(self.site(), other.site())
- if self.namespace() != other.namespace():
- return cmp(self.namespace(), other.namespace())
- owntitle = self.title(withNamespace=False)
- othertitle = other.title(withNamespace=False)
- return cmp(owntitle, othertitle)
-
- def __hash__(self):
- # Pseudo method that makes it possible to store Page objects as keys
- # in hash-tables. This relies on the fact that the string
- # representation of an instance can not change after the construction.
- return hash(unicode(self))
-
- def autoFormat(self):
- """Return L{date.autoFormat} dictName and value, if any.
-
- Value can be a year, date, etc., and dictName is 'YearBC',
- 'Year_December', or another dictionary name. Please note that two
- entries may have exactly the same autoFormat, but be in two
- different namespaces, as some sites have categories with the
- same names. Regular titles return (None, None).
-
- """
- if not hasattr(self, '_autoFormat'):
- from pywikibot import date
- self._autoFormat = date.getAutoFormat(
- self.site().code,
- self.title(withNamespace=False)
- )
- return self._autoFormat
-
- def isAutoTitle(self):
- """Return True if title of this Page is in the autoFormat
dictionary."""
- return self.autoFormat()[0] is not None
-
- @deprecate_arg("throttle", None)
- @deprecate_arg("nofollow_redirects", None)
- @deprecate_arg("change_edit_time", None)
- def get(self, force=False, get_redirect=False, sysop=False):
- """Return the wiki-text of the page.
-
- This will retrieve the page from the server if it has not been
- retrieved yet, or if force is True. This can raise the following
- exceptions that should be caught by the calling code:
-
- - NoPage: The page does not exist
- - IsRedirectPage: The page is a redirect. The argument of the
- exception is the title of the page it redirects to.
- - SectionError: The section does not exist on a page with a #
- link
-
- @param force: reload all page attributes, including errors.
- @param get_redirect: return the redirect text, do not follow the
- redirect, do not raise an exception.
- @param sysop: if the user has a sysop account, use it to retrieve
- this page
-
- """
- if force:
- # When forcing, we retry the page no matter what. Old exceptions
- # do not apply any more.
- for attr in ['_redirarg', '_getexception']:
- if hasattr(self, attr):
- delattr(self,attr)
- else:
- # Make sure we re-raise an exception we got on an earlier attempt
- if hasattr(self, '_redirarg') and not get_redirect:
- raise pywikibot.IsRedirectPage, self._redirarg
- elif hasattr(self, '_getexception'):
- raise self._getexception
- if force or not hasattr(self, "_revid") \
- or not self._revid in self._revisions \
- or self._revisions[self._revid].text is None:
- self.site().loadrevisions(self, getText=True, sysop=sysop)
- # TODO: Exception handling for no-page, redirects, etc.
-
- return self._revisions[self._revid].text
-
- @deprecate_arg("throttle", None)
- @deprecate_arg("nofollow_redirects", None)
- @deprecate_arg("change_edit_time", None)
- def getOldVersion(self, oldid, force=False, get_redirect=False,
- sysop=False):
- """Return text of an old revision of this page; same options as
get().
-
- @param oldid: The revid of the revision desired.
-
- """
- if force or not oldid in self._revisions \
- or self._revisions[oldid].text is None:
- self.site().loadrevisions(self, getText=True, revids=oldid,
- sysop=sysop)
- # TODO: what about redirects, errors?
- return self._revisions[oldid].text
-
- def permalink(self):
- """Return the permalink URL for current revision of this
page."""
- return "%s://%s/%sindex.php?title=%s&oldid=%s" \
- % (self.site().protocol(),
- self.site().hostname(),
- self.site().scriptpath(),
- self.title(asUrl=True),
- self.latestRevision())
-
- def latestRevision(self):
- """Return the current revision id for this
page."""
- if not hasattr(self, '_revid'):
- self.site().loadrevisions(self)
- return self._revid
-
- def _textgetter(self):
- """Return the current (edited) wikitext, loading it if
necessary."""
- if not hasattr(self, '_text') or self._text is None:
- try:
- self._text = self.get()
- except pywikibot.NoPage:
- # TODO: what other exceptions might be returned?
- self._text = u""
- return self._text
-
- def _textsetter(self, value):
- """Update the edited wikitext"""
- self._text = unicode(value)
-
- def _cleartext(self):
- """Delete the edited wikitext"""
- if hasattr(self, "_text"):
- del self._text
-
- text = property(_textgetter, _textsetter, _cleartext,
- "The edited wikitext (unicode) of this Page")
-
- def expand_text(self):
- """Return the page text with all templates
expanded."""
- req = pywikibot.data.api.Request(action="expandtemplates",
- text=self.text,
- title=self.title(withSection=False),
- site=self.site())
- result = req.submit()
- return result["expandtemplates"]["*"]
-
- def userName(self):
- """Return name or IP address of last user to edit
page."""
- return self._revisions[self.latestRevision()].user
-
- def isIpEdit(self):
- """Return True if last editor was unregistered."""
- return self._revisions[self.latestRevision()].anon
-
- def editTime(self):
- """Return timestamp (in ISO 8601 format) of last revision to
page."""
- return self._revisions[self.latestRevision()].timestamp
-
- def previousRevision(self):
- """Return the revision id for the previous revision of this
Page."""
- vh = self.getVersionHistory(revCount=2)
- revkey = sorted(self._revisions.keys(), reverse=True)[1]
- return revkey
-
- def exists(self):
- """Return True if page exists on the wiki, even if it's a
redirect.
-
- If the title includes a section, return False if this section isn't
- found.
-
- """
- return self.site().page_exists(self)
-
- def isRedirectPage(self):
- """Return True if this is a redirect, False if not or not
existing."""
- return self.site().page_isredirect(self)
-
- def isEmpty(self):
- """Return True if the page text has less than 4 characters.
-
- Character count ignores language links and category links.
- Can raise the same exceptions as get().
-
- """
- txt = self.get()
- txt = pywikibot.textlib.removeLanguageLinks(txt, site = self.site())
- txt = pywikibot.textlib.removeCategoryLinks(txt, site = self.site())
- if len(txt) < 4:
- return True
- else:
- return False
-
- def isTalkPage(self):
- """Return True if this page is in any talk
namespace."""
- ns = self.namespace()
- return ns >= 0 and ns % 2 == 1
-
- def toggleTalkPage(self):
- """Return other member of the article-talk page pair for this
Page.
-
- If self is a talk page, returns the associated content page;
- otherwise, returns the associated talk page. The returned page need
- not actually exist on the wiki.
-
- Returns None if self is a special page.
-
- """
- ns = self.namespace()
- if ns < 0: # Special page
- return None
- if self.isTalkPage():
- if self.namespace() == 1:
- return Page(self.site(), self.title(withNamespace=False))
- else:
- return Page(self.site(),
- self.site().namespace(ns - 1) + ':'
- + self.title(withNamespace=False))
- else:
- return Page(self.site(),
- self.site().namespace(ns + 1) + ':'
- + self.title(withNamespace=False))
-
- def isCategory(self):
- """Return True if the page is a Category, False
otherwise."""
- return self.namespace() == 14
-
- def isImage(self):
- """Return True if this is an image description page, False
otherwise."""
- return self.namespace() == 6
-
- def isDisambig(self):
- """Return True if this is a disambiguation page, False otherwise.
-
- Relies on the presence of specific templates, identified in
- the Family file or on a wiki page, to identify disambiguation
- pages.
-
- By default, loads a list of template names from the Family file;
- if the value in the Family file is None, looks for the list on
- [[MediaWiki:Disambiguationspage]].
-
- """
- if not hasattr(self, "_isDisambig"):
- if not hasattr(self.site(), "_disambigtemplates"):
- self.site()._disambigtemplates = \
- self.site().family.disambig(self.site().code)
- if self.site()._disambigtemplates is None:
- try:
- disambigpages = Page(self.site(),
- "MediaWiki:Disambiguationspage")
- self.site()._disambigtemplates = [
- link.title(withNamespace=False)
- for link in disambigpages.linkedPages()
- if link.namespace() == 10
- ]
- except NoPage:
- self.site()._disambigtemplates = ['Disambig']
- for t in self.templates():
- if t.title(withNamespace=False) in self.site()._disambigtemplates:
- self._isDisambig = True
- break
- else:
- self._isDisambig = False
- return self._isDisambig
-
- def getReferences(self, follow_redirects=True, withTemplateInclusion=True,
- onlyTemplateInclusion=False, redirectsOnly=False,
- namespaces=None):
- """Return an iterator all pages that refer to or embed the page.
-
- If you need a full list of referring pages, use
- C{pages = list(s.getReferences())}
-
- @param follow_redirects: if True, also iterate pages that link to a
- redirect pointing to the page.
- @param withTemplateInclusion: if True, also iterate pages where self
- is used as a template.
- @param onlyTemplateInclusion: if True, only iterate pages where self
- is used as a template.
- @param redirectsOnly: if True, only iterate redirects to self.
- @param namespaces: only iterate pages in these namespaces
-
- """
- # N.B.: this method intentionally overlaps with backlinks() and
- # embeddedin(). Depending on the interface, it may be more efficient
- # to implement those methods in the site interface and then combine
- # the results for this method, or to implement this method and then
- # split up the results for the others.
- return self.site().pagereferences(
- self, follow_redirects, redirectsOnly,
- withTemplateInclusion, onlyTemplateInclusion,
- namespaces)
-
- def backlinks(self, followRedirects=True, filterRedirects=None,
- namespaces=None):
- """Return an iterator for pages that link to this page.
-
- @param followRedirects: if True, also iterate pages that link to a
- redirect pointing to the page.
- @param filterRedirects: if True, only iterate redirects; if False,
- omit redirects; if None, do not filter
- @param namespaces: only iterate pages in these namespaces
-
- """
- return self.site().pagebacklinks(self, followRedirects, filterRedirects,
- namespaces)
-
- def embeddedin(self, filter_redirects=None, namespaces=None):
- """Return an iterator for pages that embed this page as a
template.
-
- @param filterRedirects: if True, only iterate redirects; if False,
- omit redirects; if None, do not filter
- @param namespaces: only iterate pages in these namespaces
-
- """
- return self.site().page_embeddedin(self, filter_redirects, namespaces)
-
- def canBeEdited(self):
- """Return bool indicating whether this page can be edited.
-
- This returns True if and only if:
- - page is unprotected, and bot has an account for this site, or
- - page is protected, and bot has a sysop account for this site.
-
- """
- return self.site().page_can_be_edited(self)
-
- def botMayEdit(self):
- """Return True if this page allows bots to edit it.
-
- This will be True if the page doesn't contain {{bots}} or
- {{nobots}}, or it contains them and the active bot is allowed to
- edit this page. (This method is only useful on those sites that
- recognize the bot-exclusion protocol; on other sites, it will always
- return True.)
-
- The framework enforces this restriction by default. It is possible
- to override this by setting ignore_bot_templates=True in
- user_config.py, or using page.put(force=True).
-
- """ # TODO: move this to Site object?
- if config.ignore_bot_templates: #Check the "master ignore switch"
- return True
- try:
- templates = self.templatesWithParams();
- except (pywikibot.NoPage,
- pywikibot.IsRedirectPage,
- pywikibot.SectionError):
- return True
- for template in templates:
- title = template[0].title(withNamespace=False)
- if title == 'Nobots':
- return False
- elif title == 'Bots':
- if len(template[1]) == 0:
- return True
- else:
- (ttype, bots) = template[1][0].split('=', 1)
- bots = bots.split(',')
- if ttype == 'allow':
- if 'all' in bots or username in bots:
- return True
- else:
- return False
- if ttype == 'deny':
- if 'all' in bots or username in bots:
- return False
- else:
- return True
- # no restricting template found
- return True
-
- def save(self, comment=None, watch=None, minor=True, force=False,
- async=False, callback=None):
- """Save the current contents of page's text to the wiki.
-
- @param comment: The edit summary for the modification (optional, but
- most wikis strongly encourage its use)
- @type comment: unicode
- @param watch: if True, add or if False, remove this Page to/from bot
- user's watchlist; if None, leave watchlist status unchanged
- @type watch: bool or None
- @param minor: if True, mark this edit as minor
- @type minor: bool
- @param force: if True, ignore botMayEdit() setting
- @type force: bool
- @param async: if True, launch a separate thread to save
- asynchronously
- @param callback: a callable object that will be called after the
- page put operation. This object must take two arguments: (1) a
- Page object, and (2) an exception instance, which will be None
- if the page was saved successfully. The callback is intended for
- use by bots that need to keep track of which saves were
- successful.
-
- """
- if not comment:
- comment = pywikibot.default_comment # needs to be defined
- if watch is None:
- unwatch = False
- watch = False
- else:
- unwatch = not watch
- if not force and not self.botMayEdit:
- raise pywikibot.PageNotSaved(
- "Page %s not saved; editing restricted by {{bots}} template"
- % self.title(asLink=True))
- if async:
- thd = threading.Thread(
- target=self._save,
- args=(comment, minor, watch, unwatch, callback)
- )
- pywikibot.threadpool.append(thd)
- thd.start()
- else:
- self._save(comment, minor, watch, unwatch, callback)
-
- def _save(self, comment, minor, watch, unwatch, callback):
- err = None
- try:
- done = self.site().editpage(self, summary=comment, minor=minor,
- watch=watch, unwatch=unwatch)
- if not done:
- logger.warn("Page %s not saved" % self.title(asLink=True))
- else:
- logger.info("Page %s saved" % self.title(asLink=True))
- except pywikibot.Error, err:
- logger.exception("Error saving page %s" % self.title(asLink=True))
- if callback:
- callback(self, err)
-
- def put(self, newtext, comment=u'', watchArticle=None, minorEdit=True,
- force=False, async=False, callback=None):
- """Save the page with the contents of the first argument as the
text.
-
- This method is maintained primarily for backwards-compatibility.
- For new code, using Page.save() is preferred. See save() method
- docs for all parameters not listed here.
-
- @param newtext: The complete text of the revised page.
- @type newtext: unicode
-
- """
- self.text = newtext
- return self.save(comment, watchArticle, minorEdit, force,
- async, callback)
-
- def put_async(self, newtext, comment=u'', watchArticle=None,
- minorEdit=True, force=False, callback=None):
- """Put page on queue to be saved to wiki asynchronously.
-
- Asynchronous version of put (takes the same arguments), which places
- pages on a queue to be saved by a daemon thread. All arguments are
- the same as for .put(). This version is maintained solely for
- backwards-compatibility.
-
- """
- return self.put(self, newtext, comment, watchArticle,
- minorEdit, force, callback, async=True)
-
- def linkedPages(self):
- """Iterate Pages that this Page links to.
-
- Only returns pages from "normal" internal links. Image and category
- links are omitted unless prefixed with ":". Embedded templates are
- omitted (but links within them are returned). All interwiki and
- external links are omitted.
-
- @return: a generator that yields Page objects.
-
- """
- return self.site().pagelinks(self)
-
- def interwiki(self, expand=True):
- """Iterate interwiki links in the page text, excluding language
links.
-
- @param expand: if True (default), include interwiki links found in
- templates transcluded onto this page; if False, only iterate
- interwiki links found in this page's own wikitext
- @return: a generator that yields Link objects
-
- """
- # This function does not exist in the API, so it has to be
- # implemented by screen-scraping
- if expand:
- text = self.expand_text()
- else:
- text = self.text
- for linkmatch in pywikibot.link_regex.finditer(
- pywikibot.textlib.removeDisabledParts(text)):
- linktitle = linkmatch.group("title")
- link = Link(linktitle, self.site())
- # only yield links that are to a different site and that
- # are not language links
- try:
- if link.site != self.site():
- if linktitle.lstrip().startswith(":"):
- # initial ":" indicates not a language link
- yield link
- elif link.site.family != self.site().family:
- # link to a different family is not a language link
- yield link
- except pywikibot.Error:
- # ignore any links with invalid contents
- continue
-
- def langlinks(self):
- """Iterate all interlanguage links on this page.
-
- @return: a generator that yields Link objects.
-
- """
- return self.site().pagelanglinks(self)
-
- @deprecate_arg("followRedirects", None)
- @deprecate_arg("loose", None)
- def imagelinks(self, followRedirects=None, loose=None):
- """Iterate ImagePage objects for images displayed on this Page.
-
- @return: a generator that yields ImagePage objects.
-
- """
- return self.site().pageimages(self)
-
- def templates(self):
- """Iterate Page objects for templates used on this Page.
-
- Template parameters are ignored. This method only returns embedded
- templates, not template pages that happen to be referenced through
- a normal link.
-
- """
- return self.site().pagetemplates(self)
-
- def templatesWithParams(self):
- """Iterate templates used on this Page.
-
- @return: a generator that yields a tuple for each use of a template
- in the page, with the template Page as the first entry and a list of
- parameters as the second entry.
-
- """
- templates = pywikibot.textlib.extract_templates_and_params(self.text)
- # backwards-compatibility: convert the dict returned as the second
- # element into a list in the format used by old scripts
- result = []
- for template in templates:
- args = template[1]
- positional = []
- named = {}
- for key in sorted(args.keys()):
- try:
- int(key)
- except ValueError:
- named[key] = args[key]
- else:
- positional.append(args[key])
- for name in named:
- positional.append("%s=%s" % (name, named[name]))
- result.append((pywikibot.Page(
- pywikibot.Link(template[0], self.site())),
- positional))
- return result
-
- @deprecate_arg("nofollow_redirects", None)
- def categories(self, withSortKey=False):
- """Iterate categories that the article is in.
-
- @param withSortKey: if True, include the sort key in each Category.
- @return: a generator that yields Category objects.
-
- """
- return self.site().pagecategories(self, withSortKey=withSortKey)
-
- def extlinks(self):
- """Iterate all external URLs (not interwiki links) from this
page.
-
- @return: a generator that yields unicode objects containing URLs.
-
- """
- return self.site().page_extlinks(self)
-
- def getRedirectTarget(self):
- """Return a Page object for the target this Page redirects to.
-
- If this page is not a redirect page, will raise an IsNotRedirectPage
- exception. This method also can raise a NoPage exception.
-
- """
- if not self.isRedirectPage():
- raise pywikibot.IsNotRedirectPage
- if not isinstance(self._redir, Page):
- self.site().getredirtarget(self)
- return self._redir
-
- @deprecate_arg("forceReload", None)
- def getVersionHistory(self, reverseOrder=False, getAll=False,
- revCount=500):
- """Load the version history page and return history information.
-
- Return value is a list of tuples, where each tuple represents one
- edit and is built of revision id, edit date/time, user name, and
- edit summary. Starts with the most current revision, unless
- reverseOrder is True. Defaults to getting the first revCount edits,
- unless getAll is True.
-
- """
- if getAll:
- limit = None
- else:
- limit = revCount
- self.site().loadrevisions(self, getText=False, rvdir=reverseOrder,
- limit=limit)
- if getAll:
- revCount = len(self._revisions)
- return [ ( self._revisions[rev].revid,
- self._revisions[rev].timestamp,
- self._revisions[rev].user,
- self._revisions[rev].comment
- ) for rev in sorted(self._revisions.keys(),
- reverse=not reverseOrder)[ : revCount]
- ]
-
- def getVersionHistoryTable(self, forceReload=False, reverseOrder=False,
- getAll=False, revCount=500):
- """Return the version history as a wiki table."""
- result = '{| border="1"\n'
- result += '! oldid || date/time || username || edit summary\n'
- for oldid, time, username, summary \
- in self.getVersionHistory(forceReload=forceReload,
- reverseOrder=reverseOrder,
- getAll=getAll, revCount=revCount):
- result += '|----\n'
- result += '| %s || %s || %s || <nowiki>%s</nowiki>\n'\
- % (oldid, time, username, summary)
- result += '|}\n'
- return result
-
- def fullVersionHistory(self):
- """Iterate all previous versions including wikitext.
-
- @return: A generator that yields tuples consisting of revision ID,
- edit date/time, user name and content
- """
- return self.site().loadrevisions(self, withText=True)
-
- def contributingUsers(self):
- """Return a set of usernames (or IPs) of users who edited this
page."""
- edits = self.getVersionHistory()
- users = set([edit[2] for edit in edits])
- return users
-
- @deprecate_arg("throttle", None)
- def move(self, newtitle, reason=None, movetalkpage=True, sysop=False,
- deleteAndMove=False, safe=True):
- """Move this page to a new title.
-
- @param newtitle: The new page title.
- @param reason: The edit summary for the move.
- @param movetalkpage: If true, move this page's talk page (if it exists)
- @param sysop: Try to move using sysop account, if available
- @param deleteAndMove: if move succeeds, delete the old page
- (usually requires sysop privileges, depending on wiki settings)
- @param safe: If false, attempt to delete existing page at newtitle
- (if there is one) and then move this page to that title
-
- """
- if reason is None:
- logger.info(u'Moving %s to [[%s]].'
- % (self.title(asLink=True), newtitle))
- reason = pywikibot.input(u'Please enter a reason for the move:')
- # TODO: implement "safe" parameter
- # TODO: implement "sysop" parameter
- return self.site().movepage(self, newtitle, reason,
- movetalk=movetalkpage,
- noredirect=deleteAndMove)
-
- @deprecate_arg("throttle", None)
- def delete(self, reason=None, prompt=True, throttle=None, mark=False):
- """Deletes the page from the wiki. Requires administrator status.
-
- @param reason: The edit summary for the deletion.
- @param prompt: If true, prompt user for confirmation before deleting.
- @param mark: if true, and user does not have sysop rights, place a
- speedy-deletion request on the page instead.
-
- """
- if reason is None:
- logger.info(u'Deleting %s.' % (self.title(asLink=True)))
- reason = pywikibot.input(u'Please enter a reason for the deletion:')
- answer = u'y'
- if prompt and not hasattr(self.site(), '_noDeletePrompt'):
- answer = pywikibot.inputChoice(u'Do you want to delete %s?'
- % self.title(asLink = True, forceInterwiki = True),
- ['Yes', 'No', 'All'],
- ['Y', 'N', 'A'],
- 'N')
- if answer in ['a', 'A']:
- answer = 'y'
- self.site()._noDeletePrompt = True
- if answer in ['y', 'Y']:
- return self.site().delete(self, reason, mark=mark)
-
- def loadDeletedRevisions(self):
- """Retrieve all deleted revisions for this Page from
Special/Undelete.
-
- Stores all revisions' timestamps, dates, editors and comments in
- self._deletedRevs attribute.
-
- @return: list of timestamps (which can be used to retrieve revisions
- later on).
-
- """
- return self.site().loadDeletedRevisions(self)
-
- def getDeletedRevision(self, timestamp, retrieveText=False):
- """Return a particular deleted revision by timestamp.
-
- @return: a list of [date, editor, comment, text, restoration
- marker]. text will be None, unless retrieveText is True (or has
- been retrieved earlier). If timestamp is not found, returns
- None.
-
- """
- return self.site().getDeletedRevision(self, timestamp,
- getText=retrieveText)
-
- def markDeletedRevision(self, timestamp, undelete=True):
- """Mark the revision identified by timestamp for undeletion.
-
- @param undelete: if False, mark the revision to remain deleted.
-
- """
- if self._deletedRevs == None:
- self.loadDeletedRevisions()
- if not self._deletedRevs.has_key(timestamp):
- #TODO: Throw an exception?
- return None
- self._deletedRevs[timestamp][4] = undelete
- self._deletedRevsModified = True
-
- @deprecate_arg("throttle", None)
- def undelete(self, comment=None):
- """Undelete revisions based on the markers set by previous calls.
-
- If no calls have been made since loadDeletedRevisions(), everything
- will be restored.
-
- Simplest case::
- Page(...).undelete('This will restore all revisions')
-
- More complex::
- pg = Page(...)
- revs = pg.loadDeletedRevsions()
- for rev in revs:
- if ... #decide whether to undelete a revision
- pg.markDeletedRevision(rev) #mark for undeletion
- pg.undelete('This will restore only selected revisions.')
-
- @param comment: The undeletion edit summary.
-
- """
- if comment is None:
- logger.info(u'Preparing to undelete %s.'
- % (self.title(asLink=True)))
- comment = pywikibot.input(
- u'Please enter a reason for the undeletion:')
- return self.site().undelete(self, comment)
-
- @deprecate_arg("throttle", None)
- def protect(self, edit='sysop', move='sysop', unprotect=False,
- reason=None, prompt=True):
- """(Un)protect a wiki page. Requires administrator status.
-
- Valid protection levels (in MediaWiki 1.12) are '' (equivalent to
- 'none'), 'autoconfirmed', and 'sysop'.
-
- @param edit: Level of edit protection
- @param move: Level of move protection
- @param unprotect: If true, unprotect the page (equivalent to setting
- all protection levels to '')
- @param reason: Edit summary.
- @param prompt: If true, ask user for confirmation.
-
- """
- if reason is None:
- if unprotect:
- un = u'un'
- else:
- un = u''
- logger.info(u'Preparing to %sprotect %s.'
- % (un, self.title(asLink=True)))
- reason = pywikibot.input(u'Please enter a reason for the action:')
- if unprotect:
- edit = move = ""
- answer = 'y'
- if prompt and not hasattr(self.site(), '_noProtectPrompt'):
- answer = pywikibot.inputChoice(
- u'Do you want to change the protection level of %s?'
- % self.title(asLink=True, forceInterwiki = True),
-                ['Yes', 'No', 'All'], ['Y', 'N', 'A'], 'N')
- if answer in ['a', 'A']:
- answer = 'y'
- self.site()._noProtectPrompt = True
- if answer in ['y', 'Y']:
- return self.site().protect(self, edit, move, reason)
-
- def change_category(article, oldCat, newCat, comment=None, sortKey=None,
- inPlace=True):
- """Remove page from oldCat and add it to newCat.
-
- oldCat and newCat should be Category objects.
- If newCat is None, the category will be removed.
-
- """ # TODO: document remaining arguments
- cats = self.categories(get_redirect=True)
- site = self.site()
- changesMade = False
-
- if not self.canBeEdited():
- pywikibot.output(u"Can't edit %s, skipping it..."
- % self.title(asLink=True))
- return False
- if inPlace == True:
- newtext = pywikibot.textlib.replaceCategoryInPlace(
- self.text, oldCat, newCat)
- if newtext == self.text:
- pywikibot.output(
- u'No changes in made in page %s.'
- % self.title(asLink=True))
- return False
- try:
- self.put(newtext, comment)
- return True
- except pywikibot.EditConflict:
- pywikibot.output(
- u'Skipping %s because of edit conflict'
- % self.title(asLink=True))
- except pywikibot.LockedPage:
- pywikibot.output(u'Skipping locked page %s'
- % self.title(asLink=True))
- except pywikibot.SpamfilterError, error:
- pywikibot.output(
- u'Changing page %s blocked by spam filter (URL=%s)'
- % (self.title(asLink=True), error.url))
- except pywikibot.NoUsername:
- pywikibot.output(
- u"Page %s not saved; sysop privileges required."
- % self.title(asLink=True))
- except pywikibot.PageNotSaved, error:
- pywikibot.output(u"Saving page %s failed: %s"
- % (self.title(asLink=True), error.message))
- return False
-
- # This loop will replace all occurrences of the category to be changed,
- # and remove duplicates.
- newCatList = []
- newCatSet = set()
- for i in range(len(cats)):
- cat = cats[i]
- if cat == oldCat:
- changesMade = True
- if not sortKey:
- sortKey = cat.sortKey
- if newCat:
- if newCat.title() not in newCatSet:
- newCategory = Category(site, newCat.title(),
- sortKey=sortKey)
- newCatSet.add(newCat.title())
- newCatList.append(newCategory)
- elif cat.title() not in newCatSet:
- newCatSet.add(cat.title())
- newCatList.append(cat)
-
- if not changesMade:
- pywikibot.output(u'ERROR: %s is not in category %s!'
- % (self.title(asLink=True), oldCat.title()))
- else:
- try:
- text = pywikibot.textlib.replaceCategoryLinks(self.text,
- newCatList)
- except ValueError:
- # Make sure that the only way replaceCategoryLinks() can return
- # a ValueError is in the case of interwiki links to self.
- pywikibot.output(
- u'Skipping %s because of interwiki link to self' % self)
- try:
- self.put(text, comment)
- except pywikibot.EditConflict:
- pywikibot.output(
- u'Skipping %s because of edit conflict' % self.title())
- except pywikibot.SpamfilterError, e:
- pywikibot.output(
- u'Skipping %s because of blacklist entry %s'
- % (self.title(), e.url))
- except pywikibot.LockedPage:
- pywikibot.output(
- u'Skipping %s because page is locked' % self.title())
- except pywikibot.PageNotSaved, error:
- pywikibot.output(u"Saving page %s failed: %s"
- % (self.title(asLink=True), error.message))
-
-######## DEPRECATED METHODS ########
-
- def encoding(self):
- """DEPRECATED: use Site.encoding() instead"""
- logger.debug(u"Page.encoding() is deprecated; use Site.encoding().")
- return self.site().encoding()
-
- def titleWithoutNamespace(self, underscore=False):
- """DEPRECATED: use self.title(withNamespace=False)
instead."""
- logger.debug(
- u"Page.titleWithoutNamespace() method is deprecated.")
- return self.title(underscore=underscore, withNamespace=False,
- withSection=False)
-
- def titleForFilename(self):
- """DEPRECATED: use self.title(as_filename=True)
instead."""
- logger.debug(
- u"Page.titleForFilename() method is deprecated.")
- return self.title(as_filename=True)
-
- def sectionFreeTitle(self, underscore=False):
- """DEPRECATED: use self.title(withSection=False)
instead."""
- logger.debug(
- u"Page.sectionFreeTitle() method is deprecated.")
- return self.title(underscore=underscore, withSection=False)
-
- def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False):
- """DEPRECATED: use self.title(asLink=True)
instead."""
- logger.debug(u"Page.aslink() method is deprecated.")
- return self.title(asLink=True, forceInterwiki=forceInterwiki,
- allowInterwiki=not noInterwiki, textlink=textlink)
-
- def urlname(self):
- """Return the Page title encoded for use in an URL.
-
- DEPRECATED: use self.title(asUrl=True) instead.
-
- """
- logger.debug(u"Page.urlname() method is deprecated.")
- return self.title(asUrl=True)
-
-####### DISABLED METHODS (warnings provided) ######
- # these methods are easily replaced by editing the page's text using
- # textlib methods and then using put() on the result.
-
- def removeImage(self, image, put=False, summary=None, safe=True):
- """Old method to remove all instances of an image from
page."""
- logger.warning(u"Page.removeImage() is no longer supported.")
-
- def replaceImage(self, image, replacement=None, put=False, summary=None,
- safe=True):
- """Old method to replace all instances of an image with
another."""
- logger.warning(u"Page.replaceImage() is no longer supported.")
-
-
-class ImagePage(Page):
- """A subclass of Page representing an image descriptor wiki page.
-
- Supports the same interface as Page, with the following added methods:
-
- getImagePageHtml : Download image page and return raw HTML text.
- fileURL : Return the URL for the image described on this
- page.
- fileIsOnCommons : Return True if image stored on Wikimedia
- Commons.
- fileIsShared : Return True if image stored on Wikitravel
- shared repository.
- getFileMd5Sum : Return image file's MD5 checksum.
- getFileVersionHistory : Return the image file's version history.
- getFileVersionHistoryTable: Return the version history in the form of a
- wiki table.
- usingPages : Iterate Pages on which the image is displayed.
-
- """
- def __init__(self, source, title=u"", insite=None):
- Page.__init__(self, source, title, 6)
- if self.namespace() != 6:
- raise ValueError(u"'%s' is not in the image namespace!" %
title)
-
- def getImagePageHtml(self):
- """
- Download the image page, and return the HTML, as a unicode string.
-
- Caches the HTML code, so that if you run this method twice on the
- same ImagePage object, the page will only be downloaded once.
- """
- if not hasattr(self, '_imagePageHtml'):
- from pywikibot.data import http
- path = "%s/index.php?title=%s" \
- % (self.site().scriptpath(), self.title(asUrl=True))
- self._imagePageHtml = http.request(self.site(), path)
- return self._imagePageHtml
-
- def fileUrl(self):
- """Return the URL for the image described on this
page."""
- # TODO add scaling option?
- if not hasattr(self, '_imageinfo'):
- self._imageinfo = self.site().getimageinfo(self) #FIXME
- return self._imageinfo['url']
-
- def fileIsOnCommons(self):
- """Return True if the image is stored on Wikimedia
Commons"""
- return self.fileUrl().startswith(
- 'http://upload.wikimedia.org/wikipedia/commons/')
-
- def fileIsShared(self):
- """Return True if image is stored on any known shared
repository."""
- # as of now, the only known repositories are commons and wikitravel
- if 'wikitravel_shared' in self.site().shared_image_repository():
- return self.fileUrl().startswith(
-                u'http://wikitravel.org/upload/shared/')
- return self.fileIsOnCommons()
-
- def getFileMd5Sum(self):
- """Return image file's MD5 checksum."""
- logger.debug(
- "ImagePage.getFileMd5Sum() is deprecated; use getFileSHA1Sum().")
-# FIXME: MD5 might be performed on incomplete file due to server disconnection
-# (see bug #1795683).
- import md5, urllib
- f = urllib.urlopen(self.fileUrl())
- # TODO: check whether this needs a User-Agent header added
- md5Checksum = md5.new(f.read()).hexdigest()
- f.close()
- return md5Checksum
-
- def getFileSHA1Sum(self):
- """Return image file's SHA1 checksum."""
- if not hasattr(self, '_imageinfo'):
- self._imageinfo = self.site().getimageinfo(self) #FIXME
- return self._imageinfo['sha1']
-
- def getFileVersionHistory(self):
- """Return the image file's version history.
-
- @return: An iterator yielding tuples containing (timestamp,
- username, resolution, filesize, comment).
-
- """
- #TODO; return value may need to change
- return self.site().getimageinfo(self, history=True) #FIXME
-
- def getFileVersionHistoryTable(self):
- """Return the version history in the form of a wiki
table."""
- lines = []
- #TODO: if getFileVersionHistory changes, make sure this follows it
- for (datetime, username, resolution, size, comment) \
- in self.getFileVersionHistory():
-            lines.append('| %s || %s || %s || %s || <nowiki>%s</nowiki>' \
- % (datetime, username, resolution, size, comment))
- return u'{| border="1"\n! date/time || username || resolution ||
size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}'
-
- def usingPages(self):
- """Yield Pages on which the image is displayed."""
- return self.site().getimageusage(self)
-
-
-class Category(Page):
- """A page in the Category: namespace"""
-
- @deprecate_arg("sortKey", None)
- def __init__(self, source, title=u"", insite=None):
- """All parameters are the same as for Page() constructor.
-
- """
- Page.__init__(self, source, title, 14)
- if self.namespace() != 14:
- raise ValueError(u"'%s' is not in the category namespace!"
- % title)
-
- @deprecate_arg("forceInterwiki", None)
- @deprecate_arg("textlink", None)
- @deprecate_arg("noInterwiki", None)
- def aslink(self, sortKey=u''):
- """Return a link to place a page in this Category.
-
- Use this only to generate a "true" category link, not for interwikis
- or text links to category pages.
-
- @param sortKey: The sort key for the article to be placed in this
- Category; if omitted, default sort key is used.
- @type sortKey: (optional) unicode
-
- """
- if sortKey:
- titleWithSortKey = '%s|%s' % (self.title(withSection=False),
- self.sortKey)
- else:
- titleWithSortKey = self.title(withSection=False)
- return '[[%s]]' % titleWithSortKey
-
- @deprecate_arg("startFrom", None)
- @deprecate_arg("cacheResults", None)
- def subcategories(self, recurse=False):
- """Iterate all subcategories of the current category.
-
- @param recurse: if not False or 0, also iterate subcategories of
- subcategories. If an int, limit recursion to this number of
- levels. (Example: recurse=1 will iterate direct subcats and
- first-level sub-sub-cats, but no deeper.)
- @type recurse: int or bool
-
- """
- if not isinstance(recurse, bool) and recurse:
- recurse = recurse - 1
- if not hasattr(self, "_subcats"):
- self._subcats = []
- for member in self.site().categorymembers(self, namespaces=[14]):
- subcat = Category(self.site(), member.title())
- self._subcats.append(subcat)
- yield subcat
- if recurse:
- for item in subcat.subcategories(recurse):
- yield item
- else:
- for subcat in self._subcats:
- yield subcat
- if recurse:
- for item in subcat.subcategories(recurse):
- yield item
-
- @deprecate_arg("startFrom", None)
- def articles(self, recurse=False):
- """
- Yields all articles in the current category.
-
- @param recurse: if not False or 0, also iterate articles in
- subcategories. If an int, limit recursion to this number of
- levels. (Example: recurse=1 will iterate articles in first-level
- subcats, but no deeper.)
- @type recurse: int or bool
-
- """
- namespaces = [x for x in self.site().namespaces().keys()
- if x>=0 and x!=14]
- for member in self.site().categorymembers(self,
- namespaces=namespaces):
- yield member
- if recurse:
- if not isinstance(recurse, bool) and recurse:
- recurse = recurse - 1
- for subcat in self.subcategories():
- for article in subcat.articles(recurse):
- yield article
-
- def isEmptyCategory(self):
- """Return True if category has no members (including
subcategories)."""
- for member in self.site().categorymembers(self, limit=1):
- return False
- return True
-
- def copyTo(self, catname):
- """
- Copy text of category page to a new page. Does not move contents.
-
- @param catname: New category title (without namespace)
- @return: True if copying was successful, False if target page
- already existed.
-
- """
- # This seems far too specialized to be in the top-level framework
- catname = self.site().category_namespace() + ':' + catname
- targetCat = Category(self.site(), catname)
- if targetCat.exists():
- logger.warn('Target page %s already exists!'
- % targetCat.title())
- return False
- else:
- logger.info('Moving text from %s to %s.'
- % (self.title(), targetCat.title()))
- authors = ', '.join(self.contributingUsers())
- creationSummary = pywikibot.translate(
- self.site(), msg_created_for_renaming
- ) % (self.title(), authors)
- targetCat.put(self.get(), creationSummary)
- return True
-
- def copyAndKeep(self, catname, cfdTemplates):
- """Copy partial category page text (not contents) to a new title.
-
- Like copyTo above, except this removes a list of templates (like
- deletion templates) that appear in the old category text. It also
- removes all text between the two HTML comments BEGIN CFD TEMPLATE
- and END CFD TEMPLATE. (This is to deal with CFD templates that are
- substituted.)
-
- Returns true if copying was successful, false if target page already
- existed.
-
- @param catname: New category title (without namespace)
- @param cfdTemplates: A list (or iterator) of templates to be removed
- from the page text
- @return: True if copying was successful, False if target page
- already existed.
-
- """
- # I don't see why we need this as part of the framework either
- catname = self.site().category_namespace() + ':' + catname
- targetCat = Category(self.site(), catname)
- if targetCat.exists():
- logger.warn('Target page %s already exists!'
- % targetCat.title())
- return False
- else:
- logger.info('Moving text from %s to %s.'
- % (self.title(), targetCat.title()))
- authors = ', '.join(self.contributingUsers())
- creationSummary = pywikibot.translate(
- self.site(), msg_created_for_renaming
- ) % (self.title(), authors)
- newtext = self.get()
- for regexName in cfdTemplates:
- matchcfd = re.compile(r"{{%s.*?}}" % regexName, re.IGNORECASE)
- newtext = matchcfd.sub('',newtext)
- matchcomment = re.compile(
- r"<!--BEGIN CFD TEMPLATE-->.*?<!--END CFD
TEMPLATE-->",
- re.IGNORECASE | re.MULTILINE | re.DOTALL)
- newtext = matchcomment.sub('', newtext)
- pos = 0
- while (newtext[pos:pos+1] == "\n"):
- pos = pos + 1
- newtext = newtext[pos:]
- targetCat.put(newtext, creationSummary)
- return True
-
-#### DEPRECATED METHODS ####
- def subcategoriesList(self, recurse=False):
- """DEPRECATED: Equivalent to
list(self.subcategories(...))"""
- logger.debug("Category.subcategoriesList() method is deprecated.")
- return sorted(list(set(self.subcategories(recurse))))
-
- def articlesList(self, recurse=False):
- """DEPRECATED: equivalent to
list(self.articles(...))"""
- logger.debug("Category.articlesList() method is deprecated.")
- return sorted(list(set(self.articles(recurse))))
-
- def supercategories(self):
- """DEPRECATED: equivalent to self.categories()"""
- logger.debug("Category.supercategories() method is deprecated.")
- return self.categories()
-
- def supercategoriesList(self):
- """DEPRECATED: equivalent to
list(self.categories(...))"""
- logger.debug("Category.articlesList() method is deprecated.")
- return sorted(list(set(self.categories())))
-
-
-class Revision(object):
- """A structure holding information about a single revision of a
Page."""
- def __init__(self, revid, timestamp, user, anon=False, comment=u"",
- text=None, minor=False):
- """All parameters correspond to object attributes (e.g., revid
- parameter is stored as self.revid)
-
- @param revid: Revision id number
- @type revid: int
- @param text: Revision wikitext.
- @type text: unicode, or None if text not yet retrieved
- @param timestamp: Revision time stamp (in ISO 8601 format)
- @type timestamp: unicode
- @param user: user who edited this revision
- @type user: unicode
- @param anon: user is unregistered
- @type anon: bool
- @param comment: edit comment text
- @type comment: unicode
- @param minor: edit flagged as minor
- @type minor: bool
-
- """
- self.revid = revid
- self.text = text
- self.timestamp = timestamp
- self.user = user
- self.anon = anon
- self.comment = comment
- self.minor = minor
-
-
-class Link(object):
- """A Mediawiki link (local or interwiki)
-
- Has the following attributes:
-
- - site: The Site object for the wiki linked to
- - namespace: The namespace of the page linked to (int)
- - title: The title of the page linked to (unicode); does not include
- namespace or section
- - section: The section of the page linked to (unicode or None); this
- contains any text following a '#' character in the title
- - anchor: The anchor text (unicode or None); this contains any text
- following a '|' character inside the link
-
- """
- illegal_titles_pattern = re.compile(
- # Matching titles will be held as illegal.
- u'''[^ %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\u0080-\uFFFF+]'''
- # URL percent encoding sequences interfere with the ability
- # to round-trip titles -- you can't link to them consistently.
- u'|%[0-9A-Fa-f]{2}'
- # XML/HTML character references produce similar issues.
- u'|&[A-Za-z0-9\x80-\xff]+;'
- u'|&#[0-9]+;'
- u'|&#x[0-9A-Fa-f]+;'
- )
-
- def __init__(self, text, source=None, defaultNamespace=0):
- """Constructor
-
- @param text: the link text (everything appearing between [[ and ]]
- on a wiki page)
- @type text: unicode
- @param source: the Site on which the link was found (not necessarily
- the site to which the link refers)
- @type source: Site
- @param defaultNamespace: a namespace to use if the link does not
- contain one (defaults to 0)
- @type defaultNamespace: int
-
- """
- self._text = text
- self._source = source
- self._defaultns = defaultNamespace
-
- def parse(self):
- """Parse text; called internally when accessing
attributes"""
-
- # First remove the anchor, which is stored unchanged, if there is one
- if u"|" in self._text:
- self._text, self._anchor = self._text.split(u"|", 1)
- else:
- self._anchor = None
-
- if self._source is None:
- self._source = pywikibot.Site()
- self._site = self._source
-
- # Clean up the name, it can come from anywhere.
- # Convert HTML entities to unicode
- t = html2unicode(self._text)
-
- # Convert URL-encoded characters to unicode
- t = url2unicode(t, site=self._site)
-
- # Normalize unicode string to a NFC (composed) format to allow proper
- # string comparisons. According to
- # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/n…
- # the mediawiki code normalizes everything to NFC, not NFKC (which
- # might result in information loss).
- t = unicodedata.normalize('NFC', t)
-
- # This code was adapted from Title.php : secureAndSplit()
- #
- if u'\ufffd' in t:
- raise pywikibot.Error("Title contains illegal char (\\uFFFD)")
- self._namespace = self._defaultns
-
- # Replace underscores by spaces
- t = t.replace(u"_", u" ")
- # replace multiple spaces and underscores with a single space
- while u" " in t: t = t.replace(u" ", u" ")
- # Strip spaces at both ends
- t = t.strip(" ")
- # Remove left-to-right and right-to-left markers.
- t = t.replace(u"\u200e", u"").replace(u"\u200f",
u"")
-
- firstPass = True
- while u":" in t:
- # Initial colon indicates main namespace rather than default
- if t.startswith(u":"):
- self._namespace = 0
- # remove the colon but continue processing
- # remove any subsequent whitespace
- t = t.lstrip(u":").lstrip(u" ")
- continue
-
- fam = self._site.family
- prefix = t[ :t.index(u":")].lower()
- ns = self._site.ns_index(prefix)
- if ns:
- # Ordinary namespace
- t = t[t.index(u":"): ].lstrip(u":").lstrip(u"
")
- self._namespace = ns
- break
- if prefix in fam.langs.keys()\
- or prefix in fam.get_known_families(site=self._site):
- # looks like an interwiki link
- if not firstPass:
- # Can't make a local interwiki link to an interwiki link.
- raise pywikibot.Error(
- "Improperly formatted interwiki link '%s'"
- % self._text)
- t = t[t.index(u":"): ].lstrip(u":").lstrip(u"
")
- if prefix in fam.langs.keys():
- newsite = pywikibot.Site(prefix, fam)
- else:
- otherlang = self._site.code
- familyName = fam.get_known_families(site=self._site)[prefix]
- if familyName in ['commons', 'meta']:
- otherlang = familyName
- try:
- newsite = pywikibot.Site(otherlang, familyName)
- except ValueError:
- raise pywikibot.Error("""\
-%s is not a local page on %s, and the %s family is
-not supported by PyWikiBot!"""
- % (title, self._site(), familyName))
-
- # Redundant interwiki prefix to the local wiki
- if newsite == self._site:
- if not t:
- # Can't have an empty self-link
- raise pywikibot.Error(
- "Invalid link title: '%s'" % self._text)
- firstPass = False
- continue
- self._site = newsite
- else:
- break # text before : doesn't match any known prefix
-
- if u"#" in t:
- t, sec = t.split(u'#', 1)
- t, self._section = t.rstrip(), sec.lstrip()
- else:
- self._section = None
-
- # Reject illegal characters.
- m = Link.illegal_titles_pattern.search(t)
- if m:
- raise pywikibot.Error(
- u"Invalid title: contains illegal char(s) '%s'" %
m.group(0))
-
- # Pages with "/./" or "/../" appearing in the URLs will
- # often be unreachable due to the way web browsers deal
- #* with 'relative' URLs. Forbid them explicitly.
-
- if u'.' in t and (
- t == u'.' or t == u'..'
- or t.startswith(u"./")
- or t.startswith(u"../")
- or u"/./" in t
- or u"/../" in t
- or t.endswith(u"/.")
- or t.endswith(u"/..")
- ):
- raise pywikibot.Error(
- "Invalid title (contains . / combinations): '%s'"
- % self._text)
-
- # Magic tilde sequences? Nu-uh!
- if u"~~~" in t:
- raise pywikibot.Error("Invalid title (contains ~~~): '%s'"
% self._text)
-
- if self._namespace != -1 and len(t) > 255:
- raise pywikibot.Error("Invalid title (over 255 bytes):
'%s'" % t)
-
- if self._site.case() == 'first-letter':
- t = t[:1].upper() + t[1:]
-
- # Can't make a link to a namespace alone...
- # "empty" local links can only be self-links
- # with a fragment identifier.
- if not t and self._site == self._source and self._namespace != 0:
- raise ValueError("Invalid link (no page title): '%s'" %
self._text)
-
- self._title = t
-
- # define attributes, to be evaluated lazily
-
- @property
- def site(self):
- if not hasattr(self, "_site"):
- self.parse()
- return self._site
-
- @property
- def namespace(self):
- if not hasattr(self, "_namespace"):
- self.parse()
- return self._namespace
-
- @property
- def title(self):
- if not hasattr(self, "_title"):
- self.parse()
- return self._title
-
- @property
- def section(self):
- if not hasattr(self, "_section"):
- self.parse()
- return self._section
-
- @property
- def anchor(self):
- if not hasattr(self, "_anchor"):
- self.parse()
- return self._anchor
-
- def astext(self, onsite=None):
- """Return a text representation of the link.
-
- @param onsite: if specified, present as a (possibly interwiki) link
- from the given site; otherwise, present as an internal link on
- the source site.
-
- """
- if onsite is None:
- onsite = self.site
- title = self.title
- if self.namespace:
- title = onsite.namespace(self.namespace) + ":" + title
- if self.section:
- title = title + "#" + self.section
- if onsite == self.site:
- return u'[[%s]]' % title
- if onsite.family == self.site.family:
- return u'[[%s:%s]]' % (self.site.code, title)
- if self.site.family.name == self.site.code:
- # use this form for sites like commons, where the
- # code is the same as the family name
- return u'[[%s:%s]]' % (self.site.code,
- title)
- return u'[[%s:%s:%s]]' % (self.site.family.name,
- self.site.code,
- title)
-
- def __str__(self):
- return self.astext()
-
- def __cmp__(self, other):
- """Test for equality and inequality of Link objects.
-
- Link objects are "equal" if and only if they are on the same site
- and have the same normalized title, including section if any.
-
- Link objects are sortable by site, then namespace, then title.
-
- """
- if not isinstance(other, Link):
- # especially, return -1 if other is None
- return -1
- if not self.site == other.site:
- return cmp(self.site, other.site)
- if self.namespace != other.namespace:
- return cmp(self.namespace, other.namespace)
- return cmp(self.title, other.title)
-
-
-# Utility functions for parsing page titles
-
-def html2unicode(text, ignore = []):
- """Return text, replacing HTML entities by equivalent unicode
characters."""
- # This regular expression will match any decimal and hexadecimal entity and
- # also entities that might be named entities.
- entityR = re.compile(
- r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
- # These characters are Html-illegal, but sadly you *can* find some of
- # these and converting them to unichr(decimal) is unsuitable
- convertIllegalHtmlEntities = {
- 128 : 8364, # €
- 130 : 8218, # ‚
- 131 : 402, # ƒ
- 132 : 8222, # „
- 133 : 8230, # …
- 134 : 8224, # †
- 135 : 8225, # ‡
- 136 : 710, # ˆ
- 137 : 8240, # ‰
- 138 : 352, # Š
- 139 : 8249, # ‹
- 140 : 338, # Œ
- 142 : 381, # Ž
- 145 : 8216, # ‘
- 146 : 8217, # ’
- 147 : 8220, # “
- 148 : 8221, # ”
- 149 : 8226, # •
- 150 : 8211, # –
- 151 : 8212, # —
- 152 : 732, # ˜
- 153 : 8482, # ™
- 154 : 353, # š
- 155 : 8250, # ›
- 156 : 339, # œ
- 158 : 382, # ž
- 159 : 376 # Ÿ
- }
- #ensuring that illegal &#129;, &#141; and &#157;, which have no known values,
- #don't get converted to unichr(129), unichr(141) or unichr(157)
- ignore = set(ignore) | set([129, 141, 157])
- result = u''
- i = 0
- found = True
- while found:
- text = text[i:]
- match = entityR.search(text)
- if match:
- unicodeCodepoint = None
- if match.group('decimal'):
- unicodeCodepoint = int(match.group('decimal'))
- elif match.group('hex'):
- unicodeCodepoint = int(match.group('hex'), 16)
- elif match.group('name'):
- name = match.group('name')
- if htmlentitydefs.name2codepoint.has_key(name):
- # We found a known HTML entity.
- unicodeCodepoint = htmlentitydefs.name2codepoint[name]
- result += text[:match.start()]
- try:
- unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
- except KeyError:
- pass
- if unicodeCodepoint and unicodeCodepoint not in ignore:
- result += unichr(unicodeCodepoint)
- else:
- # Leave the entity unchanged
- result += text[match.start():match.end()]
- i = match.end()
- else:
- result += text
- found = False
- return result
-
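A minimal sketch of the conversion performed by the html2unicode() helper above (illustrative only, assuming the helper is importable from this module); named, decimal and hexadecimal entity references all decode to the same character:

    # u'Caf&eacute;', u'Caf&#233;' and u'Caf&#xE9;' all decode to u'Caf\xe9'
    assert html2unicode(u'Caf&eacute; &#233; &#xE9;') == u'Caf\xe9 \xe9 \xe9'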
-def url2unicode(title, site, site2 = None):
- """Convert url-encoded text to unicode using site's encoding.
-
- If site2 is provided, try its encodings as well. Uses the first encoding
- that doesn't cause an error.
-
- """
- # create a list of all possible encodings for both hint sites
- encList = [site.encoding()] + list(site.encodings())
- if site2 and site2 <> site:
- encList.append(site2.encoding())
- encList += list(site2.encodings())
- firstException = None
- # try to handle all encodings (will probably retry utf-8)
- for enc in encList:
- try:
- t = title.encode(enc)
- t = urllib.unquote(t)
- return unicode(t, enc)
- except UnicodeError, ex:
- if not firstException:
- firstException = ex
- pass
- # Couldn't convert, raise the original exception
- raise firstException
-
+# -*- coding: utf-8 -*-
+"""
+Objects representing various types of MediaWiki pages.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot import deprecate_arg
+from pywikibot import config
+import pywikibot.site
+import pywikibot.textlib
+
+import htmlentitydefs
+import logging
+import re
+import sys
+import threading
+import unicodedata
+import urllib
+
+logger = logging.getLogger("wiki")
+
+reNamespace = re.compile("^(.+?) *: *(.*)$")
+
+
+class Page(object):
+ """Page: A MediaWiki page
+
+ This object only implements internally methods that do not require
+ reading from or writing to the wiki. All other methods are delegated
+ to the Site object.
+
+ """
+
+ @deprecate_arg("insite", None)
+ @deprecate_arg("defaultNamespace", None)
+ def __init__(self, source, title=u"", ns=0):
+ """Instantiate a Page object.
+
+ Three calling formats are supported:
+
+ - If the first argument is a Page, create a copy of that object.
+ This can be used to convert an existing Page into a subclass
+ object, such as Category or ImagePage. (If the title is also
+ given as the second argument, creates a copy with that title;
+ this is used when pages are moved.)
+ - If the first argument is a Site, create a Page on that Site
+ using the second argument as the title (may include a section),
+ and the third as the namespace number. The namespace number is
+ mandatory, even if the title includes the namespace prefix. This
+ is the preferred syntax when using an already-normalized title
+ obtained from api.php or a database dump. WARNING: may produce
+ invalid objects if page title isn't in normal form!
+ - If the first argument is a Link, create a Page from that link.
+ This is the preferred syntax when using a title scraped from
+ wikitext, URLs, or another non-normalized source.
+
+ @param source: the source of the page
+ @type source: Link, Page (or subclass), or Site
+ @param title: normalized title of the page; required if source is a
+ Site, ignored otherwise
+ @type title: unicode
+ @param ns: namespace number; required if source is a Site, ignored
+ otherwise
+ @type ns: int
+
+ """
+ if isinstance(source, pywikibot.site.BaseSite):
+ self._site = source
+ if ns not in source.namespaces():
+ raise pywikibot.Error(
+ "Invalid namespace '%i' for site %s."
+ % (ns, source.sitename()))
+ self._ns = ns
+ if ns and not title.startswith(source.namespace(ns)+u":"):
+ title = source.namespace(ns) + u":" + title
+ elif not ns and u":" in title:
+ pos = title.index(u':')
+ nsindex = source.ns_index(title[ :pos])
+ if nsindex:
+ self._ns = nsindex
+ if u"#" in title:
+ title, self._section = title.split(u"#", 1)
+ else:
+ self._section = None
+ if not title:
+ raise pywikibot.Error(
+ "Page object cannot be created from Site without
title.")
+ self._title = title
+ elif isinstance(source, Page):
+ # copy all of source's attributes to this object
+ self.__dict__ = source.__dict__
+ if title:
+ # overwrite title
+ if ":" in title:
+ prefix = title[ :title.index(":")]
+ self._ns = site.ns_index(prefix)
+ if self._ns is None:
+ self._ns = 0
+ else:
+ title = title[title.index(":")+1 : ].strip("
_")
+ self._title = "%s:%s" % (
+ self.site().namespace(self._ns),
+ self._title)
+ else:
+ self._ns = 0
+ if "#" in title:
+ self._section = title[title.index("#") + 1 : ].strip("
_")
+ title = title[ : title.index("#")].strip(" _")
+ self._title = title
+ elif isinstance(source, Link):
+ self._site = source.site
+ self._section = source.section
+ self._ns = source.namespace
+ self._title = source.title
+ # reassemble the canonical title from components
+ if self._ns:
+ self._title = "%s:%s" % (self.site().namespace(self._ns),
+ self._title)
+ else:
+ raise pywikibot.Error(
+ "Invalid argument type '%s' in Page constructor: %s"
+ % (type(source), source))
+ if self._section is not None:
+ self._title = self._title + "#" + self._section
+ self._revisions = {}
+
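A minimal sketch of the three constructor formats described above (illustrative only; assumes a configured English Wikipedia account and the placeholder titles shown):

    import pywikibot
    from pywikibot.page import Page, Link, Category

    site = pywikibot.Site('en', 'wikipedia')
    # 1. Site + normalized title + namespace number
    talk = Page(site, u'Talk:Sandbox', ns=1)
    # 2. Link parsed from raw wikitext (handles prefixes, sections, anchors)
    main = Page(Link(u'Main Page', source=site))
    # 3. copy an existing Page, e.g. to convert it into a subclass
    cat = Category(Page(site, u'Category:Living people', ns=14))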
+ def site(self):
+ """Return the Site object for the wiki on which this Page
resides."""
+ return self._site
+
+ def namespace(self):
+ """Return the number of the namespace of the
page."""
+ return self._ns
+
+ @deprecate_arg("decode", None)
+ @deprecate_arg("savetitle", "asUrl")
+ def title(self, underscore=False, savetitle=False, withNamespace=True,
+ withSection=True, asUrl=False, asLink=False,
+ allowInterwiki=True, forceInterwiki=False, textlink=False,
+ as_filename=False):
+ """Return the title of this Page, as a Unicode string.
+
+ @param underscore: if true, replace all ' ' characters with '_'
+ @param withNamespace: if false, omit the namespace prefix
+ @param withSection: if false, omit the section
+ @param asUrl: if true, quote title as if in an URL
+ @param asLink: if true, return the title in the form of a wikilink
+ @param allowInterwiki: (only used if asLink is true) if true, format
+ the link as an interwiki link if necessary
+ @param forceInterwiki: (only used if asLink is true) if true, always
+ format the link as an interwiki link
+ @param textlink: (only used if asLink is true) if true, place a ':'
+ before Category: and Image: links
+ @param as_filename: if true, replace any characters that are unsafe
+ in filenames
+
+ """
+ title = self._title
+ if not withNamespace and self._ns != 0:
+ title = title.split(u':', 1)[1]
+ if not withSection and self._section:
+ title = title.split(u'#', 1)[0]
+ if underscore or asUrl:
+ title = title.replace(u' ', u'_')
+ if asUrl:
+ encodedTitle = title.encode(self.site().encoding())
+ title = urllib.quote(encodedTitle)
+ if asLink:
+ if forceInterwiki or (allowInterwiki and
+ (self.site().family.name != config.family
+ or self.site().code != config.mylang)):
+ if self.site().family.name != config.family \
+ and self.site().family.name != self.site().code:
+ return u'[[%s:%s:%s]]' % (self.site().family.name,
+ self.site().code,
+ self._title)
+ else:
+ # use this form for sites like commons, where the
+ # code is the same as the family name
+ return u'[[%s:%s]]' % (self.site().code,
+ self._title)
+ elif textlink and (self.isImage() or self.isCategory()):
+ return u'[[:%s]]' % title
+ else:
+ return u'[[%s]]' % title
+ if as_filename:
+ # Replace characters that are not possible in file names on some
+ # systems.
+ # Spaces are possible on most systems, but are bad for URLs.
+ for forbidden in ':*?/\\ ':
+ title = title.replace(forbidden, '_')
+ return title
+
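A quick sketch of how the main title() keywords interact, assuming p was created as Page(site, u'Talk:Foo bar#Intro', ns=1) on the bot's home wiki (placeholder title):

    p.title()                      # u'Talk:Foo bar#Intro'
    p.title(withNamespace=False)   # u'Foo bar#Intro'
    p.title(withSection=False)     # u'Talk:Foo bar'
    p.title(asUrl=True)            # underscores plus percent-encoding, for use in URLs
    p.title(asLink=True)           # u'[[Talk:Foo bar#Intro]]' when no interwiki prefix is needed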
+ @deprecate_arg("decode", None)
+ @deprecate_arg("underscore", None)
+ def section(self):
+ """Return the name of the section this Page refers to.
+
+ The section is the part of the title following a '#' character, if
+ any. If no section is present, return None.
+
+ """
+ if self._section:
+ return self._section
+ else:
+ return None
+
+ def __str__(self):
+ """Return a console representation of the
pagelink."""
+ return self.title(asLink=True, forceInterwiki=True
+ ).encode(sys.stderr.encoding)
+
+ def __unicode__(self):
+ return self.title(asLink=True, forceInterwiki=True)
+
+ def __repr__(self):
+ """Return a more complete string
representation."""
+ return u"%s(%s)" % (self.__class__.__name__,
+ self.title().encode(sys.stderr.encoding))
+
+ def __cmp__(self, other):
+ """Test for equality and inequality of Page objects.
+
+ Page objects are "equal" if and only if they are on the same site
+ and have the same normalized title, including section if any.
+
+ Page objects are sortable by namespace first, then by title.
+
+ """
+ if not isinstance(other, Page):
+ # especially, return -1 if other is None
+ return -1
+ if not self.site() == other.site():
+ return cmp(self.site(), other.site())
+ if self.namespace() != other.namespace():
+ return cmp(self.namespace(), other.namespace())
+ owntitle = self.title(withNamespace=False)
+ othertitle = other.title(withNamespace=False)
+ return cmp(owntitle, othertitle)
+
+ def __hash__(self):
+ # Pseudo method that makes it possible to store Page objects as keys
+ # in hash-tables. This relies on the fact that the string
+ # representation of an instance can not change after the construction.
+ return hash(unicode(self))
+
+ def autoFormat(self):
+ """Return L{date.autoFormat} dictName and value, if any.
+
+ Value can be a year, date, etc., and dictName is 'YearBC',
+ 'Year_December', or another dictionary name. Please note that two
+ entries may have exactly the same autoFormat, but be in two
+ different namespaces, as some sites have categories with the
+ same names. Regular titles return (None, None).
+
+ """
+ if not hasattr(self, '_autoFormat'):
+ from pywikibot import date
+ self._autoFormat = date.getAutoFormat(
+ self.site().code,
+ self.title(withNamespace=False)
+ )
+ return self._autoFormat
+
+ def isAutoTitle(self):
+ """Return True if title of this Page is in the autoFormat
dictionary."""
+ return self.autoFormat()[0] is not None
+
+ @deprecate_arg("throttle", None)
+ @deprecate_arg("nofollow_redirects", None)
+ @deprecate_arg("change_edit_time", None)
+ def get(self, force=False, get_redirect=False, sysop=False):
+ """Return the wiki-text of the page.
+
+ This will retrieve the page from the server if it has not been
+ retrieved yet, or if force is True. This can raise the following
+ exceptions that should be caught by the calling code:
+
+ - NoPage: The page does not exist
+ - IsRedirectPage: The page is a redirect. The argument of the
+ exception is the title of the page it redirects to.
+ - SectionError: The section does not exist on a page with a #
+ link
+
+ @param force: reload all page attributes, including errors.
+ @param get_redirect: return the redirect text, do not follow the
+ redirect, do not raise an exception.
+ @param sysop: if the user has a sysop account, use it to retrieve
+ this page
+
+ """
+ if force:
+ # When forcing, we retry the page no matter what. Old exceptions
+ # do not apply any more.
+ for attr in ['_redirarg', '_getexception']:
+ if hasattr(self, attr):
+ delattr(self,attr)
+ else:
+ # Make sure we re-raise an exception we got on an earlier attempt
+ if hasattr(self, '_redirarg') and not get_redirect:
+ raise pywikibot.IsRedirectPage, self._redirarg
+ elif hasattr(self, '_getexception'):
+ raise self._getexception
+ if force or not hasattr(self, "_revid") \
+ or not self._revid in self._revisions \
+ or self._revisions[self._revid].text is None:
+ self.site().loadrevisions(self, getText=True, sysop=sysop)
+ # TODO: Exception handling for no-page, redirects, etc.
+
+ return self._revisions[self._revid].text
+
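A typical retrieval pattern using get() and the exceptions listed above (sketch; page is any Page object):

    try:
        text = page.get()
    except pywikibot.NoPage:
        text = u''                                 # page does not exist yet
    except pywikibot.IsRedirectPage:
        text = page.getRedirectTarget().get()      # follow the redirect instead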
+ @deprecate_arg("throttle", None)
+ @deprecate_arg("nofollow_redirects", None)
+ @deprecate_arg("change_edit_time", None)
+ def getOldVersion(self, oldid, force=False, get_redirect=False,
+ sysop=False):
+ """Return text of an old revision of this page; same options as
get().
+
+ @param oldid: The revid of the revision desired.
+
+ """
+ if force or not oldid in self._revisions \
+ or self._revisions[oldid].text is None:
+ self.site().loadrevisions(self, getText=True, revids=oldid,
+ sysop=sysop)
+ # TODO: what about redirects, errors?
+ return self._revisions[oldid].text
+
+ def permalink(self):
+ """Return the permalink URL for current revision of this
page."""
+ return "%s://%s/%sindex.php?title=%s&oldid=%s" \
+ % (self.site().protocol(),
+ self.site().hostname(),
+ self.site().scriptpath(),
+ self.title(asUrl=True),
+ self.latestRevision())
+
+ def latestRevision(self):
+ """Return the current revision id for this
page."""
+ if not hasattr(self, '_revid'):
+ self.site().loadrevisions(self)
+ return self._revid
+
+ def _textgetter(self):
+ """Return the current (edited) wikitext, loading it if
necessary."""
+ if not hasattr(self, '_text') or self._text is None:
+ try:
+ self._text = self.get()
+ except pywikibot.NoPage:
+ # TODO: what other exceptions might be returned?
+ self._text = u""
+ return self._text
+
+ def _textsetter(self, value):
+ """Update the edited wikitext"""
+ self._text = unicode(value)
+
+ def _cleartext(self):
+ """Delete the edited wikitext"""
+ if hasattr(self, "_text"):
+ del self._text
+
+ text = property(_textgetter, _textsetter, _cleartext,
+ "The edited wikitext (unicode) of this Page")
+
+ def expand_text(self):
+ """Return the page text with all templates
expanded."""
+ req = pywikibot.data.api.Request(action="expandtemplates",
+ text=self.text,
+ title=self.title(withSection=False),
+ site=self.site())
+ result = req.submit()
+ return result["expandtemplates"]["*"]
+
+ def userName(self):
+ """Return name or IP address of last user to edit
page."""
+ return self._revisions[self.latestRevision()].user
+
+ def isIpEdit(self):
+ """Return True if last editor was unregistered."""
+ return self._revisions[self.latestRevision()].anon
+
+ def editTime(self):
+ """Return timestamp (in ISO 8601 format) of last revision to
page."""
+ return self._revisions[self.latestRevision()].timestamp
+
+ def previousRevision(self):
+ """Return the revision id for the previous revision of this
Page."""
+ vh = self.getVersionHistory(revCount=2)
+ revkey = sorted(self._revisions.keys(), reverse=True)[1]
+ return revkey
+
+ def exists(self):
+ """Return True if page exists on the wiki, even if it's a
redirect.
+
+ If the title includes a section, return False if this section isn't
+ found.
+
+ """
+ return self.site().page_exists(self)
+
+ def isRedirectPage(self):
+ """Return True if this is a redirect, False if not or not
existing."""
+ return self.site().page_isredirect(self)
+
+ def isEmpty(self):
+ """Return True if the page text has less than 4 characters.
+
+ Character count ignores language links and category links.
+ Can raise the same exceptions as get().
+
+ """
+ txt = self.get()
+ txt = pywikibot.textlib.removeLanguageLinks(txt, site = self.site())
+ txt = pywikibot.textlib.removeCategoryLinks(txt, site = self.site())
+ if len(txt) < 4:
+ return True
+ else:
+ return False
+
+ def isTalkPage(self):
+ """Return True if this page is in any talk
namespace."""
+ ns = self.namespace()
+ return ns >= 0 and ns % 2 == 1
+
+ def toggleTalkPage(self):
+ """Return other member of the article-talk page pair for this
Page.
+
+ If self is a talk page, returns the associated content page;
+ otherwise, returns the associated talk page. The returned page need
+ not actually exist on the wiki.
+
+ Returns None if self is a special page.
+
+ """
+ ns = self.namespace()
+ if ns < 0: # Special page
+ return None
+ if self.isTalkPage():
+ if self.namespace() == 1:
+ return Page(self.site(), self.title(withNamespace=False))
+ else:
+ return Page(self.site(),
+ self.site().namespace(ns - 1) + ':'
+ + self.title(withNamespace=False))
+ else:
+ return Page(self.site(),
+ self.site().namespace(ns + 1) + ':'
+ + self.title(withNamespace=False))
+
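The pairing above relies on MediaWiki's convention that subject namespace 2n has talk namespace 2n+1; a short sketch (site as in the constructor sketch further up):

    main = Page(site, u'Sandbox', ns=0)
    talk = main.toggleTalkPage()       # Talk:Sandbox (namespace 1)
    assert talk.toggleTalkPage() == main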
+ def isCategory(self):
+ """Return True if the page is a Category, False
otherwise."""
+ return self.namespace() == 14
+
+ def isImage(self):
+ """Return True if this is an image description page, False
otherwise."""
+ return self.namespace() == 6
+
+ def isDisambig(self):
+ """Return True if this is a disambiguation page, False otherwise.
+
+ Relies on the presence of specific templates, identified in
+ the Family file or on a wiki page, to identify disambiguation
+ pages.
+
+ By default, loads a list of template names from the Family file;
+ if the value in the Family file is None, looks for the list on
+ [[MediaWiki:Disambiguationspage]].
+
+ """
+ if not hasattr(self, "_isDisambig"):
+ if not hasattr(self.site(), "_disambigtemplates"):
+ self.site()._disambigtemplates = \
+ self.site().family.disambig(self.site().code)
+ if self.site()._disambigtemplates is None:
+ try:
+ disambigpages = Page(self.site(),
+ "MediaWiki:Disambiguationspage")
+ self.site()._disambigtemplates = [
+ link.title(withNamespace=False)
+ for link in disambigpages.linkedPages()
+ if link.namespace() == 10
+ ]
+ except pywikibot.NoPage:
+ self.site()._disambigtemplates = ['Disambig']
+ for t in self.templates():
+ if t.title(withNamespace=False) in self.site()._disambigtemplates:
+ self._isDisambig = True
+ break
+ else:
+ self._isDisambig = False
+ return self._isDisambig
+
+ def getReferences(self, follow_redirects=True, withTemplateInclusion=True,
+ onlyTemplateInclusion=False, redirectsOnly=False,
+ namespaces=None):
+ """Return an iterator all pages that refer to or embed the page.
+
+ If you need a full list of referring pages, use
+ C{pages = list(s.getReferences())}
+
+ @param follow_redirects: if True, also iterate pages that link to a
+ redirect pointing to the page.
+ @param withTemplateInclusion: if True, also iterate pages where self
+ is used as a template.
+ @param onlyTemplateInclusion: if True, only iterate pages where self
+ is used as a template.
+ @param redirectsOnly: if True, only iterate redirects to self.
+ @param namespaces: only iterate pages in these namespaces
+
+ """
+ # N.B.: this method intentionally overlaps with backlinks() and
+ # embeddedin(). Depending on the interface, it may be more efficient
+ # to implement those methods in the site interface and then combine
+ # the results for this method, or to implement this method and then
+ # split up the results for the others.
+ return self.site().pagereferences(
+ self, follow_redirects, redirectsOnly,
+ withTemplateInclusion, onlyTemplateInclusion,
+ namespaces)
+
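For example, to collect every page that transcludes this one as a template, skipping redirects and restricting to the main namespace (sketch):

    embedding = list(page.getReferences(onlyTemplateInclusion=True,
                                        follow_redirects=False,
                                        namespaces=[0]))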
+ def backlinks(self, followRedirects=True, filterRedirects=None,
+ namespaces=None):
+ """Return an iterator for pages that link to this page.
+
+ @param followRedirects: if True, also iterate pages that link to a
+ redirect pointing to the page.
+ @param filterRedirects: if True, only iterate redirects; if False,
+ omit redirects; if None, do not filter
+ @param namespaces: only iterate pages in these namespaces
+
+ """
+ return self.site().pagebacklinks(self, followRedirects, filterRedirects,
+ namespaces)
+
+ def embeddedin(self, filter_redirects=None, namespaces=None):
+ """Return an iterator for pages that embed this page as a
template.
+
+ @param filterRedirects: if True, only iterate redirects; if False,
+ omit redirects; if None, do not filter
+ @param namespaces: only iterate pages in these namespaces
+
+ """
+ return self.site().page_embeddedin(self, filter_redirects, namespaces)
+
+ def canBeEdited(self):
+ """Return bool indicating whether this page can be edited.
+
+ This returns True if and only if:
+ - page is unprotected, and bot has an account for this site, or
+ - page is protected, and bot has a sysop account for this site.
+
+ """
+ return self.site().page_can_be_edited(self)
+
+ def botMayEdit(self):
+ """Return True if this page allows bots to edit it.
+
+ This will be True if the page doesn't contain {{bots}} or
+ {{nobots}}, or it contains them and the active bot is allowed to
+ edit this page. (This method is only useful on those sites that
+ recognize the bot-exclusion protocol; on other sites, it will always
+ return True.)
+
+ The framework enforces this restriction by default. It is possible
+ to override this by setting ignore_bot_templates=True in
+ user_config.py, or using page.put(force=True).
+
+ """ # TODO: move this to Site object?
+ if config.ignore_bot_templates: #Check the "master ignore switch"
+ return True
+ try:
+ templates = self.templatesWithParams();
+ except (pywikibot.NoPage,
+ pywikibot.IsRedirectPage,
+ pywikibot.SectionError):
+ return True
+ for template in templates:
+ title = template[0].title(withNamespace=False)
+ if title == 'Nobots':
+ return False
+ elif title == 'Bots':
+ if len(template[1]) == 0:
+ return True
+ else:
+ (ttype, bots) = template[1][0].split('=', 1)
+ bots = bots.split(',')
+ if ttype == 'allow':
+ if 'all' in bots or username in bots:
+ return True
+ else:
+ return False
+ if ttype == 'deny':
+ if 'all' in bots or username in bots:
+ return False
+ else:
+ return True
+ # no restricting template found
+ return True
+
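As a sketch of how a script would honour the convention checked above ('ExampleBot' and the edit summary are placeholders): a page containing {{nobots}}, or a {{bots|allow=...}} list that excludes the current account, makes botMayEdit() return False.

    if not page.botMayEdit():
        pywikibot.output(u"Skipping %s: excluded by {{bots}}/{{nobots}}" % page.title())
    else:
        page.put(newtext, comment=u'ExampleBot: routine update')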
+ def save(self, comment=None, watch=None, minor=True, force=False,
+ async=False, callback=None):
+ """Save the current contents of page's text to the wiki.
+
+ @param comment: The edit summary for the modification (optional, but
+ most wikis strongly encourage its use)
+ @type comment: unicode
+ @param watch: if True, add or if False, remove this Page to/from bot
+ user's watchlist; if None, leave watchlist status unchanged
+ @type watch: bool or None
+ @param minor: if True, mark this edit as minor
+ @type minor: bool
+ @param force: if True, ignore botMayEdit() setting
+ @type force: bool
+ @param async: if True, launch a separate thread to save
+ asynchronously
+ @param callback: a callable object that will be called after the
+ page put operation. This object must take two arguments: (1) a
+ Page object, and (2) an exception instance, which will be None
+ if the page was saved successfully. The callback is intended for
+ use by bots that need to keep track of which saves were
+ successful.
+
+ """
+ if not comment:
+ comment = pywikibot.default_comment # needs to be defined
+ if watch is None:
+ unwatch = False
+ watch = False
+ else:
+ unwatch = not watch
+ if not force and not self.botMayEdit():
+ raise pywikibot.PageNotSaved(
+ "Page %s not saved; editing restricted by {{bots}} template"
+ % self.title(asLink=True))
+ if async:
+ thd = threading.Thread(
+ target=self._save,
+ args=(comment, minor, watch, unwatch, callback)
+ )
+ pywikibot.threadpool.append(thd)
+ thd.start()
+ else:
+ self._save(comment, minor, watch, unwatch, callback)
+
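A short sketch of the asynchronous save path with a completion callback (names are illustrative; assumes the thread pool referenced above is available):

    def report(page, err):
        # err is None on success, otherwise the exception raised while saving
        if err is None:
            pywikibot.output(u"Saved %s" % page.title(asLink=True))
        else:
            pywikibot.output(u"Failed to save %s: %s" % (page.title(asLink=True), err))

    page.text = page.text.replace(u'colour', u'color')
    page.save(comment=u'Bot: spelling tweak', minor=True, async=True, callback=report)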
+ def _save(self, comment, minor, watch, unwatch, callback):
+ err = None
+ try:
+ done = self.site().editpage(self, summary=comment, minor=minor,
+ watch=watch, unwatch=unwatch)
+ if not done:
+ logger.warn("Page %s not saved" % self.title(asLink=True))
+ else:
+ logger.info("Page %s saved" % self.title(asLink=True))
+ except pywikibot.Error, err:
+ logger.exception("Error saving page %s" % self.title(asLink=True))
+ if callback:
+ callback(self, err)
+
+ def put(self, newtext, comment=u'', watchArticle=None, minorEdit=True,
+ force=False, async=False, callback=None):
+ """Save the page with the contents of the first argument as the
text.
+
+ This method is maintained primarily for backwards-compatibility.
+ For new code, using Page.save() is preferred. See save() method
+ docs for all parameters not listed here.
+
+ @param newtext: The complete text of the revised page.
+ @type newtext: unicode
+
+ """
+ self.text = newtext
+ return self.save(comment, watchArticle, minorEdit, force,
+ async, callback)
+
+ def put_async(self, newtext, comment=u'', watchArticle=None,
+ minorEdit=True, force=False, callback=None):
+ """Put page on queue to be saved to wiki asynchronously.
+
+ Asynchronous version of put (takes the same arguments), which places
+ pages on a queue to be saved by a daemon thread. All arguments are
+ the same as for .put(). This version is maintained solely for
+ backwards-compatibility.
+
+ """
+ return self.put(newtext, comment, watchArticle,
+ minorEdit, force, async=True, callback=callback)
+
+ def linkedPages(self):
+ """Iterate Pages that this Page links to.
+
+ Only returns pages from "normal" internal links. Image and category
+ links are omitted unless prefixed with ":". Embedded templates are
+ omitted (but links within them are returned). All interwiki and
+ external links are omitted.
+
+ @return: a generator that yields Page objects.
+
+ """
+ return self.site().pagelinks(self)
+
+ def interwiki(self, expand=True):
+ """Iterate interwiki links in the page text, excluding language
links.
+
+ @param expand: if True (default), include interwiki links found in
+ templates transcluded onto this page; if False, only iterate
+ interwiki links found in this page's own wikitext
+ @return: a generator that yields Link objects
+
+ """
+ # This function does not exist in the API, so it has to be
+ # implemented by screen-scraping
+ if expand:
+ text = self.expand_text()
+ else:
+ text = self.text
+ for linkmatch in pywikibot.link_regex.finditer(
+ pywikibot.textlib.removeDisabledParts(text)):
+ linktitle = linkmatch.group("title")
+ link = Link(linktitle, self.site())
+ # only yield links that are to a different site and that
+ # are not language links
+ try:
+ if link.site != self.site():
+ if linktitle.lstrip().startswith(":"):
+ # initial ":" indicates not a language link
+ yield link
+ elif link.site.family != self.site().family:
+ # link to a different family is not a language link
+ yield link
+ except pywikibot.Error:
+ # ignore any links with invalid contents
+ continue
+
+ def langlinks(self):
+ """Iterate all interlanguage links on this page.
+
+ @return: a generator that yields Link objects.
+
+ """
+ return self.site().pagelanglinks(self)
+
+ @deprecate_arg("followRedirects", None)
+ @deprecate_arg("loose", None)
+ def imagelinks(self, followRedirects=None, loose=None):
+ """Iterate ImagePage objects for images displayed on this Page.
+
+ @return: a generator that yields ImagePage objects.
+
+ """
+ return self.site().pageimages(self)
+
+ def templates(self):
+ """Iterate Page objects for templates used on this Page.
+
+ Template parameters are ignored. This method only returns embedded
+ templates, not template pages that happen to be referenced through
+ a normal link.
+
+ """
+ return self.site().pagetemplates(self)
+
+ def templatesWithParams(self):
+ """Iterate templates used on this Page.
+
+ @return: a generator that yields a tuple for each use of a template
+ in the page, with the template Page as the first entry and a list of
+ parameters as the second entry.
+
+ """
+ templates = pywikibot.textlib.extract_templates_and_params(self.text)
+ # backwards-compatibility: convert the dict returned as the second
+ # element into a list in the format used by old scripts
+ result = []
+ for template in templates:
+ args = template[1]
+ positional = []
+ named = {}
+ for key in sorted(args.keys()):
+ try:
+ int(key)
+ except ValueError:
+ named[key] = args[key]
+ else:
+ positional.append(args[key])
+ for name in named:
+ positional.append("%s=%s" % (name, named[name]))
+ result.append((pywikibot.Page(
+ pywikibot.Link(template[0], self.site())),
+ positional))
+ return result
+
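Iterating the (template, parameters) pairs returned above might look like this (sketch; the template name is a placeholder):

    for tmpl, params in page.templatesWithParams():
        if tmpl.title(withNamespace=False) == u'Infobox settlement':
            # params is a flat list such as [u'12345', u'name=Springfield']
            pywikibot.output(u'; '.join(params))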
+ @deprecate_arg("nofollow_redirects", None)
+ def categories(self, withSortKey=False):
+ """Iterate categories that the article is in.
+
+ @param withSortKey: if True, include the sort key in each Category.
+ @return: a generator that yields Category objects.
+
+ """
+ return self.site().pagecategories(self, withSortKey=withSortKey)
+
+ def extlinks(self):
+ """Iterate all external URLs (not interwiki links) from this
page.
+
+ @return: a generator that yields unicode objects containing URLs.
+
+ """
+ return self.site().page_extlinks(self)
+
+ def getRedirectTarget(self):
+ """Return a Page object for the target this Page redirects to.
+
+ If this page is not a redirect page, will raise an IsNotRedirectPage
+ exception. This method also can raise a NoPage exception.
+
+ """
+ if not self.isRedirectPage():
+ raise pywikibot.IsNotRedirectPage
+ if not isinstance(self._redir, Page):
+ self.site().getredirtarget(self)
+ return self._redir
+
+ @deprecate_arg("forceReload", None)
+ def getVersionHistory(self, reverseOrder=False, getAll=False,
+ revCount=500):
+ """Load the version history page and return history information.
+
+ Return value is a list of tuples, where each tuple represents one
+ edit and is built of revision id, edit date/time, user name, and
+ edit summary. Starts with the most current revision, unless
+ reverseOrder is True. Defaults to getting the first revCount edits,
+ unless getAll is True.
+
+ """
+ if getAll:
+ limit = None
+ else:
+ limit = revCount
+ self.site().loadrevisions(self, getText=False, rvdir=reverseOrder,
+ limit=limit)
+ if getAll:
+ revCount = len(self._revisions)
+ return [ ( self._revisions[rev].revid,
+ self._revisions[rev].timestamp,
+ self._revisions[rev].user,
+ self._revisions[rev].comment
+ ) for rev in sorted(self._revisions.keys(),
+ reverse=not reverseOrder)[ : revCount]
+ ]
+
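Each entry is a (revid, timestamp, user, comment) tuple, so the last few edits can be summarized as in this sketch:

    for revid, timestamp, user, comment in page.getVersionHistory(revCount=10):
        pywikibot.output(u"r%s by %s at %s: %s" % (revid, user, timestamp, comment))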
+ def getVersionHistoryTable(self, forceReload=False, reverseOrder=False,
+ getAll=False, revCount=500):
+ """Return the version history as a wiki table."""
+ result = '{| border="1"\n'
+ result += '! oldid || date/time || username || edit summary\n'
+ for oldid, time, username, summary \
+ in self.getVersionHistory(forceReload=forceReload,
+ reverseOrder=reverseOrder,
+ getAll=getAll, revCount=revCount):
+ result += '|----\n'
+ result += '| %s || %s || %s || <nowiki>%s</nowiki>\n'\
+ % (oldid, time, username, summary)
+ result += '|}\n'
+ return result
+
+ def fullVersionHistory(self):
+ """Iterate all previous versions including wikitext.
+
+ @return: A generator that yields tuples consisting of revision ID,
+ edit date/time, user name and content
+ """
+ return self.site().loadrevisions(self, withText=True)
+
+ def contributingUsers(self):
+ """Return a set of usernames (or IPs) of users who edited this
page."""
+ edits = self.getVersionHistory()
+ users = set([edit[2] for edit in edits])
+ return users
+
+ @deprecate_arg("throttle", None)
+ def move(self, newtitle, reason=None, movetalkpage=True, sysop=False,
+ deleteAndMove=False, safe=True):
+ """Move this page to a new title.
+
+ @param newtitle: The new page title.
+ @param reason: The edit summary for the move.
+ @param movetalkpage: If true, move this page's talk page (if it exists)
+ @param sysop: Try to move using sysop account, if available
+ @param deleteAndMove: if move succeeds, delete the old page
+ (usually requires sysop privileges, depending on wiki settings)
+ @param safe: If false, attempt to delete existing page at newtitle
+ (if there is one) and then move this page to that title
+
+ """
+ if reason is None:
+ logger.info(u'Moving %s to [[%s]].'
+ % (self.title(asLink=True), newtitle))
+ reason = pywikibot.input(u'Please enter a reason for the move:')
+ # TODO: implement "safe" parameter
+ # TODO: implement "sysop" parameter
+ return self.site().movepage(self, newtitle, reason,
+ movetalk=movetalkpage,
+ noredirect=deleteAndMove)
+
+ @deprecate_arg("throttle", None)
+ def delete(self, reason=None, prompt=True, throttle=None, mark=False):
+ """Deletes the page from the wiki. Requires administrator status.
+
+ @param reason: The edit summary for the deletion.
+ @param prompt: If true, prompt user for confirmation before deleting.
+ @param mark: if true, and user does not have sysop rights, place a
+ speedy-deletion request on the page instead.
+
+ """
+ if reason is None:
+ logger.info(u'Deleting %s.' % (self.title(asLink=True)))
+ reason = pywikibot.input(u'Please enter a reason for the deletion:')
+ answer = u'y'
+ if prompt and not hasattr(self.site(), '_noDeletePrompt'):
+ answer = pywikibot.inputChoice(u'Do you want to delete %s?'
+ % self.title(asLink = True, forceInterwiki = True),
+ ['Yes', 'No', 'All'],
+ ['Y', 'N', 'A'],
+ 'N')
+ if answer in ['a', 'A']:
+ answer = 'y'
+ self.site()._noDeletePrompt = True
+ if answer in ['y', 'Y']:
+ return self.site().delete(self, reason, mark=mark)
+
+ def loadDeletedRevisions(self):
+ """Retrieve all deleted revisions for this Page from
Special/Undelete.
+
+ Stores all revisions' timestamps, dates, editors and comments in
+ self._deletedRevs attribute.
+
+ @return: list of timestamps (which can be used to retrieve revisions
+ later on).
+
+ """
+ return self.site().loadDeletedRevisions(self)
+
+ def getDeletedRevision(self, timestamp, retrieveText=False):
+ """Return a particular deleted revision by timestamp.
+
+ @return: a list of [date, editor, comment, text, restoration
+ marker]. text will be None, unless retrieveText is True (or has
+ been retrieved earlier). If timestamp is not found, returns
+ None.
+
+ """
+ return self.site().getDeletedRevision(self, timestamp,
+ getText=retrieveText)
+
+ def markDeletedRevision(self, timestamp, undelete=True):
+ """Mark the revision identified by timestamp for undeletion.
+
+ @param undelete: if False, mark the revision to remain deleted.
+
+ """
+ if self._deletedRevs == None:
+ self.loadDeletedRevisions()
+ if not self._deletedRevs.has_key(timestamp):
+ #TODO: Throw an exception?
+ return None
+ self._deletedRevs[timestamp][4] = undelete
+ self._deletedRevsModified = True
+
+ @deprecate_arg("throttle", None)
+ def undelete(self, comment=None):
+ """Undelete revisions based on the markers set by previous calls.
+
+ If no calls have been made since loadDeletedRevisions(), everything
+ will be restored.
+
+ Simplest case::
+ Page(...).undelete('This will restore all revisions')
+
+ More complex::
+ pg = Page(...)
+ revs = pg.loadDeletedRevisions()
+ for rev in revs:
+ if ... #decide whether to undelete a revision
+ pg.markDeletedRevision(rev) #mark for undeletion
+ pg.undelete('This will restore only selected revisions.')
+
+ @param comment: The undeletion edit summary.
+
+ """
+ if comment is None:
+ logger.info(u'Preparing to undelete %s.'
+ % (self.title(asLink=True)))
+ comment = pywikibot.input(
+ u'Please enter a reason for the undeletion:')
+ return self.site().undelete(self, comment)
+
+ @deprecate_arg("throttle", None)
+ def protect(self, edit='sysop', move='sysop', unprotect=False,
+ reason=None, prompt=True):
+ """(Un)protect a wiki page. Requires administrator status.
+
+ Valid protection levels (in MediaWiki 1.12) are '' (equivalent to
+ 'none'), 'autoconfirmed', and 'sysop'.
+
+ @param edit: Level of edit protection
+ @param move: Level of move protection
+ @param unprotect: If true, unprotect the page (equivalent to setting
+ all protection levels to '')
+ @param reason: Edit summary.
+ @param prompt: If true, ask user for confirmation.
+
+ """
+ if reason is None:
+ if unprotect:
+ un = u'un'
+ else:
+ un = u''
+ logger.info(u'Preparing to %sprotect %s.'
+ % (un, self.title(asLink=True)))
+ reason = pywikibot.input(u'Please enter a reason for the action:')
+ if unprotect:
+ edit = move = ""
+ answer = 'y'
+ if prompt and not hasattr(self.site(), '_noProtectPrompt'):
+ answer = pywikibot.inputChoice(
+ u'Do you want to change the protection level of %s?'
+ % self.title(asLink=True, forceInterwiki = True),
+ ['Yes', 'No', 'All'], ['Y', 'N', 'A'], 'N')
+ if answer in ['a', 'A']:
+ answer = 'y'
+ self.site()._noProtectPrompt = True
+ if answer in ['y', 'Y']:
+ return self.site().protect(self, edit, move, reason)
+
+ def change_category(self, oldCat, newCat, comment=None, sortKey=None,
+ inPlace=True):
+ """Remove page from oldCat and add it to newCat.
+
+ oldCat and newCat should be Category objects.
+ If newCat is None, the category will be removed.
+
+ """ # TODO: document remaining arguments
+ cats = list(self.categories())
+ site = self.site()
+ changesMade = False
+
+ if not self.canBeEdited():
+ pywikibot.output(u"Can't edit %s, skipping it..."
+ % self.title(asLink=True))
+ return False
+ if inPlace == True:
+ newtext = pywikibot.textlib.replaceCategoryInPlace(
+ self.text, oldCat, newCat)
+ if newtext == self.text:
+ pywikibot.output(
+ u'No changes made in page %s.'
+ % self.title(asLink=True))
+ return False
+ try:
+ self.put(newtext, comment)
+ return True
+ except pywikibot.EditConflict:
+ pywikibot.output(
+ u'Skipping %s because of edit conflict'
+ % self.title(asLink=True))
+ except pywikibot.LockedPage:
+ pywikibot.output(u'Skipping locked page %s'
+ % self.title(asLink=True))
+ except pywikibot.SpamfilterError, error:
+ pywikibot.output(
+ u'Changing page %s blocked by spam filter (URL=%s)'
+ % (self.title(asLink=True), error.url))
+ except pywikibot.NoUsername:
+ pywikibot.output(
+ u"Page %s not saved; sysop privileges required."
+ % self.title(asLink=True))
+ except pywikibot.PageNotSaved, error:
+ pywikibot.output(u"Saving page %s failed: %s"
+ % (self.title(asLink=True), error.message))
+ return False
+
+ # This loop will replace all occurrences of the category to be changed,
+ # and remove duplicates.
+ newCatList = []
+ newCatSet = set()
+ for i in range(len(cats)):
+ cat = cats[i]
+ if cat == oldCat:
+ changesMade = True
+ if not sortKey:
+ sortKey = cat.sortKey
+ if newCat:
+ if newCat.title() not in newCatSet:
+ newCategory = Category(site, newCat.title(),
+ sortKey=sortKey)
+ newCatSet.add(newCat.title())
+ newCatList.append(newCategory)
+ elif cat.title() not in newCatSet:
+ newCatSet.add(cat.title())
+ newCatList.append(cat)
+
+ if not changesMade:
+ pywikibot.output(u'ERROR: %s is not in category %s!'
+ % (self.title(asLink=True), oldCat.title()))
+ else:
+ try:
+ text = pywikibot.textlib.replaceCategoryLinks(self.text,
+ newCatList)
+ except ValueError:
+ # Make sure that the only way replaceCategoryLinks() can return
+ # a ValueError is in the case of interwiki links to self.
+ pywikibot.output(
+ u'Skipping %s because of interwiki link to self' % self)
+ try:
+ self.put(text, comment)
+ except pywikibot.EditConflict:
+ pywikibot.output(
+ u'Skipping %s because of edit conflict' % self.title())
+ except pywikibot.SpamfilterError, e:
+ pywikibot.output(
+ u'Skipping %s because of blacklist entry %s'
+ % (self.title(), e.url))
+ except pywikibot.LockedPage:
+ pywikibot.output(
+ u'Skipping %s because page is locked' % self.title())
+ except pywikibot.PageNotSaved, error:
+ pywikibot.output(u"Saving page %s failed: %s"
+ % (self.title(asLink=True), error.message))
+
+######## DEPRECATED METHODS ########
+
+ def encoding(self):
+ """DEPRECATED: use Site.encoding() instead"""
+ logger.debug(u"Page.encoding() is deprecated; use Site.encoding().")
+ return self.site().encoding()
+
+ def titleWithoutNamespace(self, underscore=False):
+ """DEPRECATED: use self.title(withNamespace=False)
instead."""
+ logger.debug(
+ u"Page.titleWithoutNamespace() method is deprecated.")
+ return self.title(underscore=underscore, withNamespace=False,
+ withSection=False)
+
+ def titleForFilename(self):
+ """DEPRECATED: use self.title(as_filename=True)
instead."""
+ logger.debug(
+ u"Page.titleForFilename() method is deprecated.")
+ return self.title(as_filename=True)
+
+ def sectionFreeTitle(self, underscore=False):
+ """DEPRECATED: use self.title(withSection=False)
instead."""
+ logger.debug(
+ u"Page.sectionFreeTitle() method is deprecated.")
+ return self.title(underscore=underscore, withSection=False)
+
+ def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False):
+ """DEPRECATED: use self.title(asLink=True)
instead."""
+ logger.debug(u"Page.aslink() method is deprecated.")
+ return self.title(asLink=True, forceInterwiki=forceInterwiki,
+ allowInterwiki=not noInterwiki, textlink=textlink)
+
+ def urlname(self):
+ """Return the Page title encoded for use in an URL.
+
+ DEPRECATED: use self.title(asUrl=True) instead.
+
+ """
+ logger.debug(u"Page.urlname() method is deprecated.")
+ return self.title(asUrl=True)
+
+####### DISABLED METHODS (warnings provided) ######
+ # these methods are easily replaced by editing the page's text using
+ # textlib methods and then using put() on the result.
+
+ def removeImage(self, image, put=False, summary=None, safe=True):
+ """Old method to remove all instances of an image from
page."""
+ logger.warning(u"Page.removeImage() is no longer supported.")
+
+ def replaceImage(self, image, replacement=None, put=False, summary=None,
+ safe=True):
+ """Old method to replace all instances of an image with
another."""
+ logger.warning(u"Page.replaceImage() is no longer supported.")
+
+
+class ImagePage(Page):
+ """A subclass of Page representing an image descriptor wiki page.
+
+ Supports the same interface as Page, with the following added methods:
+
+ getImagePageHtml : Download image page and return raw HTML text.
+ fileURL : Return the URL for the image described on this
+ page.
+ fileIsOnCommons : Return True if image stored on Wikimedia
+ Commons.
+ fileIsShared : Return True if image stored on Wikitravel
+ shared repository.
+ getFileMd5Sum : Return image file's MD5 checksum.
+ getFileVersionHistory : Return the image file's version history.
+ getFileVersionHistoryTable: Return the version history in the form of a
+ wiki table.
+ usingPages : Iterate Pages on which the image is displayed.
+
+ """
+ def __init__(self, source, title=u"", insite=None):
+ Page.__init__(self, source, title, 6)
+ if self.namespace() != 6:
+ raise ValueError(u"'%s' is not in the image namespace!" %
title)
+
+ def getImagePageHtml(self):
+ """
+ Download the image page, and return the HTML, as a unicode string.
+
+ Caches the HTML code, so that if you run this method twice on the
+ same ImagePage object, the page will only be downloaded once.
+ """
+ if not hasattr(self, '_imagePageHtml'):
+ from pywikibot.data import http
+ path = "%s/index.php?title=%s" \
+ % (self.site().scriptpath(), self.title(asUrl=True))
+ self._imagePageHtml = http.request(self.site(), path)
+ return self._imagePageHtml
+
+ def fileUrl(self):
+ """Return the URL for the image described on this
page."""
+ # TODO add scaling option?
+ if not hasattr(self, '_imageinfo'):
+ self._imageinfo = self.site().getimageinfo(self) #FIXME
+ return self._imageinfo['url']
+
+ def fileIsOnCommons(self):
+ """Return True if the image is stored on Wikimedia
Commons"""
+ return self.fileUrl().startswith(
+ 'http://upload.wikimedia.org/wikipedia/commons/')
+
+ def fileIsShared(self):
+ """Return True if image is stored on any known shared
repository."""
+ # as of now, the only known repositories are commons and wikitravel
+ if 'wikitravel_shared' in self.site().shared_image_repository():
+ return self.fileUrl().startswith(
+ u'http://wikitravel.org/upload/shared/')
+ return self.fileIsOnCommons()
+
+ def getFileMd5Sum(self):
+ """Return image file's MD5 checksum."""
+ logger.debug(
+ "ImagePage.getFileMd5Sum() is deprecated; use getFileSHA1Sum().")
+# FIXME: MD5 might be performed on incomplete file due to server disconnection
+# (see bug #1795683).
+ import md5, urllib
+ f = urllib.urlopen(self.fileUrl())
+ # TODO: check whether this needs a User-Agent header added
+ md5Checksum = md5.new(f.read()).hexdigest()
+ f.close()
+ return md5Checksum
+
+ def getFileSHA1Sum(self):
+ """Return image file's SHA1 checksum."""
+ if not hasattr(self, '_imageinfo'):
+ self._imageinfo = self.site().getimageinfo(self) #FIXME
+ return self._imageinfo['sha1']
+
+ def getFileVersionHistory(self):
+ """Return the image file's version history.
+
+ @return: An iterator yielding tuples containing (timestamp,
+ username, resolution, filesize, comment).
+
+ """
+ #TODO; return value may need to change
+ return self.site().getimageinfo(self, history=True) #FIXME
+
+ def getFileVersionHistoryTable(self):
+ """Return the version history in the form of a wiki
table."""
+ lines = []
+ #TODO: if getFileVersionHistory changes, make sure this follows it
+ for (datetime, username, resolution, size, comment) \
+ in self.getFileVersionHistory():
+ lines.append('| %s || %s || %s || %s || <nowiki>%s</nowiki>' \
+ % (datetime, username, resolution, size, comment))
+ return u'{| border="1"\n! date/time || username || resolution ||
size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}'
+
+ def usingPages(self):
+ """Yield Pages on which the image is displayed."""
+ return self.site().getimageusage(self)
+
+
+class Category(Page):
+ """A page in the Category: namespace"""
+
+ @deprecate_arg("sortKey", None)
+ def __init__(self, source, title=u"", insite=None):
+ """All parameters are the same as for Page() constructor.
+
+ """
+ Page.__init__(self, source, title, 14)
+ if self.namespace() != 14:
+ raise ValueError(u"'%s' is not in the category namespace!"
+ % title)
+
+ @deprecate_arg("forceInterwiki", None)
+ @deprecate_arg("textlink", None)
+ @deprecate_arg("noInterwiki", None)
+ def aslink(self, sortKey=u''):
+ """Return a link to place a page in this Category.
+
+ Use this only to generate a "true" category link, not for interwikis
+ or text links to category pages.
+
+ @param sortKey: The sort key for the article to be placed in this
+ Category; if omitted, default sort key is used.
+ @type sortKey: (optional) unicode
+
+ """
+ if sortKey:
+ titleWithSortKey = '%s|%s' % (self.title(withSection=False),
+ sortKey)
+ else:
+ titleWithSortKey = self.title(withSection=False)
+ return '[[%s]]' % titleWithSortKey
+
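The link produced above can simply be appended to a page's text to categorize it (sketch; category, sort key and page are placeholders, site as in the earlier constructor sketch):

    people = Category(site, u'Category:Living people')
    page.text += u'\n' + people.aslink(sortKey=u'Doe, John')   # [[Category:Living people|Doe, John]]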
+ @deprecate_arg("startFrom", None)
+ @deprecate_arg("cacheResults", None)
+ def subcategories(self, recurse=False):
+ """Iterate all subcategories of the current category.
+
+ @param recurse: if not False or 0, also iterate subcategories of
+ subcategories. If an int, limit recursion to this number of
+ levels. (Example: recurse=1 will iterate direct subcats and
+ first-level sub-sub-cats, but no deeper.)
+ @type recurse: int or bool
+
+ """
+ if not isinstance(recurse, bool) and recurse:
+ recurse = recurse - 1
+ if not hasattr(self, "_subcats"):
+ self._subcats = []
+ for member in self.site().categorymembers(self, namespaces=[14]):
+ subcat = Category(self.site(), member.title())
+ self._subcats.append(subcat)
+ yield subcat
+ if recurse:
+ for item in subcat.subcategories(recurse):
+ yield item
+ else:
+ for subcat in self._subcats:
+ yield subcat
+ if recurse:
+ for item in subcat.subcategories(recurse):
+ yield item
+
+ @deprecate_arg("startFrom", None)
+ def articles(self, recurse=False):
+ """
+ Yields all articles in the current category.
+
+ @param recurse: if not False or 0, also iterate articles in
+ subcategories. If an int, limit recursion to this number of
+ levels. (Example: recurse=1 will iterate articles in first-level
+ subcats, but no deeper.)
+ @type recurse: int or bool
+
+ """
+ namespaces = [x for x in self.site().namespaces().keys()
+ if x>=0 and x!=14]
+ for member in self.site().categorymembers(self,
+ namespaces=namespaces):
+ yield member
+ if recurse:
+ if not isinstance(recurse, bool) and recurse:
+ recurse = recurse - 1
+ for subcat in self.subcategories():
+ for article in subcat.articles(recurse):
+ yield article
+
+ def isEmptyCategory(self):
+        """Return True if category has no members (including subcategories)."""
+ for member in self.site().categorymembers(self, limit=1):
+ return False
+ return True
+
+ def copyTo(self, catname):
+ """
+ Copy text of category page to a new page. Does not move contents.
+
+ @param catname: New category title (without namespace)
+ @return: True if copying was successful, False if target page
+ already existed.
+
+ """
+ # This seems far too specialized to be in the top-level framework
+ catname = self.site().category_namespace() + ':' + catname
+ targetCat = Category(self.site(), catname)
+ if targetCat.exists():
+ logger.warn('Target page %s already exists!'
+ % targetCat.title())
+ return False
+ else:
+ logger.info('Moving text from %s to %s.'
+ % (self.title(), targetCat.title()))
+ authors = ', '.join(self.contributingUsers())
+ creationSummary = pywikibot.translate(
+ self.site(), msg_created_for_renaming
+ ) % (self.title(), authors)
+ targetCat.put(self.get(), creationSummary)
+ return True
+
+ def copyAndKeep(self, catname, cfdTemplates):
+ """Copy partial category page text (not contents) to a new title.
+
+ Like copyTo above, except this removes a list of templates (like
+ deletion templates) that appear in the old category text. It also
+ removes all text between the two HTML comments BEGIN CFD TEMPLATE
+ and END CFD TEMPLATE. (This is to deal with CFD templates that are
+ substituted.)
+
+ Returns true if copying was successful, false if target page already
+ existed.
+
+ @param catname: New category title (without namespace)
+ @param cfdTemplates: A list (or iterator) of templates to be removed
+ from the page text
+ @return: True if copying was successful, False if target page
+ already existed.
+
+ """
+ # I don't see why we need this as part of the framework either
+ catname = self.site().category_namespace() + ':' + catname
+ targetCat = Category(self.site(), catname)
+ if targetCat.exists():
+ logger.warn('Target page %s already exists!'
+ % targetCat.title())
+ return False
+ else:
+ logger.info('Moving text from %s to %s.'
+ % (self.title(), targetCat.title()))
+ authors = ', '.join(self.contributingUsers())
+ creationSummary = pywikibot.translate(
+ self.site(), msg_created_for_renaming
+ ) % (self.title(), authors)
+ newtext = self.get()
+ for regexName in cfdTemplates:
+ matchcfd = re.compile(r"{{%s.*?}}" % regexName, re.IGNORECASE)
+ newtext = matchcfd.sub('',newtext)
+ matchcomment = re.compile(
+                r"<!--BEGIN CFD TEMPLATE-->.*?<!--END CFD TEMPLATE-->",
+ re.IGNORECASE | re.MULTILINE | re.DOTALL)
+ newtext = matchcomment.sub('', newtext)
+ pos = 0
+ while (newtext[pos:pos+1] == "\n"):
+ pos = pos + 1
+ newtext = newtext[pos:]
+ targetCat.put(newtext, creationSummary)
+ return True
+
+#### DEPRECATED METHODS ####
+ def subcategoriesList(self, recurse=False):
+        """DEPRECATED: Equivalent to list(self.subcategories(...))"""
+ logger.debug("Category.subcategoriesList() method is deprecated.")
+ return sorted(list(set(self.subcategories(recurse))))
+
+ def articlesList(self, recurse=False):
+        """DEPRECATED: equivalent to list(self.articles(...))"""
+ logger.debug("Category.articlesList() method is deprecated.")
+ return sorted(list(set(self.articles(recurse))))
+
+ def supercategories(self):
+ """DEPRECATED: equivalent to self.categories()"""
+ logger.debug("Category.supercategories() method is deprecated.")
+ return self.categories()
+
+ def supercategoriesList(self):
+        """DEPRECATED: equivalent to list(self.categories(...))"""
+        logger.debug("Category.supercategoriesList() method is deprecated.")
+ return sorted(list(set(self.categories())))
+
+
+class Revision(object):
+    """A structure holding information about a single revision of a Page."""
+ def __init__(self, revid, timestamp, user, anon=False, comment=u"",
+ text=None, minor=False):
+ """All parameters correspond to object attributes (e.g., revid
+ parameter is stored as self.revid)
+
+ @param revid: Revision id number
+ @type revid: int
+ @param text: Revision wikitext.
+ @type text: unicode, or None if text not yet retrieved
+ @param timestamp: Revision time stamp (in ISO 8601 format)
+ @type timestamp: unicode
+ @param user: user who edited this revision
+ @type user: unicode
+ @param anon: user is unregistered
+ @type anon: bool
+ @param comment: edit comment text
+ @type comment: unicode
+ @param minor: edit flagged as minor
+ @type minor: bool
+
+ """
+ self.revid = revid
+ self.text = text
+ self.timestamp = timestamp
+ self.user = user
+ self.anon = anon
+ self.comment = comment
+ self.minor = minor
+
+
+class Link(object):
+ """A Mediawiki link (local or interwiki)
+
+ Has the following attributes:
+
+ - site: The Site object for the wiki linked to
+ - namespace: The namespace of the page linked to (int)
+ - title: The title of the page linked to (unicode); does not include
+ namespace or section
+ - section: The section of the page linked to (unicode or None); this
+ contains any text following a '#' character in the title
+ - anchor: The anchor text (unicode or None); this contains any text
+ following a '|' character inside the link
+
+ """
+ illegal_titles_pattern = re.compile(
+ # Matching titles will be held as illegal.
+        u'''[^ %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\u0080-\uFFFF+]'''
+ # URL percent encoding sequences interfere with the ability
+ # to round-trip titles -- you can't link to them consistently.
+ u'|%[0-9A-Fa-f]{2}'
+ # XML/HTML character references produce similar issues.
+ u'|&[A-Za-z0-9\x80-\xff]+;'
+ u'|&#[0-9]+;'
+ u'|&#x[0-9A-Fa-f]+;'
+ )
+
+ def __init__(self, text, source=None, defaultNamespace=0):
+ """Constructor
+
+ @param text: the link text (everything appearing between [[ and ]]
+ on a wiki page)
+ @type text: unicode
+ @param source: the Site on which the link was found (not necessarily
+ the site to which the link refers)
+ @type source: Site
+ @param defaultNamespace: a namespace to use if the link does not
+ contain one (defaults to 0)
+ @type defaultNamespace: int
+
+ """
+ self._text = text
+ self._source = source
+ self._defaultns = defaultNamespace
+
+ def parse(self):
+        """Parse text; called internally when accessing attributes"""
+
+ # First remove the anchor, which is stored unchanged, if there is one
+ if u"|" in self._text:
+ self._text, self._anchor = self._text.split(u"|", 1)
+ else:
+ self._anchor = None
+
+ if self._source is None:
+ self._source = pywikibot.Site()
+ self._site = self._source
+
+ # Clean up the name, it can come from anywhere.
+ # Convert HTML entities to unicode
+ t = html2unicode(self._text)
+
+ # Convert URL-encoded characters to unicode
+ t = url2unicode(t, site=self._site)
+
+ # Normalize unicode string to a NFC (composed) format to allow proper
+ # string comparisons. According to
+        # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/n…
+ # the mediawiki code normalizes everything to NFC, not NFKC (which
+ # might result in information loss).
+ t = unicodedata.normalize('NFC', t)
+
+ # This code was adapted from Title.php : secureAndSplit()
+ #
+ if u'\ufffd' in t:
+ raise pywikibot.Error("Title contains illegal char (\\uFFFD)")
+ self._namespace = self._defaultns
+
+ # Replace underscores by spaces
+ t = t.replace(u"_", u" ")
+ # replace multiple spaces and underscores with a single space
+        while u"  " in t: t = t.replace(u"  ", u" ")
+ # Strip spaces at both ends
+ t = t.strip(" ")
+ # Remove left-to-right and right-to-left markers.
+        t = t.replace(u"\u200e", u"").replace(u"\u200f", u"")
+
+ firstPass = True
+ while u":" in t:
+ # Initial colon indicates main namespace rather than default
+ if t.startswith(u":"):
+ self._namespace = 0
+ # remove the colon but continue processing
+ # remove any subsequent whitespace
+ t = t.lstrip(u":").lstrip(u" ")
+ continue
+
+ fam = self._site.family
+ prefix = t[ :t.index(u":")].lower()
+ ns = self._site.ns_index(prefix)
+ if ns:
+ # Ordinary namespace
+                t = t[t.index(u":"): ].lstrip(u":").lstrip(u" ")
+ self._namespace = ns
+ break
+ if prefix in fam.langs.keys()\
+ or prefix in fam.get_known_families(site=self._site):
+ # looks like an interwiki link
+ if not firstPass:
+ # Can't make a local interwiki link to an interwiki link.
+ raise pywikibot.Error(
+ "Improperly formatted interwiki link '%s'"
+ % self._text)
+                t = t[t.index(u":"): ].lstrip(u":").lstrip(u" ")
+ if prefix in fam.langs.keys():
+ newsite = pywikibot.Site(prefix, fam)
+ else:
+ otherlang = self._site.code
+ familyName = fam.get_known_families(site=self._site)[prefix]
+ if familyName in ['commons', 'meta']:
+ otherlang = familyName
+ try:
+ newsite = pywikibot.Site(otherlang, familyName)
+ except ValueError:
+ raise pywikibot.Error("""\
+%s is not a local page on %s, and the %s family is
+not supported by PyWikiBot!"""
+                              % (self._text, self._site, familyName))
+
+ # Redundant interwiki prefix to the local wiki
+ if newsite == self._site:
+ if not t:
+ # Can't have an empty self-link
+ raise pywikibot.Error(
+ "Invalid link title: '%s'" % self._text)
+ firstPass = False
+ continue
+ self._site = newsite
+ else:
+ break # text before : doesn't match any known prefix
+
+ if u"#" in t:
+ t, sec = t.split(u'#', 1)
+ t, self._section = t.rstrip(), sec.lstrip()
+ else:
+ self._section = None
+
+ # Reject illegal characters.
+ m = Link.illegal_titles_pattern.search(t)
+ if m:
+ raise pywikibot.Error(
+                u"Invalid title: contains illegal char(s) '%s'" % m.group(0))
+
+ # Pages with "/./" or "/../" appearing in the URLs will
+ # often be unreachable due to the way web browsers deal
+ #* with 'relative' URLs. Forbid them explicitly.
+
+ if u'.' in t and (
+ t == u'.' or t == u'..'
+ or t.startswith(u"./")
+ or t.startswith(u"../")
+ or u"/./" in t
+ or u"/../" in t
+ or t.endswith(u"/.")
+ or t.endswith(u"/..")
+ ):
+ raise pywikibot.Error(
+ "Invalid title (contains . / combinations): '%s'"
+ % self._text)
+
+ # Magic tilde sequences? Nu-uh!
+ if u"~~~" in t:
+            raise pywikibot.Error("Invalid title (contains ~~~): '%s'" % self._text)
+
+ if self._namespace != -1 and len(t) > 255:
+            raise pywikibot.Error("Invalid title (over 255 bytes): '%s'" % t)
+
+ if self._site.case() == 'first-letter':
+ t = t[:1].upper() + t[1:]
+
+ # Can't make a link to a namespace alone...
+ # "empty" local links can only be self-links
+ # with a fragment identifier.
+ if not t and self._site == self._source and self._namespace != 0:
+            raise ValueError("Invalid link (no page title): '%s'" % self._text)
+
+ self._title = t
+
+ # define attributes, to be evaluated lazily
+
+ @property
+ def site(self):
+ if not hasattr(self, "_site"):
+ self.parse()
+ return self._site
+
+ @property
+ def namespace(self):
+ if not hasattr(self, "_namespace"):
+ self.parse()
+ return self._namespace
+
+ @property
+ def title(self):
+ if not hasattr(self, "_title"):
+ self.parse()
+ return self._title
+
+ @property
+ def section(self):
+ if not hasattr(self, "_section"):
+ self.parse()
+ return self._section
+
+ @property
+ def anchor(self):
+ if not hasattr(self, "_anchor"):
+ self.parse()
+ return self._anchor
+
+ def astext(self, onsite=None):
+ """Return a text representation of the link.
+
+ @param onsite: if specified, present as a (possibly interwiki) link
+ from the given site; otherwise, present as an internal link on
+ the source site.
+
+ """
+ if onsite is None:
+ onsite = self.site
+ title = self.title
+ if self.namespace:
+ title = onsite.namespace(self.namespace) + ":" + title
+ if self.section:
+ title = title + "#" + self.section
+ if onsite == self.site:
+ return u'[[%s]]' % title
+ if onsite.family == self.site.family:
+ return u'[[%s:%s]]' % (self.site.code, title)
+ if self.site.family.name == self.site.code:
+ # use this form for sites like commons, where the
+ # code is the same as the family name
+ return u'[[%s:%s]]' % (self.site.code,
+ title)
+ return u'[[%s:%s:%s]]' % (self.site.family.name,
+ self.site.code,
+ title)
+
+ def __str__(self):
+ return self.astext()
+
+ def __cmp__(self, other):
+ """Test for equality and inequality of Link objects.
+
+ Link objects are "equal" if and only if they are on the same site
+ and have the same normalized title, including section if any.
+
+ Link objects are sortable by site, then namespace, then title.
+
+ """
+ if not isinstance(other, Link):
+ # especially, return -1 if other is None
+ return -1
+ if not self.site == other.site:
+ return cmp(self.site, other.site)
+ if self.namespace != other.namespace:
+ return cmp(self.namespace, other.namespace)
+ return cmp(self.title, other.title)
+
+
+# Utility functions for parsing page titles
+
+def html2unicode(text, ignore = []):
+    """Return text, replacing HTML entities by equivalent unicode characters."""
+ # This regular expression will match any decimal and hexadecimal entity and
+ # also entities that might be named entities.
+ entityR = re.compile(
+        r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
+ # These characters are Html-illegal, but sadly you *can* find some of
+ # these and converting them to unichr(decimal) is unsuitable
+ convertIllegalHtmlEntities = {
+ 128 : 8364, # €
+ 130 : 8218, # ‚
+ 131 : 402, # ƒ
+ 132 : 8222, # „
+ 133 : 8230, # …
+ 134 : 8224, # †
+ 135 : 8225, # ‡
+ 136 : 710, # ˆ
+ 137 : 8240, # ‰
+ 138 : 352, # Š
+ 139 : 8249, # ‹
+ 140 : 338, # Œ
+ 142 : 381, # Ž
+ 145 : 8216, # ‘
+ 146 : 8217, # ’
+ 147 : 8220, # “
+ 148 : 8221, # ”
+ 149 : 8226, # •
+ 150 : 8211, # –
+ 151 : 8212, # —
+ 152 : 732, # ˜
+ 153 : 8482, # ™
+ 154 : 353, # š
+ 155 : 8250, # ›
+ 156 : 339, # œ
+ 158 : 382, # ž
+ 159 : 376 # Ÿ
+ }
+    #ensuring that illegal &#129; &#141; and &#157, which have no known values,
+ #don't get converted to unichr(129), unichr(141) or unichr(157)
+ ignore = set(ignore) | set([129, 141, 157])
+ result = u''
+ i = 0
+ found = True
+ while found:
+ text = text[i:]
+ match = entityR.search(text)
+ if match:
+ unicodeCodepoint = None
+ if match.group('decimal'):
+ unicodeCodepoint = int(match.group('decimal'))
+ elif match.group('hex'):
+ unicodeCodepoint = int(match.group('hex'), 16)
+ elif match.group('name'):
+ name = match.group('name')
+ if htmlentitydefs.name2codepoint.has_key(name):
+ # We found a known HTML entity.
+ unicodeCodepoint = htmlentitydefs.name2codepoint[name]
+ result += text[:match.start()]
+ try:
+ unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
+ except KeyError:
+ pass
+ if unicodeCodepoint and unicodeCodepoint not in ignore:
+ result += unichr(unicodeCodepoint)
+ else:
+ # Leave the entity unchanged
+ result += text[match.start():match.end()]
+ i = match.end()
+ else:
+ result += text
+ found = False
+ return result
+
+def url2unicode(title, site, site2 = None):
+ """Convert url-encoded text to unicode using site's encoding.
+
+ If site2 is provided, try its encodings as well. Uses the first encoding
+ that doesn't cause an error.
+
+ """
+ # create a list of all possible encodings for both hint sites
+ encList = [site.encoding()] + list(site.encodings())
+ if site2 and site2 <> site:
+ encList.append(site2.encoding())
+ encList += list(site2.encodings())
+ firstException = None
+ # try to handle all encodings (will probably retry utf-8)
+ for enc in encList:
+ try:
+ t = title.encode(enc)
+ t = urllib.unquote(t)
+ return unicode(t, enc)
+ except UnicodeError, ex:
+ if not firstException:
+ firstException = ex
+ pass
+ # Couldn't convert, raise the original exception
+ raise firstException
+
Property changes on: branches/rewrite/pywikibot/page.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
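
For orientation, below is a minimal usage sketch (not part of this commit) showing how the Link and Category classes added to page.py above might be exercised. It assumes a configured default site (user-config.py), uses only constructors, properties, and iterators that appear in the diff, and the category name is a placeholder.

    # Hypothetical usage sketch for the rewrite-branch API shown above (Python 2).
    import pywikibot
    from pywikibot.page import Link, Category

    site = pywikibot.Site()          # default site from the user's configuration

    # Link parses its text lazily; the first property access calls parse().
    link = Link(u"Category:Examples|sort key", source=site)   # placeholder title
    print link.namespace, link.title, link.anchor
    print link.astext(onsite=site)   # rendered back to wikitext

    # Category iterates its members through the Site API wrappers used above.
    cat = Category(site, u"Category:Examples")
    for subcat in cat.subcategories(recurse=1):   # direct subcats plus one level
        print subcat.title()
    for article in cat.articles():
        print article.title()
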
Modified: branches/rewrite/pywikibot/pagegenerators.py
===================================================================
--- branches/rewrite/pywikibot/pagegenerators.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/pagegenerators.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,965 +1,965 @@
-# -*- coding: utf-8 -*-
-"""This module offers a wide variety of page generators. A page generator is an
-object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and
-that yields page objects on which other scripts can then work.
-
-In general, there is no need to run this script directly. It can, however,
-be run for testing purposes. It will then print the page titles to standard
-output.
-
-These parameters are supported to specify which pages titles to print:
-
-&params;
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-import pywikibot
-
-import itertools
-import Queue
-import re
-import sys
-import threading
-
-
-# ported from version 1 for backwards-compatibility
-# most of these functions just wrap a Site or Page method that returns
-# a generator
-
-parameterHelp = """\
--cat Work on all pages which are in a specific category.
- Argument can also be given as "-cat:categoryname" or
- as "-cat:categoryname|fromtitle".
-
--catr Like -cat, but also recursively includes pages in
- subcategories, sub-subcategories etc. of the
- given category.
- Argument can also be given as "-catr:categoryname" or
- as "-catr:categoryname|fromtitle".
-
--subcats Work on all subcategories of a specific category.
- Argument can also be given as "-subcats:categoryname" or
- as "-subcats:categoryname|fromtitle".
-
--subcatsr Like -subcats, but also includes sub-subcategories etc. of
- the given category.
- Argument can also be given as "-subcatsr:categoryname" or
- as "-subcatsr:categoryname|fromtitle".
-
--uncat Work on all pages which are not categorised.
-
--uncatcat Work on all categories which are not categorised.
-
--uncatfiles Work on all files which are not categorised.
-
--file Read a list of pages to treat from the named text file.
- Page titles in the file must be enclosed with [[brackets]].
- Argument can also be given as "-file:filename".
-
--filelinks Work on all pages that use a certain image/media file.
- Argument can also be given as "-filelinks:filename".
-
--yahoo Work on all pages that are found in a Yahoo search.
- Depends on python module pYsearch. See yahoo_appid in
- config.py for instructions.
-
--search Work on all pages that are found in a MediaWiki search
- across all namespaces.
-
--google Work on all pages that are found in a Google search.
- You need a Google Web API license key. Note that Google
- doesn't give out license keys anymore. See google_key in
- config.py for instructions.
- Argument can also be given as "-google:searchstring".
-
--interwiki Work on the given page and all equivalent pages in other
- languages. This can, for example, be used to fight
- multi-site spamming.
- Attention: this will cause the bot to modify
- pages on several wiki sites, this is not well tested,
- so check your edits!
-
--links Work on all pages that are linked from a certain page.
- Argument can also be given as "-links:linkingpagetitle".
-
--new Work on the 60 newest pages. If given as -new:x, will work
- on the x newest pages.
-
--imagelinks Work on all images that are linked from a certain page.
-                  Argument can also be given as "-imagelinks:linkingpagetitle".
-
--newimages Work on the 100 newest images. If given as -newimages:x,
- will work on the x newest images.
-
--ref Work on all pages that link to a certain page.
- Argument can also be given as "-ref:referredpagetitle".
-
--start Specifies that the robot should go alphabetically through
- all pages on the home wiki, starting at the named page.
- Argument can also be given as "-start:pagetitle".
-
- You can also include a namespace. For example,
- "-start:Template:!" will make the bot work on all pages
- in the template namespace.
-
--prefixindex Work on pages commencing with a common prefix.
-
--regex Obsolete, use -titleregex
-
--titleregex Work on titles that match the given regular expression.
-
--transcludes Work on all pages that use a certain template.
- Argument can also be given as "-transcludes:Template:Title".
-
--unusedfiles Work on all description pages of images/media files that are
- not used anywhere.
- Argument can be given as "-unusedfiles:n" where
- n is the maximum number of articles to work on.
-
--unwatched Work on all articles that are not watched by anyone.
- Argument can be given as "-unwatched:n" where
- n is the maximum number of articles to work on.
-
--usercontribs Work on all articles that were edited by a certain user :
- Example : -usercontribs:DumZiBoT
-
--weblink Work on all articles that contain an external link to
- a given URL; may be given as "-weblink:url"
-
--withoutinterwiki Work on all pages that don't have interlanguage links.
- Argument can be given as "-withoutinterwiki:n" where
- n is some number (??).
-"""
-
-docuReplacements = {'&params;': parameterHelp}
-
-# if a bot uses GeneratorFactory, the module should include the line
-# docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
-# and include the marker &params; in the module's docstring
-
-
-class GeneratorFactory(object):
-    """Process command line arguments and return appropriate page generator."""
-
- def setCategoryGen(self, arg, length, recurse = False):
- if len(arg) == length:
- categoryname = pywikibot.input(u'Please enter the category name:')
- else:
- categoryname = arg[length + 1:]
-
- ind = categoryname.find('|')
- if ind > 0:
- startfrom = categoryname[ind + 1:]
- categoryname = categoryname[:ind]
- else:
- startfrom = None
-
- cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
- return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse)
-
- def setSubCategoriesGen(self, arg, length, recurse=False):
- if len(arg) == length:
- categoryname = pywikibot.input(u'Please enter the category name:')
- else:
- categoryname = arg[length + 1:]
-
- ind = categoryname.find('|')
- if ind > 0:
- startfrom = categoryname[ind + 1:]
- categoryname = categoryname[:ind]
- else:
- startfrom = None
-
- cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
- return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse)
-
- def handleArg(self, arg):
- gen = None
- if arg.startswith('-filelinks'):
- fileLinksPageTitle = arg[11:]
- if not fileLinksPageTitle:
- fileLinksPageTitle = pywikibot.input(
- u'Links to which image page should be processed?')
- if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6)
- + ":"):
- fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
- fileLinksPageTitle)
- else:
- fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
- 'Image:' +
- fileLinksPageTitle)
- gen = FileLinksGenerator(fileLinksPage)
- elif arg.startswith('-unusedfiles'):
- if len(arg) == 12:
- gen = UnusedFilesGenerator()
- else:
- gen = UnusedFilesGenerator(number = int(arg[13:]))
- elif arg.startswith('-unwatched'):
- if len(arg) == 10:
- gen = UnwatchedPagesPageGenerator()
- else:
- gen = UnwatchedPagesPageGenerator(number = int(arg[11:]))
- elif arg.startswith('-usercontribs'):
- gen = UserContributionsGenerator(arg[14:])
- elif arg.startswith('-withoutinterwiki'):
- if len(arg) == 17:
- gen = WithoutInterwikiPageGenerator()
- else:
- gen = WithoutInterwikiPageGenerator(number = int(arg[18:]))
- elif arg.startswith('-interwiki'):
- title = arg[11:]
- if not title:
- title = pywikibot.input(u'Which page should be processed?')
- page = pywikibot.Page(pywikibot.Site(), title)
- gen = InterwikiPageGenerator(page)
- elif arg.startswith('-file'):
- textfilename = arg[6:]
- if not textfilename:
- textfilename = pywikibot.input(
- u'Please enter the local file name:')
- gen = TextfilePageGenerator(textfilename)
- elif arg.startswith('-catr'):
- gen = self.setCategoryGen(arg, 5, recurse = True)
- elif arg.startswith('-cat'):
- gen = self.setCategoryGen(arg, 4)
- elif arg.startswith('-subcatsr'):
- gen = self.setSubCategoriesGen(arg, 9, recurse = True)
- elif arg.startswith('-subcats'):
- gen = self.setSubCategoriesGen(arg, 8)
- elif arg.startswith('-uncatfiles'):
- gen = UnCategorizedImageGenerator()
- elif arg.startswith('-uncatcat'):
- gen = UnCategorizedCategoryGenerator()
- elif arg.startswith('-uncat'):
- gen = UnCategorizedPageGenerator()
- elif arg.startswith('-ref'):
- referredPageTitle = arg[5:]
- if not referredPageTitle:
- referredPageTitle = pywikibot.input(
- u'Links to which page should be processed?')
- referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle)
- gen = ReferringPageGenerator(referredPage)
- elif arg.startswith('-links'):
- linkingPageTitle = arg[7:]
- if not linkingPageTitle:
- linkingPageTitle = pywikibot.input(
- u'Links from which page should be processed?')
- linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle)
- gen = LinkedPageGenerator(linkingPage)
- elif arg.startswith('-weblink'):
- url = arg[9:]
- if not url:
- url = pywikibot.input(
- u'Pages with which weblink should be processed?')
- gen = LinksearchPageGenerator(url)
- elif arg.startswith('-transcludes'):
- transclusionPageTitle = arg[len('-transcludes:'):]
- if not transclusionPageTitle:
- transclusionPageTitle = pywikibot.input(
- u'Pages that transclude which page should be processed?')
- transclusionPage = pywikibot.Page(pywikibot.Site(),
- 'Template:%s' % transclusionPageTitle)
- gen = ReferringPageGenerator(transclusionPage,
- onlyTemplateInclusion=True)
- elif arg.startswith('-start'):
- if arg.startswith('-startxml'):
- pywikibot.output(u'-startxml : wrong parameter')
- raise ValueError
- firstPageTitle = arg[7:]
- if not firstPageTitle:
- firstPageTitle = pywikibot.input(
- u'At which page do you want to start?')
- namespace = pywikibot.Page(pywikibot.Site(),
- firstPageTitle).namespace()
- firstPageTitle = pywikibot.Page(pywikibot.link(firstPageTitle)
- ).titleWithoutNamespace()
- gen = AllpagesPageGenerator(firstPageTitle, namespace,
- includeredirects=False)
- elif arg.startswith('-prefixindex'):
- prefix = arg[13:]
- namespace = None
- if not prefix:
- prefix = pywikibot.input(
- u'What page names are you looking for?')
- gen = PrefixingPageGenerator(prefix=prefix)
- elif arg.startswith('-newimages'):
- limit = arg[11:] or pywikibot.input(
- u'How many images do you want to load?')
- gen = NewimagesPageGenerator(number=int(limit))
- elif arg.startswith('-new'):
- if len(arg) >=5:
- gen = NewpagesPageGenerator(number=int(arg[5:]))
- else:
- gen = NewpagesPageGenerator(number=60)
- elif arg.startswith('-imagelinks'):
- imagelinkstitle = arg[len('-imagelinks:'):]
- if not imagelinkstitle:
- imagelinkstitle = pywikibot.input(
- u'Images on which page should be processed?')
- imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle))
- gen = ImagesPageGenerator(imagelinksPage)
- elif arg.startswith('-search'):
- mediawikiQuery = arg[8:]
- if not mediawikiQuery:
- mediawikiQuery = pywikibot.input(
- u'What do you want to search for?')
- # In order to be useful, all namespaces are required
- gen = SearchPageGenerator(mediawikiQuery, namespaces = [])
- elif arg.startswith('-google'):
- gen = GoogleSearchPageGenerator(arg[8:])
- elif arg.startswith('-titleregex'):
- if len(arg) == 6:
- regex = pywikibot.input(
- u'What page names are you looking for?')
- else:
- regex = arg[7:]
- gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex)
- elif arg.startswith('-yahoo'):
- gen = YahooSearchPageGenerator(arg[7:])
- else:
- return None
- # make sure all yielded pages are unique
- gen = DuplicateFilterPageGenerator(gen)
- return gen
-
-
-class ThreadedGenerator(threading.Thread):
- """Look-ahead generator class.
-
- Runs a generator in a separate thread and queues the results; can
- be called like a regular generator.
-
- Subclasses should override self.generator, _not_ self.run
-
- Important: the generator thread will stop itself if the generator's
- internal queue is exhausted; but, if the calling program does not use
- all the generated values, it must call the generator's stop() method to
- stop the background thread. Example usage:
-
- >>> gen = ThreadedGenerator(target=foo)
- >>> try:
- ... for data in gen:
- ... do_work(data)
- ... finally:
- ... gen.stop()
-
- """ #NOT CURRENTLY USED: Intended for future development
-
- def __init__(self, group=None, target=None, name="GeneratorThread",
- args=(), kwargs=None, qsize=65536):
-        """Constructor. Takes same keyword arguments as threading.Thread.
-
- target must be a generator function (or other callable that returns
- an iterable object).
-
- @param qsize: The size of the lookahead queue. The larger the qsize,
- the more values will be computed in advance of use (which can eat
- up memory and processor time).
- @type qsize: int
-
- """
- if kwargs is None:
- kwargs = {}
- if target:
- self.generator = target
- if not hasattr(self, "generator"):
- raise RuntimeError("No generator for ThreadedGenerator to run.")
- self.args, self.kwargs = args, kwargs
- threading.Thread.__init__(self, group=group, name=name)
- self.queue = Queue.Queue(qsize)
- self.finished = threading.Event()
-
- def __iter__(self):
- """Iterate results from the queue."""
- if not self.isAlive() and not self.finished.isSet():
- self.start()
- # if there is an item in the queue, yield it, otherwise wait
- while not self.finished.isSet():
- try:
- yield self.queue.get(True, 0.25)
- except Queue.Empty:
- pass
- except KeyboardInterrupt:
- self.stop()
-
- def stop(self):
- """Stop the background thread."""
-## if not self.finished.isSet():
-## pywikibot.output("DEBUG: signalling %s to stop." % self)
- self.finished.set()
-
- def run(self):
-        """Run the generator and store the results on the queue."""
- self.__gen = self.generator(*self.args, **self.kwargs)
- for result in self.__gen:
- while True:
- if self.finished.isSet():
-##                pywikibot.output("DEBUG: %s received stop signal." % self)
- return
- try:
- self.queue.put_nowait(result)
- except Queue.Full:
- time.sleep(0.25)
- continue
- break
- # wait for queue to be emptied, then kill the thread
- while not self.finished.isSet() and not self.queue.empty():
- time.sleep(0.25)
- self.stop()
-##        pywikibot.output("DEBUG: %s stopped because generator exhausted." % self)
-
-
-def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True,
- site=None):
- """
- Using the Allpages special page, retrieve all articles' titles, and yield
- page objects.
- If includeredirects is False, redirects are not included. If
- includeredirects equals the string 'only', only redirects are added.
- """
- if site is None:
- site = pywikibot.getSite()
- if includeredirects:
- if includeredirects == 'only':
- filterredir = True
- else:
- filterredir = None
- else:
- filterredir = False
- return site.allpages(start=start, namespace=namespace,
- filterredir=filterredir)
-
-
-def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True,
- site=None):
- if site is None:
- site = pywikibot.Site()
- page = pywikibot.Page(site, prefix)
- if namespace is None:
- namespace = page.namespace()
- title = page.titleWithoutNamespace()
- if includeredirects:
- if includeredirects == 'only':
- filterredir = True
- else:
- filterredir = None
- else:
- filterredir = False
- return site.allpages(prefix=title, namespace=namespace,
- filterredir=filterredir)
-
-
-def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False,
- site=None):
- # API does not (yet) have a newpages function, so this tries to duplicate
- # it by filtering the recentchanges output
- # defaults to namespace 0 because that's how Special:Newpages defaults
- if site is None:
- site = pywikibot.Site()
- return site.recentchanges(limit=number, showredirects=get_redirect,
- changetype="new", namespaces=0)
-
-
-def FileLinksGenerator(referredImagePage):
- return referredImagePage.usingPages()
-
-
-def ImagesPageGenerator(pageWithImages):
- return pageWithImages.imagelinks()
-
-
-def InterwikiPageGenerator(page):
-    """Iterator over all interwiki (non-language) links on a page."""
- for link in page.interwiki():
- yield pywikibot.Page(link)
-
-
-def LanguageLinksPageGenerator(page):
-    """Iterator over all interwiki language links on a page."""
- for link in page.langlinks():
- yield pywikibot.Page(link)
-
-
-def ReferringPageGenerator(referredPage, followRedirects=False,
- withTemplateInclusion=True,
- onlyTemplateInclusion=False):
- '''Yields all pages referring to a specific page.'''
- return referredPage.getReferences(
- follow_redirects=followRedirects,
- withTemplateInclusion=withTemplateInclusion,
- onlyTemplateInclusion=onlyTemplateInclusion)
-
-
-def CategorizedPageGenerator(category, recurse=False, start=None):
- '''Yield all pages in a specific category.
-
- If recurse is True, pages in subcategories are included as well; if
- recurse is an int, only subcategories to that depth will be included
- (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
- not go any further).
- If start is a string value, only pages whose sortkey comes after start
- alphabetically are included.
-
- ''' # TODO: page generator could be modified to use cmstartsortkey ...
- for a in category.articles(recurse=recurse):
- if start is None or a.title(withNamespace=False) >= start:
- yield a
-
-
-def SubCategoriesPageGenerator(category, recurse=False, start=None):
- '''Yields all subcategories in a specific category.
-
- If recurse is True, pages in subcategories are included as well; if
- recurse is an int, only subcategories to that depth will be included
- (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
- not go any further).
- If start is a string value, only categories whose sortkey comes after
- start alphabetically are included.
-
- ''' # TODO: page generator could be modified to use cmstartsortkey ...
- for s in category.subcategories(recurse=recurse):
- if start is None or s.title(withNamespace=False) >= start:
- yield s
-
-
-def LinkedPageGenerator(linkingPage):
- """Yields all pages linked from a specific page."""
- return linkingPage.linkedPages()
-
-
-def TextfilePageGenerator(filename=None, site=None):
- """Iterate pages from a list in a text file.
-
- The file must contain page links between double-square-brackets. The
- generator will yield each corresponding Page object.
-
- @param filename: the name of the file that should be read. If no name is
- given, the generator prompts the user.
- @param site: the default Site for which Page objects should be created
-
- """
- if filename is None:
- filename = pywikibot.input(u'Please enter the filename:')
- if site is None:
- site = pywikibot.Site()
- f = codecs.open(filename, 'r', config.textfile_encoding)
- for linkmatch in pywikibot.link_regex.finditer(f.read()):
- # If the link is in interwiki format, the Page object may reside
- # on a different Site than the default.
- # This makes it possible to work on different wikis using a single
- # text file, but also could be dangerous because you might
- # inadvertently change pages on another wiki!
- yield pywikibot.Page(pywikibot.Link(linkmatch.groups("title"), site))
- f.close()
-
-
-def PagesFromTitlesGenerator(iterable, site=None):
-    """Generate pages from the titles (unicode strings) yielded by iterable."""
- if site is None:
- site = pywikibot.Site()
- for title in iterable:
- if not isinstance(title, basestring):
- break
- yield pywikibot.Page(pywikibot.Link(title, site))
-
-
-def UserContributionsGenerator(username, number=250, namespaces=None,
- site=None):
- """Yields number unique pages edited by user:username
- namespaces : list of namespace numbers to fetch contribs from
-
- """
- if site is None:
- site = pywikibot.Site()
- return site.usercontribs(user=username, limit=number, namespaces=namespaces)
-
-
-def NamespaceFilterPageGenerator(generator, namespaces, site=None):
- """
- Wraps around another generator. Yields only those pages that are in one
- of the given namespaces.
-
- The namespace list can contain both integers (namespace numbers) and
- strings/unicode strings (namespace names).
-
- """
- if site is None:
- site = pywikibot.Site()
- # convert namespace names to namespace numbers
- for i in xrange(len(namespaces)):
- ns = namespaces[i]
- if isinstance(ns, basestring):
- index = site.getNamespaceIndex(ns)
- if index is None:
- raise ValueError(u'Unknown namespace: %s' % ns)
- namespaces[i] = index
- for page in generator:
- if page.namespace() in namespaces:
- yield page
-
-
-def RedirectFilterPageGenerator(generator):
-    """Yields pages from another generator that are not redirects."""
- for page in generator:
- if not page.isRedirectPage():
- yield page
-
-
-def DuplicateFilterPageGenerator(generator):
-    """Yield all unique pages from another generator, omitting duplicates."""
- seenPages = {}
- for page in generator:
- if page not in seenPages:
- seenPages[page] = None
- yield page
-
-
-def RegexFilterPageGenerator(generator, regex):
-    """Yield pages from another generator whose titles match regex."""
- reg = re.compile(regex, re.I)
- for page in generator:
- if reg.match(page.titleWithoutNamespace()):
- yield page
-
-
-def CombinedPageGenerator(generators):
- return itertools.chain(*generators)
-
-
-def CategoryGenerator(generator):
- """Yield pages from another generator as Category objects.
-
- Makes sense only if it is ascertained that only categories are being
- retrieved.
-
- """
- for page in generator:
- yield pywikibot.Category(page)
-
-
-def PageWithTalkPageGenerator(generator):
- """
- Wraps around another generator. Yields the same pages, but for non-talk
- pages, it also includes associated talk pages.
- This generator does not check if the talk page in fact exists.
- """
- for page in generator:
- yield page
- if not page.isTalkPage():
- yield page.toggleTalkPage()
-
-
-def PreloadingGenerator(generator, pageNumber=60, lookahead=10):
-    """Yield preloaded pages taken from another generator."""
-
- # pages may be on more than one site, for example if an interwiki
- # generator is used, so use a separate preloader for each site
- sites = {}
- # build a list of pages for each site found in the iterator
- for page in generator:
- sites.setdefault(page.site(), []).append(page)
- return itertools.chain(*(site.preloadpages(sites[site], pageNumber)
- for site in sites))
-
-
-#TODO below
-
-def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None):
- if site is None:
- site = pywikibot.Site()
- for page in site.unusedfiles(number=number, repeat=repeat,
- extension=extension):
- yield pywikibot.ImagePage(page.site(), page.title())
-
-def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None):
- if site is None:
- site = pywikibot.Site()
- for page in site.withoutinterwiki(number=number, repeat=repeat):
- yield page
-
-def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.uncategorizedcategories(number=number, repeat=repeat):
- yield page
-
-def UnCategorizedImageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.uncategorizedimages(number=number, repeat=repeat):
- yield page
-
-def NewimagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.newimages(number, repeat=repeat):
- yield page[0]
-
-def UnCategorizedPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.uncategorizedpages(number=number, repeat=repeat):
- yield page
-
-def LonelyPagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.lonelypages(number=number, repeat=repeat):
- yield page
-
-def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.unwatchedpages(number=number, repeat=repeat):
- yield page
-
-def AncientPagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.ancientpages(number=number, repeat=repeat):
- yield page[0]
-
-def DeadendPagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.deadendpages(number=number, repeat=repeat):
- yield page
-
-def LongPagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.longpages(number=number, repeat=repeat):
- yield page[0]
-
-def ShortPagesPageGenerator(number = 100, repeat = False, site = None):
- if site is None:
- site = pywikibot.Site()
- for page in site.shortpages(number=number, repeat=repeat):
- yield page[0]
-
-def LinksearchPageGenerator(link, step=500, site=None):
- """Yields all pages that include a specified link, according to
- [[Special:Linksearch]].
-
- """
- if site is None:
- site = pywikibot.Site()
- for page in site.linksearch(link, limit=step):
- yield page
-
-def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
- """
- Provides a list of results using the internal MediaWiki search engine
- """
- if site is None:
- site = pywikibot.Site()
- for page in site.search(query, number=number, namespaces = namespaces):
- yield page[0]
-
-class YahooSearchPageGenerator:
- '''
- To use this generator, install pYsearch
- '''
-    def __init__(self, query = None, count = 100, site = None): # values larger than 100 fail
- self.query = query or pywikibot.input(u'Please enter the search query:')
- self.count = count
- if site is None:
- site = pywikibot.Site()
- self.site = site
-
- def queryYahoo(self, query):
- from yahoo.search.web import WebSearch
- srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
-
- dom = srch.get_results()
- results = srch.parse_results(dom)
- for res in results:
- url = res.Url
- yield url
-
- def __iter__(self):
- # restrict query to local site
- localQuery = '%s site:%s' % (self.query, self.site.hostname())
-        base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address(''))
- for url in self.queryYahoo(localQuery):
- if url[:len(base)] == base:
- title = url[len(base):]
- page = pywikibot.Page(self.site, title)
- yield page
-
-class GoogleSearchPageGenerator:
- '''
- To use this generator, you must install the pyGoogle module from
-
http://pygoogle.sf.net/ and get a Google Web API license key from
-
http://www.google.com/apis/index.html . The google_key must be set to your
- license key in your configuration.
- '''
- def __init__(self, query = None, site = None):
- self.query = query or pywikibot.input(u'Please enter the search query:')
- if site is None:
- site = pywikibot.Site()
- self.site = site
-
- #########
-    # partially commented out because it is probably not in compliance with Google's "Terms of
-    # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
- def queryGoogle(self, query):
- #if config.google_key:
- if True:
- #try:
- for url in self.queryViaSoapApi(query):
- yield url
- return
- #except ImportError:
- #pass
- # No google license key, or pygoogle not installed. Do it the ugly way.
- #for url in self.queryViaWeb(query):
- # yield url
-
- def queryViaSoapApi(self, query):
- import google
- google.LICENSE_KEY = config.google_key
- offset = 0
- estimatedTotalResultsCount = None
- while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount:
- while (True):
- # Google often yields 502 errors.
- try:
- pywikibot.output(u'Querying Google, offset %i' % offset)
- data = google.doGoogleSearch(query, start = offset, filter = False)
- break
- except KeyboardInterrupt:
- raise
- except:
- # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
- # can happen here, depending on the module used. It's not easy
- # to catch this properly because pygoogle decides which one of
- # the soap modules to use.
-                    pywikibot.output(u"An error occured. Retrying in 10 seconds...")
- time.sleep(10)
- continue
-
- for result in data.results:
- #print 'DBG: ', result.URL
- yield result.URL
- # give an estimate of pages to work on, but only once.
- if not estimatedTotalResultsCount:
-                pywikibot.output(u'Estimated total result count: %i pages.' % data.meta.estimatedTotalResultsCount)
- estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
- #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
- offset += 10
-
- #########
-    # commented out because it is probably not in compliance with Google's "Terms of
-    # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
-
- #def queryViaWeb(self, query):
- #"""
- #Google has stopped giving out API license keys, and sooner or later
- #they will probably shut down the service.
- #This is a quick and ugly solution: we just grab the search results from
- #the normal web interface.
- #"""
-        #linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE)
- #offset = 0
-
- #while True:
- #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
-            #address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(query), offset)
- ## we fake being Firefox because Google blocks unknown browsers
-            #request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5'})
- #urlfile = urllib2.urlopen(request)
- #page = urlfile.read()
- #urlfile.close()
- #for url in linkR.findall(page):
- #yield url
-                #if "<div id=nn>" in page: # Is there a "Next" link for next page of results?
- #offset += 100 # Yes, go to next page of results.
- #else:
- #return
- #########
-
- def __iter__(self):
- # restrict query to local site
- localQuery = '%s site:%s' % (self.query, self.site.hostname())
-        base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address(''))
- for url in self.queryGoogle(localQuery):
- if url[:len(base)] == base:
- title = url[len(base):]
- page = pywikibot.Page(self.site, title)
-                # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar
- if page.site() == self.site:
- yield page
-
-def MySQLPageGenerator(query, site = None):
- import MySQLdb as mysqldb
- if site is None:
- site = pywikibot.Site()
- conn = mysqldb.connect(config.db_hostname, db = site.dbName(),
- user = config.db_username,
- passwd = config.db_password)
- cursor = conn.cursor()
- pywikibot.output(u'Executing query:\n%s' % query)
- query = query.encode(site.encoding())
- cursor.execute(query)
- while True:
- try:
- namespaceNumber, pageName = cursor.fetchone()
- print namespaceNumber, pageName
- except TypeError:
- # Limit reached or no more results
- break
- #print pageName
- if pageName:
- namespace = site.namespace(namespaceNumber)
- pageName = unicode(pageName, site.encoding())
- if namespace:
- pageTitle = '%s:%s' % (namespace, pageName)
- else:
- pageTitle = pageName
- page = pywikibot.Page(site, pageTitle)
- yield page
-
-def YearPageGenerator(start = 1, end = 2050, site = None):
- if site is None:
- site = pywikibot.Site()
- pywikibot.output(u"Starting with year %i" % start)
- for i in xrange(start, end + 1):
- if i % 100 == 0:
- pywikibot.output(u'Preparing %i...' % i)
- # There is no year 0
- if i != 0:
- current_year = date.formatYear(site.lang, i )
- yield pywikibot.Page(site, current_year)
-
-def DayPageGenerator(startMonth = 1, endMonth = 12, site = None):
- if site is None:
- site = pywikibot.Site()
- fd = date.FormatDate(site)
- firstPage = pywikibot.Page(site, fd(startMonth, 1))
- pywikibot.output(u"Starting with %s" % firstPage.aslink())
- for month in xrange(startMonth, endMonth+1):
- for day in xrange(1, date.getNumberOfDaysInMonth(month)+1):
- yield pywikibot.Page(site, fd(month, day))
-
-
-if __name__ == "__main__":
- try:
- gen = None
- genFactory = GeneratorFactory()
- for arg in pywikibot.handleArgs():
- generator = genFactory.handleArg(arg)
- if generator:
- gen = generator
- if gen:
- for page in gen:
- pywikibot.output(page.title(), toStdout = True)
- else:
- pywikibot.showHelp()
- finally:
- pywikibot.stopme()
+# -*- coding: utf-8 -*-
+"""This module offers a wide variety of page generators. A page generator is an
+object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and
+that yields page objects on which other scripts can then work.
+
+In general, there is no need to run this script directly. It can, however,
+be run for testing purposes. It will then print the page titles to standard
+output.
+
+These parameters are supported to specify which pages titles to print:
+
+&params;
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+
+import itertools
+import Queue
+import re
+import sys
+import threading
+
+
+# ported from version 1 for backwards-compatibility
+# most of these functions just wrap a Site or Page method that returns
+# a generator
+
+parameterHelp = """\
+-cat Work on all pages which are in a specific category.
+ Argument can also be given as "-cat:categoryname" or
+ as "-cat:categoryname|fromtitle".
+
+-catr Like -cat, but also recursively includes pages in
+ subcategories, sub-subcategories etc. of the
+ given category.
+ Argument can also be given as "-catr:categoryname" or
+ as "-catr:categoryname|fromtitle".
+
+-subcats Work on all subcategories of a specific category.
+ Argument can also be given as "-subcats:categoryname" or
+ as "-subcats:categoryname|fromtitle".
+
+-subcatsr Like -subcats, but also includes sub-subcategories etc. of
+ the given category.
+ Argument can also be given as "-subcatsr:categoryname" or
+ as "-subcatsr:categoryname|fromtitle".
+
+-uncat Work on all pages which are not categorised.
+
+-uncatcat Work on all categories which are not categorised.
+
+-uncatfiles Work on all files which are not categorised.
+
+-file Read a list of pages to treat from the named text file.
+ Page titles in the file must be enclosed with [[brackets]].
+ Argument can also be given as "-file:filename".
+
+-filelinks Work on all pages that use a certain image/media file.
+ Argument can also be given as "-filelinks:filename".
+
+-yahoo Work on all pages that are found in a Yahoo search.
+ Depends on python module pYsearch. See yahoo_appid in
+ config.py for instructions.
+
+-search Work on all pages that are found in a MediaWiki search
+ across all namespaces.
+
+-google Work on all pages that are found in a Google search.
+ You need a Google Web API license key. Note that Google
+ doesn't give out license keys anymore. See google_key in
+ config.py for instructions.
+ Argument can also be given as "-google:searchstring".
+
+-interwiki Work on the given page and all equivalent pages in other
+ languages. This can, for example, be used to fight
+ multi-site spamming.
+ Attention: this will cause the bot to modify
+ pages on several wiki sites, this is not well tested,
+ so check your edits!
+
+-links Work on all pages that are linked from a certain page.
+ Argument can also be given as "-links:linkingpagetitle".
+
+-new Work on the 60 newest pages. If given as -new:x, will work
+ on the x newest pages.
+
+-imagelinks Work on all images that are linked from a certain page.
+                  Argument can also be given as "-imagelinks:linkingpagetitle".
+
+-newimages Work on the 100 newest images. If given as -newimages:x,
+ will work on the x newest images.
+
+-ref Work on all pages that link to a certain page.
+ Argument can also be given as "-ref:referredpagetitle".
+
+-start Specifies that the robot should go alphabetically through
+ all pages on the home wiki, starting at the named page.
+ Argument can also be given as "-start:pagetitle".
+
+ You can also include a namespace. For example,
+ "-start:Template:!" will make the bot work on all pages
+ in the template namespace.
+
+-prefixindex Work on pages commencing with a common prefix.
+
+-regex Obsolete, use -titleregex
+
+-titleregex Work on titles that match the given regular expression.
+
+-transcludes Work on all pages that use a certain template.
+ Argument can also be given as "-transcludes:Template:Title".
+
+-unusedfiles Work on all description pages of images/media files that are
+ not used anywhere.
+ Argument can be given as "-unusedfiles:n" where
+ n is the maximum number of articles to work on.
+
+-unwatched Work on all articles that are not watched by anyone.
+ Argument can be given as "-unwatched:n" where
+ n is the maximum number of articles to work on.
+
+-usercontribs Work on all articles that were edited by a certain user :
+ Example : -usercontribs:DumZiBoT
+
+-weblink Work on all articles that contain an external link to
+ a given URL; may be given as "-weblink:url"
+
+-withoutinterwiki Work on all pages that don't have interlanguage links.
+ Argument can be given as "-withoutinterwiki:n" where
+ n is some number (??).
+"""
+
+docuReplacements = {'&params;': parameterHelp}
+
+# if a bot uses GeneratorFactory, the module should include the line
+# docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
+# and include the marker &params; in the module's docstring
+
+
+class GeneratorFactory(object):
+    """Process command line arguments and return appropriate page generator."""
+
+ def setCategoryGen(self, arg, length, recurse = False):
+ if len(arg) == length:
+ categoryname = pywikibot.input(u'Please enter the category name:')
+ else:
+ categoryname = arg[length + 1:]
+
+ ind = categoryname.find('|')
+ if ind > 0:
+ startfrom = categoryname[ind + 1:]
+ categoryname = categoryname[:ind]
+ else:
+ startfrom = None
+
+ cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
+ return CategorizedPageGenerator(cat, start=startfrom, recurse=recurse)
+
+ def setSubCategoriesGen(self, arg, length, recurse=False):
+ if len(arg) == length:
+ categoryname = pywikibot.input(u'Please enter the category name:')
+ else:
+ categoryname = arg[length + 1:]
+
+ ind = categoryname.find('|')
+ if ind > 0:
+ startfrom = categoryname[ind + 1:]
+ categoryname = categoryname[:ind]
+ else:
+ startfrom = None
+
+ cat = pywikibot.Category(pywikibot.Link('Category:%s' % categoryname))
+ return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse)
+
+ def handleArg(self, arg):
+ gen = None
+ if arg.startswith('-filelinks'):
+ fileLinksPageTitle = arg[11:]
+ if not fileLinksPageTitle:
+ fileLinksPageTitle = pywikibot.input(
+ u'Links to which image page should be processed?')
+ if fileLinksPageTitle.startswith(pywikibot.Site().namespace(6)
+ + ":"):
+ fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
+ fileLinksPageTitle)
+ else:
+ fileLinksPage = pywikibot.ImagePage(pywikibot.Site(),
+ 'Image:' +
+ fileLinksPageTitle)
+ gen = FileLinksGenerator(fileLinksPage)
+ elif arg.startswith('-unusedfiles'):
+ if len(arg) == 12:
+ gen = UnusedFilesGenerator()
+ else:
+ gen = UnusedFilesGenerator(number = int(arg[13:]))
+ elif arg.startswith('-unwatched'):
+ if len(arg) == 10:
+ gen = UnwatchedPagesPageGenerator()
+ else:
+ gen = UnwatchedPagesPageGenerator(number = int(arg[11:]))
+ elif arg.startswith('-usercontribs'):
+ gen = UserContributionsGenerator(arg[14:])
+ elif arg.startswith('-withoutinterwiki'):
+ if len(arg) == 17:
+ gen = WithoutInterwikiPageGenerator()
+ else:
+ gen = WithoutInterwikiPageGenerator(number = int(arg[18:]))
+ elif arg.startswith('-interwiki'):
+ title = arg[11:]
+ if not title:
+ title = pywikibot.input(u'Which page should be processed?')
+ page = pywikibot.Page(pywikibot.Site(), title)
+ gen = InterwikiPageGenerator(page)
+ elif arg.startswith('-file'):
+ textfilename = arg[6:]
+ if not textfilename:
+ textfilename = pywikibot.input(
+ u'Please enter the local file name:')
+ gen = TextfilePageGenerator(textfilename)
+ elif arg.startswith('-catr'):
+ gen = self.setCategoryGen(arg, 5, recurse = True)
+ elif arg.startswith('-cat'):
+ gen = self.setCategoryGen(arg, 4)
+ elif arg.startswith('-subcatsr'):
+ gen = self.setSubCategoriesGen(arg, 9, recurse = True)
+ elif arg.startswith('-subcats'):
+ gen = self.setSubCategoriesGen(arg, 8)
+ elif arg.startswith('-uncatfiles'):
+ gen = UnCategorizedImageGenerator()
+ elif arg.startswith('-uncatcat'):
+ gen = UnCategorizedCategoryGenerator()
+ elif arg.startswith('-uncat'):
+ gen = UnCategorizedPageGenerator()
+ elif arg.startswith('-ref'):
+ referredPageTitle = arg[5:]
+ if not referredPageTitle:
+ referredPageTitle = pywikibot.input(
+ u'Links to which page should be processed?')
+ referredPage = pywikibot.Page(pywikibot.Site(), referredPageTitle)
+ gen = ReferringPageGenerator(referredPage)
+ elif arg.startswith('-links'):
+ linkingPageTitle = arg[7:]
+ if not linkingPageTitle:
+ linkingPageTitle = pywikibot.input(
+ u'Links from which page should be processed?')
+ linkingPage = pywikibot.Page(pywikibot.Site(), linkingPageTitle)
+ gen = LinkedPageGenerator(linkingPage)
+ elif arg.startswith('-weblink'):
+ url = arg[9:]
+ if not url:
+ url = pywikibot.input(
+ u'Pages with which weblink should be processed?')
+ gen = LinksearchPageGenerator(url)
+ elif arg.startswith('-transcludes'):
+ transclusionPageTitle = arg[len('-transcludes:'):]
+ if not transclusionPageTitle:
+ transclusionPageTitle = pywikibot.input(
+ u'Pages that transclude which page should be processed?')
+ transclusionPage = pywikibot.Page(pywikibot.Site(),
+ 'Template:%s' % transclusionPageTitle)
+ gen = ReferringPageGenerator(transclusionPage,
+ onlyTemplateInclusion=True)
+ elif arg.startswith('-start'):
+ if arg.startswith('-startxml'):
+ pywikibot.output(u'-startxml : wrong parameter')
+ raise ValueError
+ firstPageTitle = arg[7:]
+ if not firstPageTitle:
+ firstPageTitle = pywikibot.input(
+ u'At which page do you want to start?')
+ namespace = pywikibot.Page(pywikibot.Site(),
+ firstPageTitle).namespace()
+ firstPageTitle = pywikibot.Page(pywikibot.Link(firstPageTitle)
+ ).titleWithoutNamespace()
+ gen = AllpagesPageGenerator(firstPageTitle, namespace,
+ includeredirects=False)
+ elif arg.startswith('-prefixindex'):
+ prefix = arg[13:]
+ namespace = None
+ if not prefix:
+ prefix = pywikibot.input(
+ u'What page names are you looking for?')
+ gen = PrefixingPageGenerator(prefix=prefix)
+ elif arg.startswith('-newimages'):
+ limit = arg[11:] or pywikibot.input(
+ u'How many images do you want to load?')
+ gen = NewimagesPageGenerator(number=int(limit))
+ elif arg.startswith('-new'):
+ if len(arg) >=5:
+ gen = NewpagesPageGenerator(number=int(arg[5:]))
+ else:
+ gen = NewpagesPageGenerator(number=60)
+ elif arg.startswith('-imagelinks'):
+ imagelinkstitle = arg[len('-imagelinks:'):]
+ if not imagelinkstitle:
+ imagelinkstitle = pywikibot.input(
+ u'Images on which page should be processed?')
+ imagelinksPage = pywikibot.Page(pywikibot.Link(imagelinkstitle))
+ gen = ImagesPageGenerator(imagelinksPage)
+ elif arg.startswith('-search'):
+ mediawikiQuery = arg[8:]
+ if not mediawikiQuery:
+ mediawikiQuery = pywikibot.input(
+ u'What do you want to search for?')
+ # In order to be useful, all namespaces are required
+ gen = SearchPageGenerator(mediawikiQuery, namespaces = [])
+ elif arg.startswith('-google'):
+ gen = GoogleSearchPageGenerator(arg[8:])
+ elif arg.startswith('-titleregex'):
+ if len(arg) == 11:
+ regex = pywikibot.input(
+ u'What page names are you looking for?')
+ else:
+ regex = arg[12:]
+ gen = RegexFilterPageGenerator(pywikibot.Site().allpages(), regex)
+ elif arg.startswith('-yahoo'):
+ gen = YahooSearchPageGenerator(arg[7:])
+ else:
+ return None
+ # make sure all yielded pages are unique
+ gen = DuplicateFilterPageGenerator(gen)
+ return gen
+
+
+class ThreadedGenerator(threading.Thread):
+ """Look-ahead generator class.
+
+ Runs a generator in a separate thread and queues the results; can
+ be called like a regular generator.
+
+ Subclasses should override self.generator, _not_ self.run
+
+ Important: the generator thread will stop itself if the generator's
+ internal queue is exhausted; but, if the calling program does not use
+ all the generated values, it must call the generator's stop() method to
+ stop the background thread. Example usage:
+
+ >>> gen = ThreadedGenerator(target=foo)
+ >>> try:
+ ... for data in gen:
+ ... do_work(data)
+ ... finally:
+ ... gen.stop()
+
+ """ #NOT CURRENTLY USED: Intended for future development
+
+ def __init__(self, group=None, target=None, name="GeneratorThread",
+ args=(), kwargs=None, qsize=65536):
+ """Constructor. Takes same keyword arguments as
threading.Thread.
+
+ target must be a generator function (or other callable that returns
+ an iterable object).
+
+ @param qsize: The size of the lookahead queue. The larger the qsize,
+ the more values will be computed in advance of use (which can eat
+ up memory and processor time).
+ @type qsize: int
+
+ """
+ if kwargs is None:
+ kwargs = {}
+ if target:
+ self.generator = target
+ if not hasattr(self, "generator"):
+ raise RuntimeError("No generator for ThreadedGenerator to run.")
+ self.args, self.kwargs = args, kwargs
+ threading.Thread.__init__(self, group=group, name=name)
+ self.queue = Queue.Queue(qsize)
+ self.finished = threading.Event()
+
+ def __iter__(self):
+ """Iterate results from the queue."""
+ if not self.isAlive() and not self.finished.isSet():
+ self.start()
+ # if there is an item in the queue, yield it, otherwise wait
+ while not self.finished.isSet():
+ try:
+ yield self.queue.get(True, 0.25)
+ except Queue.Empty:
+ pass
+ except KeyboardInterrupt:
+ self.stop()
+
+ def stop(self):
+ """Stop the background thread."""
+## if not self.finished.isSet():
+## pywikibot.output("DEBUG: signalling %s to stop." % self)
+ self.finished.set()
+
+ def run(self):
+ """Run the generator and store the results on the
queue."""
+ self.__gen = self.generator(*self.args, **self.kwargs)
+ for result in self.__gen:
+ while True:
+ if self.finished.isSet():
+## pywikibot.output("DEBUG: %s received stop signal." %
self)
+ return
+ try:
+ self.queue.put_nowait(result)
+ except Queue.Full:
+ time.sleep(0.25)
+ continue
+ break
+ # wait for queue to be emptied, then kill the thread
+ while not self.finished.isSet() and not self.queue.empty():
+ time.sleep(0.25)
+ self.stop()
+## pywikibot.output("DEBUG: %s stopped because generator exhausted." %
self)
+
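Editor's note: ThreadedGenerator is flagged above as not currently used, so the following is only a rough usage sketch with a hypothetical slow producer, following the doctest pattern in the class docstring.

    import time

    def slow_numbers(limit):
        # stand-in for a slow, I/O-bound generator
        for i in xrange(limit):
            time.sleep(0.1)
            yield i

    gen = ThreadedGenerator(target=slow_numbers, args=(5,), qsize=2)
    try:
        for value in gen:
            print value
    finally:
        gen.stop()    # required if the consumer stops before exhaustion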
+
+def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True,
+ site=None):
+ """
+ Using the Allpages special page, retrieve all articles' titles, and yield
+ page objects.
+ If includeredirects is False, redirects are not included. If
+ includeredirects equals the string 'only', only redirects are added.
+ """
+ if site is None:
+ site = pywikibot.getSite()
+ if includeredirects:
+ if includeredirects == 'only':
+ filterredir = True
+ else:
+ filterredir = None
+ else:
+ filterredir = False
+ return site.allpages(start=start, namespace=namespace,
+ filterredir=filterredir)
+
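Editor's note: a brief illustration of the generator above; the start value and the five-page cutoff are arbitrary, and the call assumes the configured default site.

    gen = AllpagesPageGenerator(start='A', namespace=0, includeredirects=False)
    for i, page in enumerate(gen):
        if i >= 5:
            break
        pywikibot.output(page.title())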
+
+def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True,
+ site=None):
+ if site is None:
+ site = pywikibot.Site()
+ page = pywikibot.Page(site, prefix)
+ if namespace is None:
+ namespace = page.namespace()
+ title = page.titleWithoutNamespace()
+ if includeredirects:
+ if includeredirects == 'only':
+ filterredir = True
+ else:
+ filterredir = None
+ else:
+ filterredir = False
+ return site.allpages(prefix=title, namespace=namespace,
+ filterredir=filterredir)
+
+
+def NewpagesPageGenerator(number=100, get_redirect=False, repeat=False,
+ site=None):
+ # API does not (yet) have a newpages function, so this tries to duplicate
+ # it by filtering the recentchanges output
+ # defaults to namespace 0 because that's how Special:Newpages defaults
+ if site is None:
+ site = pywikibot.Site()
+ return site.recentchanges(limit=number, showredirects=get_redirect,
+ changetype="new", namespaces=0)
+
+
+def FileLinksGenerator(referredImagePage):
+ return referredImagePage.usingPages()
+
+
+def ImagesPageGenerator(pageWithImages):
+ return pageWithImages.imagelinks()
+
+
+def InterwikiPageGenerator(page):
+ """Iterator over all interwiki (non-language) links on a
page."""
+ for link in page.interwiki():
+ yield pywikibot.Page(link)
+
+
+def LanguageLinksPageGenerator(page):
+ """Iterator over all interwiki language links on a
page."""
+ for link in page.langlinks():
+ yield pywikibot.Page(link)
+
+
+def ReferringPageGenerator(referredPage, followRedirects=False,
+ withTemplateInclusion=True,
+ onlyTemplateInclusion=False):
+ '''Yields all pages referring to a specific page.'''
+ return referredPage.getReferences(
+ follow_redirects=followRedirects,
+ withTemplateInclusion=withTemplateInclusion,
+ onlyTemplateInclusion=onlyTemplateInclusion)
+
+
+def CategorizedPageGenerator(category, recurse=False, start=None):
+ '''Yield all pages in a specific category.
+
+ If recurse is True, pages in subcategories are included as well; if
+ recurse is an int, only subcategories to that depth will be included
+ (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
+ not go any further).
+ If start is a string value, only pages whose sortkey comes after start
+ alphabetically are included.
+
+ ''' # TODO: page generator could be modified to use cmstartsortkey ...
+ for a in category.articles(recurse=recurse):
+ if start is None or a.title(withNamespace=False) >= start:
+ yield a
+
+
+def SubCategoriesPageGenerator(category, recurse=False, start=None):
+ '''Yields all subcategories in a specific category.
+
+ If recurse is True, pages in subcategories are included as well; if
+ recurse is an int, only subcategories to that depth will be included
+ (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
+ not go any further).
+ If start is a string value, only categories whose sortkey comes after
+ start alphabetically are included.
+
+ ''' # TODO: page generator could be modified to use cmstartsortkey ...
+ for s in category.subcategories(recurse=recurse):
+ if start is None or s.title(withNamespace=False) >= start:
+ yield s
+
+
+def LinkedPageGenerator(linkingPage):
+ """Yields all pages linked from a specific page."""
+ return linkingPage.linkedPages()
+
+
+def TextfilePageGenerator(filename=None, site=None):
+ """Iterate pages from a list in a text file.
+
+ The file must contain page links between double-square-brackets. The
+ generator will yield each corresponding Page object.
+
+ @param filename: the name of the file that should be read. If no name is
+ given, the generator prompts the user.
+ @param site: the default Site for which Page objects should be created
+
+ """
+ if filename is None:
+ filename = pywikibot.input(u'Please enter the filename:')
+ if site is None:
+ site = pywikibot.Site()
+ f = codecs.open(filename, 'r', config.textfile_encoding)
+ for linkmatch in pywikibot.link_regex.finditer(f.read()):
+ # If the link is in interwiki format, the Page object may reside
+ # on a different Site than the default.
+ # This makes it possible to work on different wikis using a single
+ # text file, but also could be dangerous because you might
+ # inadvertently change pages on another wiki!
+ yield pywikibot.Page(pywikibot.Link(linkmatch.group("title"), site))
+ f.close()
+
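Editor's note: a sketch of the input TextfilePageGenerator expects; 'pages.txt' is a hypothetical file whose contents might look like the commented lines below.

    # contents of pages.txt (one or more [[wikilinks]] anywhere in the text):
    #   [[Main Page]]
    #   [[Talk:Sandbox]] [[fr:Accueil]]
    for page in TextfilePageGenerator('pages.txt'):
        pywikibot.output(page.title(asLink=True))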
+
+def PagesFromTitlesGenerator(iterable, site=None):
+ """Generate pages from the titles (unicode strings) yielded by
iterable."""
+ if site is None:
+ site = pywikibot.Site()
+ for title in iterable:
+ if not isinstance(title, basestring):
+ break
+ yield pywikibot.Page(pywikibot.Link(title, site))
+
+
+def UserContributionsGenerator(username, number=250, namespaces=None,
+ site=None):
+ """Yields number unique pages edited by user:username
+ namespaces : list of namespace numbers to fetch contribs from
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ return site.usercontribs(user=username, limit=number, namespaces=namespaces)
+
+
+def NamespaceFilterPageGenerator(generator, namespaces, site=None):
+ """
+ Wraps around another generator. Yields only those pages that are in one
+ of the given namespaces.
+
+ The namespace list can contain both integers (namespace numbers) and
+ strings/unicode strings (namespace names).
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ # convert namespace names to namespace numbers
+ for i in xrange(len(namespaces)):
+ ns = namespaces[i]
+ if isinstance(ns, basestring):
+ index = site.getNamespaceIndex(ns)
+ if index is None:
+ raise ValueError(u'Unknown namespace: %s' % ns)
+ namespaces[i] = index
+ for page in generator:
+ if page.namespace() in namespaces:
+ yield page
+
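Editor's note: an illustrative use of the namespace filter above, fed by PagesFromTitlesGenerator (defined earlier in this file); namespace names and numbers can be mixed, as the docstring states.

    source = PagesFromTitlesGenerator(
        [u'Physics', u'Template:Stub', u'Category:Physics'])
    for page in NamespaceFilterPageGenerator(source, [10, u'Category']):
        pywikibot.output(page.title())    # only the Template: and Category: pages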
+
+def RedirectFilterPageGenerator(generator):
+ """Yields pages from another generator that are not
redirects."""
+ for page in generator:
+ if not page.isRedirectPage():
+ yield page
+
+
+def DuplicateFilterPageGenerator(generator):
+ """Yield all unique pages from another generator, omitting
duplicates."""
+ seenPages = {}
+ for page in generator:
+ if page not in seenPages:
+ seenPages[page] = None
+ yield page
+
+
+def RegexFilterPageGenerator(generator, regex):
+ """Yield pages from another generator whose titles match
regex."""
+ reg = re.compile(regex, re.I)
+ for page in generator:
+ if reg.match(page.titleWithoutNamespace()):
+ yield page
+
+
+def CombinedPageGenerator(generators):
+ return itertools.chain(*generators)
+
+
+def CategoryGenerator(generator):
+ """Yield pages from another generator as Category objects.
+
+ Makes sense only if it is ascertained that only categories are being
+ retrieved.
+
+ """
+ for page in generator:
+ yield pywikibot.Category(page)
+
+
+def PageWithTalkPageGenerator(generator):
+ """
+ Wraps around another generator. Yields the same pages, but for non-talk
+ pages, it also includes associated talk pages.
+ This generator does not check if the talk page in fact exists.
+ """
+ for page in generator:
+ yield page
+ if not page.isTalkPage():
+ yield page.toggleTalkPage()
+
+
+def PreloadingGenerator(generator, pageNumber=60, lookahead=10):
+ """Yield preloaded pages taken from another
generator."""
+
+ # pages may be on more than one site, for example if an interwiki
+ # generator is used, so use a separate preloader for each site
+ sites = {}
+ # build a list of pages for each site found in the iterator
+ for page in generator:
+ sites.setdefault(page.site(), []).append(page)
+ return itertools.chain(*(site.preloadpages(sites[site], pageNumber)
+ for site in sites))
+
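Editor's note: a sketch chaining the wrapper generators defined above, preloading the text of unique, non-redirect pages linked from a hypothetical start page.

    start = pywikibot.Page(pywikibot.Site(), u'Main Page')
    gen = PreloadingGenerator(
        RedirectFilterPageGenerator(
            DuplicateFilterPageGenerator(
                LinkedPageGenerator(start))))
    for page in gen:
        pywikibot.output(u"%s: %d characters" % (page.title(), len(page.get())))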
+
+#TODO below
+
+def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.unusedfiles(number=number, repeat=repeat,
+ extension=extension):
+ yield pywikibot.ImagePage(page.site(), page.title())
+
+def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.withoutinterwiki(number=number, repeat=repeat):
+ yield page
+
+def UnCategorizedCategoryGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedcategories(number=number, repeat=repeat):
+ yield page
+
+def UnCategorizedImageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedimages(number=number, repeat=repeat):
+ yield page
+
+def NewimagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.newimages(number, repeat=repeat):
+ yield page[0]
+
+def UnCategorizedPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.uncategorizedpages(number=number, repeat=repeat):
+ yield page
+
+def LonelyPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.lonelypages(number=number, repeat=repeat):
+ yield page
+
+def UnwatchedPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.unwatchedpages(number=number, repeat=repeat):
+ yield page
+
+def AncientPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.ancientpages(number=number, repeat=repeat):
+ yield page[0]
+
+def DeadendPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.deadendpages(number=number, repeat=repeat):
+ yield page
+
+def LongPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.longpages(number=number, repeat=repeat):
+ yield page[0]
+
+def ShortPagesPageGenerator(number = 100, repeat = False, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.shortpages(number=number, repeat=repeat):
+ yield page[0]
+
+def LinksearchPageGenerator(link, step=500, site=None):
+ """Yields all pages that include a specified link, according to
+ [[Special:Linksearch]].
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.linksearch(link, limit=step):
+ yield page
+
+def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
+ """
+ Provides a list of results using the internal MediaWiki search engine
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in site.search(query, number=number, namespaces = namespaces):
+ yield page[0]
+
+class YahooSearchPageGenerator:
+ '''
+ To use this generator, install pYsearch
+ '''
+ def __init__(self, query = None, count = 100, site = None): # values larger than 100 fail
+ self.query = query or pywikibot.input(u'Please enter the search query:')
+ self.count = count
+ if site is None:
+ site = pywikibot.Site()
+ self.site = site
+
+ def queryYahoo(self, query):
+ from yahoo.search.web import WebSearch
+ srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
+
+ dom = srch.get_results()
+ results = srch.parse_results(dom)
+ for res in results:
+ url = res.Url
+ yield url
+
+ def __iter__(self):
+ # restrict query to local site
+ localQuery = '%s site:%s' % (self.query, self.site.hostname())
+ base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address(''))
+ for url in self.queryYahoo(localQuery):
+ if url[:len(base)] == base:
+ title = url[len(base):]
+ page = pywikibot.Page(self.site, title)
+ yield page
+
+class GoogleSearchPageGenerator:
+ '''
+ To use this generator, you must install the pyGoogle module from
+ http://pygoogle.sf.net/ and get a Google Web API license key from
+ http://www.google.com/apis/index.html . The google_key must be set to your
+ license key in your configuration.
+ '''
+ def __init__(self, query = None, site = None):
+ self.query = query or pywikibot.input(u'Please enter the search query:')
+ if site is None:
+ site = pywikibot.Site()
+ self.site = site
+
+ #########
+ # partially commented out because it is probably not in compliance with Google's "Terms of
+ # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
+ def queryGoogle(self, query):
+ #if config.google_key:
+ if True:
+ #try:
+ for url in self.queryViaSoapApi(query):
+ yield url
+ return
+ #except ImportError:
+ #pass
+ # No google license key, or pygoogle not installed. Do it the ugly way.
+ #for url in self.queryViaWeb(query):
+ # yield url
+
+ def queryViaSoapApi(self, query):
+ import google
+ google.LICENSE_KEY = config.google_key
+ offset = 0
+ estimatedTotalResultsCount = None
+ while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount:
+ while (True):
+ # Google often yields 502 errors.
+ try:
+ pywikibot.output(u'Querying Google, offset %i' % offset)
+ data = google.doGoogleSearch(query, start = offset, filter = False)
+ break
+ except KeyboardInterrupt:
+ raise
+ except:
+ # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
+ # can happen here, depending on the module used. It's not easy
+ # to catch this properly because pygoogle decides which one of
+ # the soap modules to use.
+ pywikibot.output(u"An error occured. Retrying in 10
seconds...")
+ time.sleep(10)
+ continue
+
+ for result in data.results:
+ #print 'DBG: ', result.URL
+ yield result.URL
+ # give an estimate of pages to work on, but only once.
+ if not estimatedTotalResultsCount:
+ pywikibot.output(u'Estimated total result count: %i pages.' % data.meta.estimatedTotalResultsCount)
+ estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
+ #print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
+ offset += 10
+
+ #########
+ # commented out because it is probably not in compliance with Google's "Terms of
+ # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
+
+ #def queryViaWeb(self, query):
+ #"""
+ #Google has stopped giving out API license keys, and sooner or later
+ #they will probably shut down the service.
+ #This is a quick and ugly solution: we just grab the search results from
+ #the normal web interface.
+ #"""
+ #linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE)
+ #offset = 0
+
+ #while True:
+ #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
+ #address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(query), offset)
+ ## we fake being Firefox because Google blocks unknown browsers
+ #request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5'})
+ #urlfile = urllib2.urlopen(request)
+ #page = urlfile.read()
+ #urlfile.close()
+ #for url in linkR.findall(page):
+ #yield url
+ #if "<div id=nn>" in page: # Is there a "Next" link
for next page of results?
+ #offset += 100 # Yes, go to next page of results.
+ #else:
+ #return
+ #########
+
+ def __iter__(self):
+ # restrict query to local site
+ localQuery = '%s site:%s' % (self.query, self.site.hostname())
+ base = 'http://%s%s' % (self.site.hostname(), self.site.nice_get_address(''))
+ for url in self.queryGoogle(localQuery):
+ if url[:len(base)] == base:
+ title = url[len(base):]
+ page = pywikibot.Page(self.site, title)
+ # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar
+ if page.site() == self.site:
+ yield page
+
+def MySQLPageGenerator(query, site = None):
+ import MySQLdb as mysqldb
+ if site is None:
+ site = pywikibot.Site()
+ conn = mysqldb.connect(config.db_hostname, db = site.dbName(),
+ user = config.db_username,
+ passwd = config.db_password)
+ cursor = conn.cursor()
+ pywikibot.output(u'Executing query:\n%s' % query)
+ query = query.encode(site.encoding())
+ cursor.execute(query)
+ while True:
+ try:
+ namespaceNumber, pageName = cursor.fetchone()
+ print namespaceNumber, pageName
+ except TypeError:
+ # Limit reached or no more results
+ break
+ #print pageName
+ if pageName:
+ namespace = site.namespace(namespaceNumber)
+ pageName = unicode(pageName, site.encoding())
+ if namespace:
+ pageTitle = '%s:%s' % (namespace, pageName)
+ else:
+ pageTitle = pageName
+ page = pywikibot.Page(site, pageTitle)
+ yield page
+
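Editor's note: a sketch of the kind of query the MySQL generator above consumes; the SELECT must return the namespace number first and the title second, as unpacked by cursor.fetchone() above, and the column/table names assume the standard MediaWiki schema. The db_* settings come from the user configuration.

    query = u"""SELECT page_namespace, page_title
    FROM page
    WHERE page_is_redirect = 0
    LIMIT 10"""
    for page in MySQLPageGenerator(query):
        pywikibot.output(page.title())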
+def YearPageGenerator(start = 1, end = 2050, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ pywikibot.output(u"Starting with year %i" % start)
+ for i in xrange(start, end + 1):
+ if i % 100 == 0:
+ pywikibot.output(u'Preparing %i...' % i)
+ # There is no year 0
+ if i != 0:
+ current_year = date.formatYear(site.lang, i )
+ yield pywikibot.Page(site, current_year)
+
+def DayPageGenerator(startMonth = 1, endMonth = 12, site = None):
+ if site is None:
+ site = pywikibot.Site()
+ fd = date.FormatDate(site)
+ firstPage = pywikibot.Page(site, fd(startMonth, 1))
+ pywikibot.output(u"Starting with %s" % firstPage.aslink())
+ for month in xrange(startMonth, endMonth+1):
+ for day in xrange(1, date.getNumberOfDaysInMonth(month)+1):
+ yield pywikibot.Page(site, fd(month, day))
+
+
+if __name__ == "__main__":
+ try:
+ gen = None
+ genFactory = GeneratorFactory()
+ for arg in pywikibot.handleArgs():
+ generator = genFactory.handleArg(arg)
+ if generator:
+ gen = generator
+ if gen:
+ for page in gen:
+ pywikibot.output(page.title(), toStdout = True)
+ else:
+ pywikibot.showHelp()
+ finally:
+ pywikibot.stopme()
Property changes on: branches/rewrite/pywikibot/pagegenerators.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/site.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,2861 +1,2861 @@
- # -*- coding: utf-8 -*-
-"""
-Objects representing MediaWiki sites (wikis) and families (groups of wikis
-on the same topic in different languages).
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-import pywikibot
-from pywikibot import deprecate_arg
-from pywikibot import config
-from pywikibot.throttle import Throttle
-from pywikibot.data import api
-from pywikibot.exceptions import *
-
-try:
- from hashlib import md5
-except ImportError:
- from md5 import md5
-import logging
-import os
-import re
-import sys
-import threading
-import urllib
-
-logger = logging.getLogger("wiki")
-
-class PageInUse(pywikibot.Error):
- """Page cannot be reserved for writing due to existing
lock."""
-
-
-def Family(fam=None, fatal=True):
- """Import the named family.
-
- @param fam: family name (if omitted, uses the configured default)
- @type fam: str
- @param fatal: if True, the bot will stop running if the given family is
- unknown. If False, it will only raise a ValueError exception.
- @param fatal: bool
- @return: a Family instance configured for the named family.
-
- """
- if fam == None:
- fam = config.family
- try:
- # first try the built-in families
- exec "import pywikibot.families.%s_family as myfamily" % fam
- except ImportError:
- # next see if user has defined a local family module
- try:
- sys.path.append(config.datafilepath('families'))
- exec "import %s_family as myfamily" % fam
- except ImportError:
- if fatal:
- logger.exception(u"""\
-Error importing the %s family. This probably means the family
-does not exist. Also check your configuration file."""
- % fam)
- sys.exit(1)
- else:
- raise Error("Family %s does not exist" % fam)
- return myfamily.Family()
-
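Editor's note: illustrative calls to the Family() factory above; 'wikipedia' is assumed to be one of the built-in family files, and the fatal flag behaves as documented in the docstring.

    default_family = Family()                     # uses config.family
    wp_family = Family('wikipedia', fatal=False)  # raises instead of exiting if unknown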
-
-class BaseSite(object):
- """Site methods that are independent of the communication
interface."""
- # to implement a specific interface, define a Site class that inherits
- # from this
-
- def __init__(self, code, fam=None, user=None, sysop=None):
- """
- @param code: the site's language code
- @type code: str
- @param fam: wiki family name (optional)
- @type fam: str or Family
- @param user: bot user name (optional)
- @type user: str
- @param sysop: sysop account user name (optional)
- @type sysop: str
-
- """
- self.__code = code.lower()
- if isinstance(fam, basestring) or fam is None:
- self.__family = Family(fam, fatal=False)
- else:
- self.__family = fam
-
- # if we got an outdated language code, use the new one instead.
- if self.__family.obsolete.has_key(self.__code):
- if self.__family.obsolete[self.__code] is not None:
- self.__code = self.__family.obsolete[self.__code]
- else:
- # no such language anymore
- raise NoSuchSite("Language %s in family %s is obsolete"
- % (self.__code, self.__family.name))
- if self.__code not in self.languages():
- if self.__code == 'zh-classic' and 'zh-classical' in self.languages():
- self.__code = 'zh-classical'
- # database hack (database is varchar[10] -> zh-classical
- # is cut to zh-classic.
- else:
- raise NoSuchSite("Language %s does not exist in family %s"
- % (self.__code, self.__family.name))
-
- self._username = [user, sysop]
-
- # following are for use with lock_page and unlock_page methods
- self._pagemutex = threading.Lock()
- self._locked_pages = []
-
- @property
- def throttle(self):
- """Return this Site's throttle. Initialize a new one if
needed."""
-
- if not hasattr(self, "_throttle"):
- self._throttle = Throttle(self, multiplydelay=True,
- verbosedelay=True)
- try:
- self.login(False)
- except pywikibot.NoUsername:
- pass
- return self._throttle
-
- @property
- def family(self):
- """The Family object for this Site's wiki
family."""
-
- return self.__family
-
- @property
- def code(self):
- """The identifying code for this Site."""
-
- return self.__code
-
- @property
- def lang(self):
- """The ISO language code for this Site.
-
- Presumed to be equal to the wiki prefix, but this can be overridden.
-
- """
- return self.__code
-
- def __cmp__(self, other):
- """Perform equality and inequality tests on Site
objects."""
-
- if not isinstance(other, BaseSite):
- return 1
- if self.family == other.family:
- return cmp(self.code, other.code)
- return cmp(self.family.name, other.family.name)
-
- def user(self):
- """Return the currently-logged in bot user, or
None."""
-
- if self.logged_in(True):
- return self._username[True]
- elif self.logged_in(False):
- return self._username[False]
- return None
-
- def username(self, sysop = False):
- return self._username[sysop]
-
- def __getattr__(self, attr):
- """Calls to methods not defined in this object are passed to
Family."""
-
- if hasattr(self.__class__, attr):
- return self.__class__.attr
- try:
- method = getattr(self.family, attr)
- f = lambda *args, **kwargs: \
- method(self.code, *args, **kwargs)
- if hasattr(method, "__doc__"):
- f.__doc__ = method.__doc__
- return f
- except AttributeError:
- raise AttributeError("%s instance has no attribute '%s'"
- % (self.__class__.__name__, attr) )
-
- def sitename(self):
- """Return string representing this Site's name and
language."""
-
- return self.family.name+':'+self.code
-
- __str__ = sitename
-
- def __repr__(self):
- return 'Site("%s", "%s")' % (self.code,
self.family.name)
-
- def __hash__(self):
- return hash(repr(self))
-
- def linktrail(self):
- """Return regex for trailing chars displayed as part of a link.
-
- Returns a string, not a compiled regular expression object.
-
- This reads from the family file, and ''not'' from
- [[MediaWiki:Linktrail]], because the MW software currently uses a
- built-in linktrail from its message files and ignores the wiki
- value.
-
- """
- return self.family.linktrail(self.code)
-
- def languages(self):
- """Return list of all valid language codes for this site's
Family."""
-
- return self.family.langs.keys()
-
- def validLanguageLinks(self):
- """Return list of language codes that can be used in interwiki
links."""
-
- nsnames = sum(self.namespaces().values(), [])
- return [l for l in self.languages()
- if l[:1].upper() + l[1:] not in self.namespaces()]
-
- def ns_index(self, namespace):
- """Given a namespace name, return its int index, or None if
invalid."""
-
- for ns in self.namespaces():
- if namespace.lower() in [name.lower()
- for name in self.namespaces()[ns]]:
- return ns
- return None
-
- getNamespaceIndex = ns_index # for backwards-compatibility
-
- def namespaces(self):
- """Return dict of valid namespaces on this
wiki."""
-
- return self._namespaces
-
- def ns_normalize(self, value):
- """Return canonical local form of namespace name.
-
- @param value: A namespace name
- @type value: unicode
-
- """
- index = self.ns_index(value)
- return self.namespace(index)
-
- normalizeNamespace = ns_normalize # for backwards-compatibility
-
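Editor's note: a short illustration of the namespace helpers above; the returned strings depend on the wiki's localized namespace names, so the values in the comments are only what the built-in defaults would give.

    site = pywikibot.Site()
    site.ns_index(u'image')        # 6 (lookup is case-insensitive)
    site.namespace(10)             # u'Template'
    site.ns_normalize(u'image')    # u'Image', the canonical local form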
- def redirect(self, default=True):
- """Return the localized redirect tag for the site.
-
- If default is True, falls back to 'REDIRECT' if the site has no
- special redirect tag.
-
- """
- if default:
- return self.family.redirect.get(self.code, [u"REDIRECT"])[0]
- else:
- return self.family.redirect.get(self.code, None)
-
- def lock_page(self, page, block=True):
- """Lock page for writing. Must be called before writing any
page.
-
- We don't want different threads trying to write to the same page
- at the same time, even to different sections.
-
- @param page: the page to be locked
- @type page: pywikibot.Page
- @param block: if true, wait until the page is available to be locked;
- otherwise, raise an exception if page can't be locked
-
- """
- self._pagemutex.acquire()
- try:
- while page in self._locked_pages:
- if not block:
- raise PageInUse
- time.sleep(.25)
- self._locked_pages.append(page.title(withSection=False))
- finally:
- self._pagemutex.release()
-
- def unlock_page(self, page):
- """Unlock page. Call as soon as a write operation has completed.
-
- @param page: the page to be locked
- @type page: pywikibot.Page
-
- """
- self._pagemutex.acquire()
- try:
- self._locked_pages.remove(page.title(withSection=False))
- finally:
- self._pagemutex.release()
-
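Editor's note: the intended calling pattern for the locking helpers above, sketched with a hypothetical page; the finally clause mirrors the requirement to unlock as soon as the write completes.

    site = pywikibot.Site()
    page = pywikibot.Page(site, u'Project:Sandbox')
    site.lock_page(page, block=True)
    try:
        pass    # perform the actual page write here
    finally:
        site.unlock_page(page)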
- def disambcategory(self):
- """Return Category in which disambig pages are
listed."""
-
- try:
- name = self.namespace(14)+':'+self.family.disambcatname[self.code]
- except KeyError:
- raise Error(u"No disambiguation category name found for %(site)s"
- % {'site': self})
- return pywikibot.Category(pywikibot.Link(name, self))
-
- def linkto(self, title, othersite = None):
- """Return unicode string in the form of a wikilink to
'title'
-
- Use optional Site argument 'othersite' to generate an interwiki link.
-
- """
- logger.debug("Site.linkto() method is deprecated; use pywikibot.Link")
- return pywikibot.Link(title, self).astext(othersite)
-
- def isInterwikiLink(self, s):
- """Return True if s is in the form of an interwiki link.
-
- If a link object constructed using "s" as the link text parses as
- belonging to a different site, this method returns True.
-
- """
- return (pywikibot.Link(s, self).site != self)
-
- def redirectRegex(self):
- """Return a compiled regular expression matching on redirect
pages.
-
- Group 1 in the regex match object will be the target title.
-
- """
- #TODO: is this needed, since the API identifies redirects?
- # (maybe, the API can give false positives)
- default = 'REDIRECT'
- try:
- keywords = set(self.family.redirect[self.code])
- keywords.add(default)
- pattern = r'(?:' + '|'.join(keywords) + ')'
- except KeyError:
- # no localized keyword for redirects
- pattern = r'%s' % default
- # A redirect starts with hash (#), followed by a keyword, then
- # arbitrary stuff, then a wikilink. The wikilink may contain
- # a label, although this is not useful.
- return re.compile(r'\s*#%(pattern)s\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
- % locals(),
- re.IGNORECASE | re.UNICODE | re.DOTALL)
-
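Editor's note: a quick check of the pattern built above (group 1 is the target title, per the docstring); the sample wikitext is made up.

    site = pywikibot.Site()
    m = site.redirectRegex().match(u'#REDIRECT [[Target page|label]]')
    if m:
        pywikibot.output(u'redirect target: ' + m.group(1))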
- # namespace shortcuts for backwards-compatibility
-
- def special_namespace(self):
- return self.namespace(-1)
-
- def image_namespace(self):
- return self.namespace(6)
-
- def mediawiki_namespace(self):
- return self.namespace(8)
-
- def template_namespace(self):
- return self.namespace(10)
-
- def category_namespace(self):
- return self.namespace(14)
-
- def category_namespaces(self):
- return self.namespace(14, all=True)
-
- # site-specific formatting preferences
-
- def category_on_one_line(self):
- """Return True if this site wants all category links on one
line."""
-
- return self.code in self.family.category_on_one_line
-
- def interwiki_putfirst(self):
- """Return list of language codes for ordering of interwiki
links."""
-
- return self.family.interwiki_putfirst.get(self.code, None)
-
- def interwiki_putfirst_doubled(self, list_of_links):
- # TODO: is this even needed? No family in the framework has this
- # dictionary defined!
- if self.lang in self.family.interwiki_putfirst_doubled:
- if len(list_of_links) >= \
- self.family.interwiki_putfirst_doubled[self.lang][0]:
- links2 = [lang.language() for lang in list_of_links]
- result = []
- for lang in self.family.interwiki_putfirst_doubled[self.lang][1]:
- try:
- result.append(list_of_links[links2.index(lang)])
- except ValueError:
- pass
- return result
- else:
- return False
- else:
- return False
-
- def getSite(self, code):
- """Return Site object for language 'code' in this
Family."""
-
- return pywikibot.Site(code=code, fam=self.family, user=self.user)
-
- # deprecated methods for backwards-compatibility
-
- def fam(self):
- """Return Family object for this Site."""
- return self.family
-
- def urlEncode(self, query):
- """DEPRECATED"""
- return urllib.urlencode(query)
-
- def getUrl(self, path, retry=True, sysop=False, data=None,
- compress=True, no_hostname=False, cookie_only=False):
- """DEPRECATED.
-
- Retained for compatibility only. All arguments except path and data
- are ignored.
-
- """
- if data:
- if not isinstance(data, basestring):
- data = urllib.urlencode(data)
- return pywikibot.comms.data.request(self, path, method="PUT",
- body=data)
- else:
- return pywikibot.comms.data.request(self, path)
-
- def postForm(self, address, predata, sysop=False, cookies=None):
- """DEPRECATED"""
- return self.getUrl(address, data=predata)
-
- def postData(self, address, data, contentType=None, sysop=False,
- compress=True, cookies=None):
- """DEPRECATED"""
- return self.getUrl(address, data=data)
-
- # unsupported methods from version 1
-
- def checkCharset(self, charset):
- raise NotImplementedError
- def getToken(self, getalways=True, getagain=False, sysop=False):
- raise NotImplementedError
- def export_address(self):
- raise NotImplementedError
- def move_address(self):
- raise NotImplementedError
- def delete_address(self, s):
- raise NotImplementedError
- def undelete_view_address(self, s, ts=''):
- raise NotImplementedError
- def undelete_address(self):
- raise NotImplementedError
- def protect_address(self, s):
- raise NotImplementedError
- def unprotect_address(self, s):
- raise NotImplementedError
- def put_address(self, s):
- raise NotImplementedError
- def get_address(self, s):
- raise NotImplementedError
- def nice_get_address(self, s):
- raise NotImplementedError
- def edit_address(self, s):
- raise NotImplementedError
- def purge_address(self, s):
- raise NotImplementedError
- def block_address(self):
- raise NotImplementedError
- def unblock_address(self):
- raise NotImplementedError
- def blocksearch_address(self, s):
- raise NotImplementedError
- def linksearch_address(self, s, limit=500, offset=0):
- raise NotImplementedError
- def search_address(self, q, n=50, ns=0):
- raise NotImplementedError
- def allpages_address(self, s, ns = 0):
- raise NotImplementedError
- def log_address(self, n=50, mode = ''):
- raise NotImplementedError
- def newpages_address(self, n=50):
- raise NotImplementedError
- def longpages_address(self, n=500):
- raise NotImplementedError
- def shortpages_address(self, n=500):
- raise NotImplementedError
- def unusedfiles_address(self, n=500):
- raise NotImplementedError
- def categories_address(self, n=500):
- raise NotImplementedError
- def deadendpages_address(self, n=500):
- raise NotImplementedError
- def ancientpages_address(self, n=500):
- raise NotImplementedError
- def lonelypages_address(self, n=500):
- raise NotImplementedError
- def protectedpages_address(self, n=500):
- raise NotImplementedError
- def unwatchedpages_address(self, n=500):
- raise NotImplementedError
- def uncategorizedcategories_address(self, n=500):
- raise NotImplementedError
- def uncategorizedimages_address(self, n=500):
- raise NotImplementedError
- def uncategorizedpages_address(self, n=500):
- raise NotImplementedError
- def unusedcategories_address(self, n=500):
- raise NotImplementedError
- def withoutinterwiki_address(self, n=500):
- raise NotImplementedError
- def references_address(self, s):
- raise NotImplementedError
- def allmessages_address(self):
- raise NotImplementedError
- def upload_address(self):
- raise NotImplementedError
- def double_redirects_address(self, default_limit = True):
- raise NotImplementedError
- def broken_redirects_address(self, default_limit = True):
- raise NotImplementedError
- def login_address(self):
- raise NotImplementedError
- def captcha_image_address(self, id):
- raise NotImplementedError
- def watchlist_address(self):
- raise NotImplementedError
- def contribs_address(self, target, limit=500, offset=''):
- raise NotImplementedError
-
-
-class APISite(BaseSite):
- """API interface to MediaWiki site.
-
- Do not use directly; use pywikibot.Site function.
-
- """
-## Site methods from version 1.0 (as these are implemented in this file,
-## or declared deprecated/obsolete, they will be removed from this list)
-##########
-## cookies: return user's cookies as a string
-##
-## urlEncode: Encode a query to be sent using an http POST request.
-## postForm: Post form data to an address at this site.
-## postData: Post encoded form data to an http address at this site.
-##
-## shared_image_repository: Return tuple of image repositories used by this
-## site.
-## version: Return MediaWiki version string from Family file.
-## versionnumber: Return int identifying the MediaWiki version.
-## live_version: Return version number read from Special:Version.
-## checkCharset(charset): Warn if charset doesn't match family file.
-##
-## linktrail: Return regex for trailing chars displayed as part of a link.
-## disambcategory: Category in which disambiguation pages are listed.
-##
-## Methods that yield Page objects derived from a wiki's Special: pages
-## (note, some methods yield other information in a tuple along with the
-## Pages; see method docs for details) --
-##
-## newpages(): Special:Newpages
-## newimages(): Special:Log&type=upload
-## longpages(): Special:Longpages
-## shortpages(): Special:Shortpages
-## deadendpages(): Special:Deadendpages
-## ancientpages(): Special:Ancientpages
-## lonelypages(): Special:Lonelypages
-## unwatchedpages(): Special:Unwatchedpages (sysop accounts only)
-## uncategorizedcategories(): Special:Uncategorizedcategories (yields
-## Category objects)
-## uncategorizedpages(): Special:Uncategorizedpages
-## uncategorizedimages(): Special:Uncategorizedimages (yields
-## ImagePage objects)
-## unusedcategories(): Special:Unusedcategories (yields Category)
-## unusedfiles(): Special:Unusedimages (yields ImagePage)
-## withoutinterwiki: Special:Withoutinterwiki
-## linksearch: Special:Linksearch
-
- def __init__(self, code, fam=None, user=None, sysop=None):
- BaseSite.__init__(self, code, fam, user, sysop)
- self._namespaces = {
- # these are the MediaWiki built-in names, which always work
- # localized names are loaded later upon accessing the wiki
- # namespace prefixes are always case-insensitive, but the
- # canonical forms are capitalized
- -2: [u"Media"],
- -1: [u"Special"],
- 0: [u""],
- 1: [u"Talk"],
- 2: [u"User"],
- 3: [u"User talk"],
- 4: [u"Project"],
- 5: [u"Project talk"],
- 6: [u"Image"],
- 7: [u"Image talk"],
- 8: [u"MediaWiki"],
- 9: [u"MediaWiki talk"],
- 10: [u"Template"],
- 11: [u"Template talk"],
- 12: [u"Help"],
- 13: [u"Help talk"],
- 14: [u"Category"],
- 15: [u"Category talk"],
- }
- self.sitelock = threading.Lock()
- self._msgcache = {}
- return
-
-# ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__()
- self.nocapitalize = self.__code in self.family.nocapitalize
- # Calculating valid languages took quite long, so we calculate it once
- # in initialization instead of each time it is used.
- self._validlanguages = []
- for language in self.languages():
- if not language[:1].upper() + language[1:] in self.namespaces():
- self._validlanguages.append(language)
-
- def logged_in(self, sysop=False):
- """Return True if logged in with specified privileges, otherwise
False.
-
- @param sysop: if True, require sysop privileges.
-
- """
- if self.userinfo['name'] != self._username[sysop]:
- return False
- return (not sysop) or 'sysop' in self.userinfo['groups']
-
- def loggedInAs(self, sysop = False):
- """Return the current username if logged in, otherwise return
None.
-
- DEPRECATED (use .user() method instead)
-
- """
- logger.debug("Site.loggedInAs() method is deprecated.")
- return self.logged_in(sysop) and self.user()
-
- def login(self, sysop=False):
- """Log the user in if not already logged in."""
- if not hasattr(self, "_siteinfo"):
- self._getsiteinfo()
- # check whether a login cookie already exists for this user
- if hasattr(self, "_userinfo"):
- if self.userinfo['name'] == self._username[sysop]:
- return
- if not self.logged_in(sysop):
- loginMan = api.LoginManager(site=self, sysop=sysop,
- user=self._username[sysop])
- if loginMan.login(retry = True):
- self._username[sysop] = loginMan.username
- if hasattr(self, "_userinfo"):
- del self._userinfo
- self.getuserinfo()
-
- forceLogin = login # alias for backward-compatibility
-
- def getuserinfo(self):
- """Retrieve userinfo from site and store in _userinfo attribute.
-
- self._userinfo will be a dict with the following keys and values:
-
- - id: user id (numeric str)
- - name: username (if user is logged in)
- - anon: present if user is not logged in
- - groups: list of groups (could be empty)
- - rights: list of rights (could be empty)
- - message: present if user has a new message on talk page
- - blockinfo: present if user is blocked (dict)
-
- """
- if (not hasattr(self, "_userinfo")
- or "rights" not in self._userinfo
- or self._userinfo['name']
- != self._username["sysop" in
self._userinfo["groups"]]):
- uirequest = api.Request(
- site=self,
- action="query",
- meta="userinfo",
- uiprop="blockinfo|hasmsg|groups|rights"
- )
- uidata = uirequest.submit()
- assert 'query' in uidata, \
- "API userinfo response lacks 'query' key"
- assert 'userinfo' in uidata['query'], \
- "API userinfo response lacks 'userinfo' key"
- self._userinfo = uidata['query']['userinfo']
- return self._userinfo
-
- userinfo = property(fget=getuserinfo, doc=getuserinfo.__doc__)
-
- def is_blocked(self, sysop=False):
- """Return true if and only if user is blocked.
-
- @param sysop: If true, log in to sysop account (if available)
-
- """
- if not self.logged_in(sysop):
- self.login(sysop)
- return 'blockinfo' in self._userinfo
-
- def isBlocked(self, sysop=False):
- """Deprecated synonym for is_blocked"""
- logger.debug(
- "Site method 'isBlocked' should be changed to
'is_blocked'")
- return self.is_blocked(sysop)
-
- def checkBlocks(self, sysop = False):
- """Check if the user is blocked, and raise an exception if
so."""
- if self.is_blocked(sysop):
- # User blocked
- raise UserBlocked('User is blocked in site %s' % self)
-
- def has_right(self, right, sysop=False):
- """Return true if and only if the user has a specific right.
-
- Possible values of 'right' may vary depending on wiki settings,
- but will usually include:
-
- * Actions: edit, move, delete, protect, upload
- * User levels: autoconfirmed, sysop, bot
-
- """
- if not self.logged_in(sysop):
- self.login(sysop)
- return right.lower() in self._userinfo['rights']
-
- def isAllowed(self, right, sysop=False):
- """Deprecated; retained for
backwards-compatibility"""
- logger.debug("Site.isAllowed() method is deprecated; use has_right()")
- return self.has_right(right, sysop)
-
- def has_group(self, group, sysop=False):
- """Return true if and only if the user is a member of specified
group.
-
- Possible values of 'group' may vary depending on wiki settings,
- but will usually include bot.
-
- """
- if not self.logged_in(sysop):
- self.login(sysop)
- return group.lower() in self._userinfo['groups']
-
- def messages(self, sysop=False):
- """Returns true if the user has new messages, and false
otherwise."""
- if not self.logged_in(sysop):
- self.login(sysop)
- return 'hasmsg' in self._userinfo
-
- def mediawiki_message(self, key):
- """Return the MediaWiki message text for key "key"
"""
- if not key in self._msgcache:
- msg_query = api.QueryGenerator(site=self, meta="allmessages",
- amfilter=key)
- for msg in msg_query:
- if msg['name'] == key and not 'missing' in msg:
- self._msgcache[key] = msg['*']
- break
- else:
- raise KeyError("Site %(self)s has no message
'%(key)s'"
- % locals())
- return self._msgcache[key]
-
- def has_mediawiki_message(self, key):
- """Return True iff this site defines a MediaWiki message for
'key'."""
- try:
- v = self.mediawiki_message(key)
- return True
- except KeyError:
- return False
-
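Editor's note: a sketch of the message helpers above; 'edit' is assumed to be a message key present on a stock MediaWiki install.

    site = pywikibot.Site()
    if site.has_mediawiki_message('edit'):
        pywikibot.output(site.mediawiki_message('edit'))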
- def getcurrenttimestamp(self):
- """Return (Mediawiki) timestamp, {{CURRENTTIMESTAMP}}, the server
time.
-
- Format is yyyymmddhhmmss
-
- """
- r = api.Request(site=self,
- action="parse",
- text="{{CURRENTTIMESTAMP}}")
- result = r.submit()
- return re.search('\d+', result['parse']['text']['*']).group()
-
- def _getsiteinfo(self):
- """Retrieve siteinfo and namespaces from site."""
- sirequest = api.Request(
- site=self,
- action="query",
- meta="siteinfo",
- siprop="general|namespaces|namespacealiases"
- )
- try:
- sidata = sirequest.submit()
- except api.APIError:
- # hack for older sites that don't support 1.12 properties
- # probably should delete if we're not going to support pre-1.12
- sirequest = api.Request(
- site=self,
- action="query",
- meta="siteinfo",
- siprop="general|namespaces"
- )
- sidata = sirequest.submit()
-
- assert 'query' in sidata, \
- "API siteinfo response lacks 'query' key"
- sidata = sidata['query']
- assert 'general' in sidata, \
- "API siteinfo response lacks 'general' key"
- assert 'namespaces' in sidata, \
- "API siteinfo response lacks 'namespaces' key"
- self._siteinfo = sidata['general']
- nsdata = sidata['namespaces']
- for nskey in nsdata:
- ns = int(nskey)
- if ns in self._namespaces:
- if nsdata[nskey]["*"] in self._namespaces[ns]:
- continue
- # this is the preferred form so it goes at front of list
- self._namespaces[ns].insert(0, nsdata[nskey]["*"])
- else:
- self._namespaces[ns] = [nsdata[nskey]["*"]]
- if 'namespacealiases' in sidata:
- aliasdata = sidata['namespacealiases']
- for item in aliasdata:
- if item["*"] in self._namespaces[int(item['id'])]:
- continue
- # this is a less preferred form so it goes at the end
- self._namespaces[int(item['id'])].append(item["*"])
-
- @property
- def siteinfo(self):
- """Site information dict."""
-
- if not hasattr(self, "_siteinfo"):
- self._getsiteinfo()
- return self._siteinfo
-
- def case(self):
- """Return this site's capitalization rule."""
-
- return self.siteinfo['case']
-
- def language(self):
- """Return the code for the language of this
Site."""
-
- return self.siteinfo['lang']
-
- lang = property(fget=language, doc=language.__doc__)
-
- def namespaces(self):
- """Return dict of valid namespaces on this
wiki."""
-
- if not hasattr(self, "_siteinfo"):
- self._getsiteinfo()
- return self._namespaces
-
- def namespace(self, num, all=False):
- """Return string containing local name of namespace
'num'.
-
- If optional argument 'all' is true, return a list of all recognized
- values for this namespace.
-
- """
- if all:
- return self.namespaces()[num]
- return self.namespaces()[num][0]
-
- def live_version(self):
- """Return the 'real' version number found on
[[Special:Version]]
-
- Return value is a tuple (int, int, str) of the major and minor
- version numbers and any other text contained in the version.
-
- """
- versionstring = self.siteinfo['generator']
- m = re.match(r"^MediaWiki ([0-9]+)\.([0-9]+)(.*)$", versionstring)
- if m:
- return (int(m.group(1)), int(m.group(2)), m.group(3))
- else:
- return None
-
- def loadpageinfo(self, page):
- """Load page info from api and save in page
attributes"""
- title = page.title(withSection=False)
- query = api.PropertyGenerator("info", site=self,
- titles=title.encode(self.encoding()),
- inprop="protection")
- for pageitem in query:
- if pageitem['title'] != title:
- raise Error(
- u"loadpageinfo: Query on %s returned data on '%s'"
- % (page, pageitem['title']))
- api.update_page(page, pageitem)
-
- def loadimageinfo(self, page, history=False):
- """Load image info from api and save in page attributes
-
- @param history: if true, return the image's version history
-
- """
- title = page.title(withSection=False)
- query = api.PropertyGenerator("imageinfo", site=self,
- titles=title.encode(self.encoding()),
- iiprop=["timestamp", "user",
"comment",
- "url", "size",
"sha1", "mime",
- "metadata",
"archivename"])
- if history:
- query.request["iilimit"] = "max"
- for pageitem in query:
- if pageitem['title'] != title:
- raise Error(
- u"loadpageinfo: Query on %s returned data on '%s'"
- % (page, pageitem['title']))
- api.update_page(page, pageitem)
- if history:
- return pageitem['imageinfo']
-
- def page_exists(self, page):
- """Return True if and only if page is an existing page on
site."""
- if not hasattr(page, "_pageid"):
- self.loadpageinfo(page)
- return page._pageid > 0
-
- def page_restrictions(self, page):
- """Returns a dictionary reflecting page
protections"""
- if not self.page_exists(page):
- raise NoPage(u'No page %s.' % page)
- if not hasattr(page, "_protection"):
- self.loadpageinfo(page)
- return page._protection
-
- def page_can_be_edited(self, page):
- """
- Returns True if and only if:
- - page is unprotected, and bot has an account for this site, or
- - page is protected, and bot has a sysop account for this site.
-
- """
- rest = self.page_restrictions(page)
- sysop_protected = rest.has_key('edit') and rest['edit'][0] == 'sysop'
- try:
- api.LoginManager(site=self, sysop=sysop_protected)
- except NoUsername:
- return False
- return True
-
- def page_isredirect(self, page):
- """Return True if and only if page is a
redirect."""
- if not hasattr(page, "_redir"):
- self.loadpageinfo(page)
- return bool(page._redir)
-
- def getredirtarget(self, page):
- """Return Page object for the redirect target of
page."""
- if not hasattr(page, "_redir"):
- self.loadpageinfo(page)
- if not page._redir:
- raise pywikibot.IsNotRedirectPage(page.title())
- title = page.title(withSection=False)
- query = api.Request(site=self, action="query",
property="info",
- inprop="protection|talkid|subjectid",
- titles=title.encode(self.encoding()),
- redirects="")
- result = query.submit()
- if "query" not in result or "redirects" not in
result["query"]:
- raise RuntimeError(
- "getredirtarget: No 'redirects' found for page %s."
- % title)
- redirmap = dict((item['from'], item['to'])
- for item in result['query']['redirects'])
- if title not in redirmap:
- raise RuntimeError(
- "getredirtarget: 'redirects' contains no key for page
%s."
- % title)
- if "pages" not in result['query']:
- # no "pages" element indicates a circular redirect
- raise pywikibot.CircularRedirect(redirmap[title])
- for pagedata in result['query']['pages'].values():
- # there should be only one value in 'pages', and it is the target
- if pagedata['title'] not in redirmap.values():
- raise RuntimeError(
- "getredirtarget: target page '%s' not found in
'redirects'"
- % pagedata['title'])
- target = pywikibot.Page(self, pagedata['title'], pagedata['ns'])
- api.update_page(target, pagedata)
- page._redir = target
-
- def preloadpages(self, pagelist, groupsize=60):
- """Return a generator to a list of preloaded pages.
-
- Note that [at least in current implementation] pages may be iterated
- in a different order than in the underlying pagelist.
-
- @param pagelist: an iterable that returns Page objects
- @param groupsize: how many Pages to query at a time
- @type groupsize: int
-
- """
- from pywikibot.tools import itergroup
- for sublist in itergroup(pagelist, groupsize):
- pageids = [str(p._pageid) for p in sublist
- if hasattr(p, "_pageid")
- and p._pageid > 0]
- cache = dict((p.title(withSection=False), p) for p in sublist)
- rvgen = api.PropertyGenerator("revisions|info", site=self)
- rvgen.limit = -1
- if len(pageids) == len(sublist):
- # only use pageids if all pages have them
- rvgen.request["pageids"] = "|".join(pageids)
- else:
- rvgen.request["titles"] = "|".join(cache.keys())
- rvgen.request[u"rvprop"] = \
- u"ids|flags|timestamp|user|comment|content"
- logger.info(u"Retrieving %s pages from %s."
- % (len(cache), self)
- )
- for pagedata in rvgen:
- logger.debug("Preloading %s" % pagedata)
- try:
- if pagedata['title'] not in cache:
- raise Error(
- u"preloadpages: Query returned unexpected title
'%s'"
- % pagedata['title']
- )
- except KeyError:
- logger.debug("No 'title' in %s" % pagedata)
- logger.debug("pageids=%s" % pageids)
- logger.debug("titles=%s" % cache.keys())
- continue
- page = cache[pagedata['title']]
- api.update_page(page, pagedata)
- yield page
-
- def token(self, page, tokentype):
- """Return token retrieved from wiki to allow changing page
content.
-
- @param page: the Page for which a token should be retrieved
- @param tokentype: the type of token (e.g., "edit", "move", "delete");
- see API documentation for full list of types
-
- """
- query = api.PropertyGenerator("info|revisions", site=self,
- titles=page.title(withSection=False),
- intoken=tokentype)
- for item in query:
- if item['title'] != page.title(withSection=False):
- raise Error(
- u"token: Query on page %s returned data on page [[%s]]"
- % (page.title(withSection=False, asLink=True),
- item['title']))
- api.update_page(page, item)
-            logger.debug(str(item))
- return item[tokentype + "token"]
-
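# Sketch: fetching an edit token before saving a page; the Site arguments and
# the page title are illustrative only.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, u'Sandbox')
edit_token = site.token(page, "edit")   # other token types per API docs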
- # following group of methods map more-or-less directly to API queries
-
- def pagebacklinks(self, page, followRedirects=False, filterRedirects=None,
- namespaces=None):
- """Iterate all pages that link to the given page.
-
- @param page: The Page to get links to.
- @param followRedirects: Also return links to redirects pointing to
- the given page.
- @param filterRedirects: If True, only return redirects to the given
- page. If False, only return non-redirect links. If None, return
- both (no filtering).
- @param namespaces: If present, only return links from the namespaces
- in this list.
-
- """
- bltitle = page.title(withSection=False).encode(self.encoding())
- blgen = api.PageGenerator("backlinks", gbltitle=bltitle, site=self)
- if isinstance(namespaces, list):
- blgen.request["gblnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- blgen.request["gblnamespace"] = str(namespaces)
- if filterRedirects is not None:
-            blgen.request["gblfilterredir"] = filterRedirects and "redirects"\
-                                                              or "nonredirects"
- if followRedirects:
-            # bug: see http://bugzilla.wikimedia.org/show_bug.cgi?id=16218
- # links identified by MediaWiki as redirects may not really be,
- # so we have to check each "redirect" page and see if it
- # really redirects to this page
- blgen.request["gblfilterredir"] = "nonredirects"
- redirgen = api.PageGenerator("backlinks", gbltitle=bltitle,
-                                         site=self, gblfilterredir="redirects")
- if "gblnamespace" in blgen.request:
-                redirgen.request["gblnamespace"] = blgen.request["gblnamespace"]
- genlist = [blgen]
- for redir in redirgen:
- if redir.getRedirectTarget() == page:
- genlist.append(
- self.pagebacklinks(
- redir, True, None, namespaces))
- import itertools
- return itertools.chain(*genlist)
- return blgen
-
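# Sketch: iterating non-redirect backlinks in the main namespace with
# pagebacklinks(); Site/Page constructor arguments are placeholders.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, u'Python (programming language)')
for ref in site.pagebacklinks(page, followRedirects=False,
                              filterRedirects=False, namespaces=[0]):
    print ref.title()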
- def page_embeddedin(self, page, filterRedirects=None, namespaces=None):
- """Iterate all pages that embedded the given page as a template.
-
- @param page: The Page to get inclusions for.
- @param filterRedirects: If True, only return redirects that embed
- the given page. If False, only return non-redirect links. If
- None, return both (no filtering).
- @param namespaces: If present, only return links from the namespaces
- in this list.
-
- """
- eititle = page.title(withSection=False).encode(self.encoding())
- eigen = api.PageGenerator("embeddedin", geititle=eititle, site=self)
- if isinstance(namespaces, list):
- eigen.request["geinamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- eigen.request["geinamespace"] = str(namespaces)
- if filterRedirects is not None:
-            eigen.request["geifilterredir"] = filterRedirects and "redirects"\
-                                                              or "nonredirects"
- return eigen
-
- def pagereferences(self, page, followRedirects=False, filterRedirects=None,
- withTemplateInclusion=True, onlyTemplateInclusion=False,
- namespaces=None):
- """Convenience method combining pagebacklinks and
page_embeddedin."""
-
- if onlyTemplateInclusion:
- return self.page_embeddedin(page, namespaces=namespaces)
- if not withTemplateInclusion:
- return self.pagebacklinks(page, followRedirects,
- namespaces=namespaces)
- import itertools
- return itertools.chain(
- self.pagebacklinks(page, followRedirects,
- filterRedirects, namespaces=namespaces),
- self.page_embeddedin(page, filterRedirects,
- namespaces=namespaces)
- )
-
- def pagelinks(self, page, namespaces=None, follow_redirects=False,
- limit=None):
- """Iterate internal wikilinks contained (or transcluded) on page.
-
- @param namespaces: Only iterate pages in these namespaces (default: all)
- @type namespaces: list of ints
- @param follow_redirects: if True, yields the target of any redirects,
- rather than the redirect page
-
- """
- plgen = api.PageGenerator("links", site=self)
- if isinstance(limit, int):
- plgen.limit = limit
- if hasattr(page, "_pageid"):
- plgen.request['pageids'] = str(page._pageid)
- else:
- pltitle = page.title(withSection=False).encode(self.encoding())
- plgen.request['titles'] = pltitle
- if follow_redirects:
- plgen.request['redirects'] = ''
- if isinstance(namespaces, list):
- plgen.request["gplnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- plgen.request["gplnamespace"] = str(namespaces)
- return plgen
-
- @deprecate_arg("withSortKey", None) # Sortkey doesn't work with
generator
- def pagecategories(self, page, withSortKey=None):
- """Iterate categories to which page belongs."""
-
- clgen = api.CategoryPageGenerator("categories", site=self)
- if hasattr(page, "_pageid"):
- clgen.request['pageids'] = str(page._pageid)
- else:
- cltitle = page.title(withSection=False).encode(self.encoding())
- clgen.request['titles'] = cltitle
- return clgen
-
- def pageimages(self, page):
- """Iterate images used (not just linked) on the
page."""
-
- imtitle = page.title(withSection=False).encode(self.encoding())
- imgen = api.ImagePageGenerator("images", titles=imtitle, site=self)
- return imgen
-
- def pagetemplates(self, page, namespaces=None):
- """Iterate templates transcluded (not just linked) on the
page."""
-
- tltitle = page.title(withSection=False).encode(self.encoding())
- tlgen = api.PageGenerator("templates", titles=tltitle, site=self)
- if isinstance(namespaces, list):
- tlgen.request["gtlnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- tlgen.request["gtlnamespace"] = str(namespaces)
- return tlgen
-
- def categorymembers(self, category, namespaces=None, limit=None):
- """Iterate members of specified category.
-
- @param category: The Category to iterate.
- @param namespaces: If present, only return category members from
- these namespaces. For example, use namespaces=[14] to yield
- subcategories, use namespaces=[6] to yield image files, etc. Note,
- however, that the iterated values are always Page objects, even
- if in the Category or Image namespace.
- @type namespaces: list of ints
- @param limit: maximum number of pages to iterate (default: all)
- @type limit: int
-
- """
- if category.namespace() != 14:
- raise Error(
- u"categorymembers: non-Category page '%s' specified"
- % category.title())
- cmtitle = category.title(withSection=False).encode(self.encoding())
- cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle,
- gcmprop="ids|title|sortkey", site=self)
- if isinstance(namespaces, list):
- cmgen.request["gcmnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- cmgen.request["gcmnamespace"] = str(namespaces)
- if isinstance(limit, int):
- cmgen.limit = limit
- return cmgen
-
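# Sketch: listing only the subcategories (namespace 14) of a category, as the
# docstring above suggests; constructing a Category from a Site and a title is
# an assumption based on the pywikibot.Category class used elsewhere in this
# changeset, and the category name is a placeholder.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
cat = pywikibot.Category(site, u'Category:Physics')
for subcat in site.categorymembers(cat, namespaces=[14], limit=20):
    print subcat.title()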
- def loadrevisions(self, page=None, getText=False, revids=None,
- limit=None, startid=None, endid=None, starttime=None,
- endtime=None, rvdir=None, user=None, excludeuser=None,
- section=None, sysop=False):
- """Retrieve and store revision information.
-
- By default, retrieves the last (current) revision of the page,
- I{unless} any of the optional parameters revids, startid, endid,
- starttime, endtime, rvdir, user, excludeuser, or limit are
- specified. Unless noted below, all parameters not specified
- default to False.
-
- If rvdir is False or not specified, startid must be greater than
- endid if both are specified; likewise, starttime must be greater
- than endtime. If rvdir is True, these relationships are reversed.
-
- @param page: retrieve revisions of this Page (required unless ids
- is specified)
- @param getText: if True, retrieve the wiki-text of each revision;
- otherwise, only retrieve the revision metadata (default)
- @param section: if specified, retrieve only this section of the text
- (getText must be True); section must be given by number (top of
- the article is section 0), not name
- @type section: int
- @param revids: retrieve only the specified revision ids (required
- unless page is specified)
- @type revids: list of ints
- @param limit: Retrieve no more than this number of revisions
- @type limit: int
- @param startid: retrieve revisions starting with this revid
- @param endid: stop upon retrieving this revid
- @param starttime: retrieve revisions starting at this timestamp
- @param endtime: stop upon reaching this timestamp
- @param rvdir: if false, retrieve newest revisions first (default);
- if true, retrieve earliest first
- @param user: retrieve only revisions authored by this user
- @param excludeuser: retrieve all revisions not authored by this user
- @param sysop: if True, switch to sysop account (if available) to
- retrieve this page
-
- """
- latest = (revids is None and
- startid is None and
- endid is None and
- starttime is None and
- endtime is None and
- rvdir is None and
- user is None and
- excludeuser is None and
- limit is None) # if True, we are retrieving current revision
-
- # check for invalid argument combinations
- if page is None and revids is None:
- raise ValueError(
- "loadrevisions: either page or revids argument required")
- if (startid is not None or endid is not None) and \
- (starttime is not None or endtime is not None):
- raise ValueError(
-                "loadrevisions: startid/endid combined with starttime/endtime")
- if starttime is not None and endtime is not None:
- if rvdir and starttime >= endtime:
- raise ValueError(
- "loadrevisions: starttime > endtime with rvdir=True")
- if (not rvdir) and endtime >= starttime:
- raise ValueError(
- "loadrevisions: endtime > starttime with rvdir=False")
- if startid is not None and endid is not None:
- if rvdir and startid >= endid:
- raise ValueError(
- "loadrevisions: startid > endid with rvdir=True")
- if (not rvdir) and endid >= startid:
- raise ValueError(
- "loadrevisions: endid > startid with rvdir=False")
-
- # assemble API request
- if revids is None:
- rvtitle = page.title(withSection=False).encode(self.encoding())
- rvgen = api.PropertyGenerator(u"info|revisions", titles=rvtitle,
- site=self)
- else:
- if isinstance(revids, (int, basestring)):
- ids = unicode(revids)
- else:
- ids = u"|".join(unicode(r) for r in revids)
- rvgen = api.PropertyGenerator(u"info|revisions", revids=ids,
- site=self)
- if getText:
- rvgen.request[u"rvprop"] = \
- u"ids|flags|timestamp|user|comment|content"
- if section is not None:
- rvgen.request[u"rvsection"] = unicode(section)
- if latest or "revids" in rvgen.request:
- rvgen.limit = -1 # suppress use of rvlimit parameter
- elif isinstance(limit, int):
- rvgen.limit = limit
- if rvdir:
- rvgen.request[u"rvdir"] = u"newer"
- elif rvdir is not None:
- rvgen.request[u"rvdir"] = u"older"
- if startid:
- rvgen.request[u"rvstartid"] = startid
- if endid:
- rvgen.request[u"rvendid"] = endid
- if starttime:
- rvgen.request[u"rvstart"] = starttime
- if endtime:
- rvgen.request[u"rvend"] = endtime
- if user:
- rvgen.request[u"rvuser"] = user
- elif excludeuser:
- rvgen.request[u"rvexcludeuser"] = excludeuser
- # TODO if sysop: something
- rvgen.continuekey = "revisions"
- for pagedata in rvgen:
- if page is not None:
- if pagedata['title'] != page.title(withSection=False):
- raise Error(
-                        u"loadrevisions: Query on %s returned data on '%s'"
- % (page, pagedata['title']))
- if pagedata.has_key('missing'):
- raise NoPage(u'Page %s does not exist'
- % page.title(asLink=True))
- else:
- page = Page(self, pagedata['title'])
- api.update_page(page, pagedata)
-
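# Sketch: loading the five most recent revisions of a page, including their
# wikitext; the Site arguments and title are placeholders, and page._revisions
# is the private cache that this module itself reads (see rollbackpage below).
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, u'Earth')
site.loadrevisions(page, getText=True, limit=5)
for revid in sorted(page._revisions.keys(), reverse=True):
    print revid, page._revisions[revid].user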
- def pageinterwiki(self, page):
- # No such function in the API (this method isn't called anywhere)
- raise NotImplementedError
-
- def pagelanglinks(self, page):
- """Iterate all interlanguage links on page, yielding Link
objects."""
- lltitle = page.title(withSection=False)
- llquery = api.PropertyGenerator("langlinks",
- titles=lltitle.encode(self.encoding()),
- site=self)
- for pageitem in llquery:
- if pageitem['title'] != lltitle:
- raise Error(
- u"getlanglinks: Query on %s returned data on '%s'"
- % (page, pageitem['title']))
- if 'langlinks' not in pageitem:
- continue
- for linkdata in pageitem['langlinks']:
- yield pywikibot.Link(linkdata['*'],
- source=pywikibot.Site(linkdata['lang']))
-
- def page_extlinks(self, page):
- """Iterate all external links on page, yielding URL
strings."""
- eltitle = page.title(withSection=False)
- elquery = api.PropertyGenerator("extlinks",
- titles=eltitle.encode(self.encoding()),
- site=self)
- for pageitem in elquery:
- if pageitem['title'] != eltitle:
- raise RuntimeError(
- "getlanglinks: Query on %s returned data on '%s'"
- % (page, pageitem['title']))
- if 'extlinks' not in pageitem:
- continue
- for linkdata in pageitem['extlinks']:
- yield linkdata['*']
-
- @deprecate_arg("throttle", None)
- @deprecate_arg("includeredirects", "filterredir")
-    def allpages(self, start="!", prefix="", namespace=0, filterredir=None,
- filterlanglinks=None, minsize=None, maxsize=None,
- protect_type=None, protect_level=None, limit=None,
- reverse=False, includeredirects=None):
- """Iterate pages in a single namespace.
-
- Note: parameters includeRedirects and throttle are deprecated and
- included only for backwards compatibility.
-
- @param start: Start at this title (page need not exist).
- @param prefix: Only yield pages starting with this string.
- @param namespace: Iterate pages from this (single) namespace
- (default: 0)
- @param filterredir: if True, only yield redirects; if False (and not
- None), only yield non-redirects (default: yield both)
- @param filterlanglinks: if True, only yield pages with language links;
- if False (and not None), only yield pages without language links
- (default: yield both)
- @param minsize: if present, only yield pages at least this many
- bytes in size
- @param maxsize: if present, only yield pages at most this many bytes
- in size
- @param protect_type: only yield pages that have a protection of the
- specified type
- @type protect_type: str
- @param protect_level: only yield pages that have protection at this
- level; can only be used if protect_type is specified
- @param limit: maximum number of pages to iterate (default: iterate
- all pages in namespace)
-        @param reverse: if True, iterate in reverse Unicode lexicographic
- order (default: iterate in forward order)
-        @param includeredirects: DEPRECATED, use filterredir instead
-
- """
- if not isinstance(namespace, int):
- raise Error("allpages: only one namespace permitted.")
- if includeredirects is not None:
- logger.debug(
-"allpages: 'includeRedirects' argument is deprecated; use
'filterredirs'.")
- if includeredirects:
- if includeredirects == "only":
- filterredirs = True
- else:
- filterredirs = None
- else:
- filterredirs = False
-
- apgen = api.PageGenerator("allpages", gapnamespace=str(namespace),
- gapfrom=start, site=self)
- if prefix:
- apgen.request["gapprefix"] = prefix
- if filterredir is not None:
- apgen.request["gapfilterredir"] = (filterredir
- and "redirects"
- or "nonredirects")
- if filterlanglinks is not None:
- apgen.request["gapfilterlanglinks"] = (filterlanglinks
- and "withlanglinks"
- or "withoutlanglinks")
- if isinstance(minsize, int):
- apgen.request["gapminsize"] = str(minsize)
- if isinstance(maxsize, int):
- apgen.request["gapmaxsize"] = str(maxsize)
- if isinstance(protect_type, basestring):
- apgen.request["gapprtype"] = protect_type
- if isinstance(protect_level, basestring):
- apgen.request["gapprlevel"] = protect_level
- if isinstance(limit, int):
- apgen.limit = limit
- if reverse:
- apgen.request["gapdir"] = "descending"
- return apgen
-
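# Sketch: iterating the first 25 non-redirect articles whose titles start
# with a given prefix; 'en'/'wikipedia' and the prefix are illustrative.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
for page in site.allpages(prefix=u'Py', namespace=0,
                          filterredir=False, limit=25):
    print page.title()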
- def prefixindex(self, prefix, namespace=0, includeredirects=True):
- """Yield all pages with a given prefix. Deprecated.
-
- Use allpages() with the prefix= parameter instead of this method.
-
- """
- logger.debug("Site.prefixindex() is deprecated; use allpages
instead.")
- return self.allpages(prefix=prefix, namespace=namespace,
- includeredirects=includeredirects)
-
-
-    def alllinks(self, start="!", prefix="", namespace=0, unique=False,
- limit=None, fromids=False):
- """Iterate all links to pages (which need not exist) in one
namespace.
-
- Note that, in practice, links that were found on pages that have
- been deleted may not have been removed from the links table, so this
- method can return false positives.
-
- @param start: Start at this title (page need not exist).
- @param prefix: Only yield pages starting with this string.
- @param namespace: Iterate pages from this (single) namespace
- (default: 0)
- @param unique: If True, only iterate each link title once (default:
- iterate once for each linking page)
- @param limit: maximum number of pages to iterate (default: iterate
- all pages in namespace)
- @param fromids: if True, include the pageid of the page containing
- each link (default: False) as the '_fromid' attribute of the Page;
- cannot be combined with unique
-
- """
- if unique and fromids:
- raise Error("alllinks: unique and fromids cannot both be True.")
- if not isinstance(namespace, int):
- raise Error("alllinks: only one namespace permitted.")
- algen = api.ListGenerator("alllinks", alnamespace=str(namespace),
- alfrom=start, site=self)
- if prefix:
- algen.request["alprefix"] = prefix
- if isinstance(limit, int):
- algen.limit = limit
- if unique:
- algen.request["alunique"] = ""
- if fromids:
- algen.request["alprop"] = "title|ids"
- for link in algen:
- p = pywikibot.Page(self, link['title'], link['ns'])
- if fromids:
- p._fromid = link['fromid']
- yield p
-
- def allcategories(self, start="!", prefix="", limit=None,
- reverse=False):
- """Iterate categories used (which need not have a Category page).
-
- Iterator yields Category objects. Note that, in practice, links that
- were found on pages that have been deleted may not have been removed
- from the database table, so this method can return false positives.
-
- @param start: Start at this category title (category need not exist).
- @param prefix: Only yield categories starting with this string.
- @param limit: maximum number of categories to iterate (default:
- iterate all)
-        @param reverse: if True, iterate in reverse Unicode lexicographic
- order (default: iterate in forward order)
-
- """
- acgen = api.CategoryPageGenerator("allcategories",
- gacfrom=start, site=self)
- if prefix:
- acgen.request["gacprefix"] = prefix
- if isinstance(limit, int):
- acgen.limit = limit
- if reverse:
- acgen.request["gacdir"] = "descending"
- return acgen
-
- def categories(self, number=10, repeat=False):
- """Deprecated; retained for
backwards-compatibility"""
- logger.debug(
- "Site.categories() method is deprecated; use .allcategories()")
- if repeat:
- limit = None
- else:
- limit = number
- return self.allcategories(limit=limit)
-
-    def allusers(self, start="!", prefix="", limit=None, group=None):
- """Iterate registered users, ordered by username.
-
- Iterated values are dicts containing 'name', 'editcount',
-        'registration', and (sometimes) 'groups' keys. 'groups' will be
- present only if the user is a member of at least 1 group, and will
- be a list of unicodes; all the other values are unicodes and should
- always be present.
-
- @param start: start at this username (name need not exist)
- @param prefix: only iterate usernames starting with this substring
- @param limit: maximum number of users to iterate (default: all)
- @param group: only iterate users that are members of this group
- @type group: str
-
- """
- augen = api.ListGenerator("allusers", aufrom=start,
- auprop="editcount|groups|registration",
- site=self)
- if prefix:
- augen.request["auprefix"] = prefix
- if group:
- augen.request["augroup"] = group
- if isinstance(limit, int):
- augen.limit = limit
- return augen
-
-    def allimages(self, start="!", prefix="", minsize=None, maxsize=None,
- limit=None, reverse=False, sha1=None, sha1base36=None):
- """Iterate all images, ordered by image title.
-
- Yields ImagePages, but these pages need not exist on the wiki.
-
- @param start: start at this title (name need not exist)
- @param prefix: only iterate titles starting with this substring
- @param limit: maximum number of titles to iterate (default: all)
- @param minsize: only iterate images of at least this many bytes
- @param maxsize: only iterate images of no more than this many bytes
-        @param reverse: if True, iterate in reverse lexicographic order
- @param sha1: only iterate image (it is theoretically possible there
- could be more than one) with this sha1 hash
- @param sha1base36: same as sha1 but in base 36
-
- """
- aigen = api.ImagePageGenerator("allimages", gaifrom=start,
- site=self)
- if prefix:
- aigen.request["gaiprefix"] = prefix
- if isinstance(limit, int):
- aigen.limit = limit
- if isinstance(minsize, int):
- aigen.request["gaiminsize"] = str(minsize)
- if isinstance(maxsize, int):
- aigen.request["gaimaxsize"] = str(maxsize)
- if reverse:
- aigen.request["gaidir"] = "descending"
- if sha1:
- aigen.request["gaisha1"] = sha1
- if sha1base36:
- aigen.request["gaisha1base36"] = sha1base36
- return aigen
-
- def blocks(self, starttime=None, endtime=None, reverse=False,
- blockids=None, users=None, limit=None):
- """Iterate all current blocks, in order of creation.
-
- Note that logevents only logs user blocks, while this method
- iterates all blocks including IP ranges. The iterator yields dicts
- containing keys corresponding to the block properties (see
-        http://www.mediawiki.org/wiki/API:Query_-_Lists for documentation).
-
- @param starttime: start iterating at this timestamp
- @param endtime: stop iterating at this timestamp
- @param reverse: if True, iterate oldest blocks first (default: newest)
- @param blockids: only iterate blocks with these id numbers
- @param users: only iterate blocks affecting these usernames or IPs
- @param limit: maximum number of blocks to iterate (default: all)
-
- """
- if starttime and endtime:
- if reverse:
- if starttime > endtime:
- raise pywikibot.Error(
- "blocks: starttime must be before endtime with reverse=True")
- else:
- if endtime > starttime:
- raise pywikibot.Error(
- "blocks: endtime must be before starttime with reverse=False")
- bkgen = api.ListGenerator("blocks", site=self)
- bkgen.request["bkprop"] = \
- "id|user|by|timestamp|expiry|reason|range|flags"
- if starttime:
- bkgen.request["bkstart"] = starttime
- if endtime:
- bkgen.request["bkend"] = endtime
- if reverse:
- bkgen.request["bkdir"] = "newer"
- if blockids:
- bkgen.request["bkids"] = blockids
- if users:
- bkgen.request["bkusers"] = users
- if isinstance(limit, int):
- bkgen.limit = limit
- return bkgen
-
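# Sketch: inspecting the ten most recent blocks; the yielded values are dicts
# whose keys follow the bkprop list above, so .get() is used defensively.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
for block in site.blocks(limit=10):
    print block.get('user'), block.get('expiry')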
- def exturlusage(self, url, protocol="http", namespaces=None,
- limit=None):
- """Iterate Pages that contain links to the given URL.
-
- @param url: The URL to search for (without the protocol prefix);
-            this may include a '*' as a wildcard, only at the start of the
- hostname
- @param protocol: The protocol prefix (default: "http")
- @param namespaces: Only iterate pages in these namespaces (default: all)
- @type namespaces: list of ints
- @param limit: Only iterate this many linking pages (default: all)
-
- """
- eugen = api.PageGenerator("exturlusage", geuquery=url,
- geuprotocol=protocol, site=self)
- if isinstance(namespaces, list):
- eugen.request["geunamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- eugen.request["geunamespace"] = str(namespaces)
- if isinstance(limit, int):
- eugen.limit = limit
- return eugen
-
- def imageusage(self, image, namespaces=None, filterredir=None,
- limit=None):
- """Iterate Pages that contain links to the given ImagePage.
-
- @param image: the image to search for (ImagePage need not exist on the wiki)
- @type image: ImagePage
- @param namespaces: Only iterate pages in these namespaces (default: all)
- @type namespaces: list of ints
- @param filterredir: if True, only yield redirects; if False (and not
- None), only yield non-redirects (default: yield both)
- @param limit: Only iterate this many linking pages (default: all)
-
- """
- iugen = api.PageGenerator("imageusage", site=self,
- giutitle=image.title(withSection=False))
- if isinstance(namespaces, list):
- iugen.request["giunamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- iugen.request["giunamespace"] = str(namespaces)
- if isinstance(limit, int):
- iugen.limit = limit
- if filterredir is not None:
-            iugen.request["giufilterredir"] = (filterredir and "redirects"
- or "nonredirects")
- return iugen
-
- def logevents(self, logtype=None, user=None, page=None,
- start=None, end=None, reverse=False, limit=None):
- """Iterate all log entries.
-
- @param logtype: only iterate entries of this type (see wiki
- documentation for available types, which will include "block",
-            "protect", "rights", "delete", "upload", "move", "import",
- "patrol", "merge")
- @param user: only iterate entries that match this user name
- @param page: only iterate entries affecting this page
- @param start: only iterate entries from and after this timestamp
- @param end: only iterate entries up to and through this timestamp
- @param reverse: if True, iterate oldest entries first (default: newest)
- @param limit: only iterate up to this many entries
-
- """
- if start and end:
- if reverse:
- if end < start:
- raise Error(
- "logevents: end must be later than start with reverse=True")
- else:
- if start < end:
- raise Error(
-                        "logevents: start must be later than end with reverse=False")
- legen = api.ListGenerator("logevents", site=self)
- if logtype is not None:
- legen.request["letype"] = logtype
- if user is not None:
- legen.request["leuser"] = user
- if page is not None:
- legen.request["letitle"] = page.title(withSection=False)
- if start is not None:
- legen.request["lestart"] = start
- if end is not None:
- legen.request["leend"] = end
- if reverse:
- legen.request["ledir"] = "newer"
- if isinstance(limit, int):
- legen.limit = limit
- return legen
-
- def recentchanges(self, start=None, end=None, reverse=False, limit=None,
- namespaces=None, pagelist=None, changetype=None,
- showMinor=None, showBot=None, showAnon=None,
- showRedirects=None, showPatrolled=None):
- """Iterate recent changes.
-
- @param start: timestamp to start listing from
- @param end: timestamp to end listing at
- @param reverse: if True, start with oldest changes (default: newest)
- @param limit: iterate no more than this number of entries
- @param namespaces: iterate changes to pages in these namespaces only
- @type namespaces: list of ints
- @param pagelist: iterate changes to pages in this list only
- @param pagelist: list of Pages
- @param changetype: only iterate changes of this type ("edit" for
- edits to existing pages, "new" for new pages, "log" for
log
- entries)
- @param showMinor: if True, only list minor edits; if False (and not
- None), only list non-minor edits
- @param showBot: if True, only list bot edits; if False (and not
- None), only list non-bot edits
- @param showAnon: if True, only list anon edits; if False (and not
- None), only list non-anon edits
- @param showRedirects: if True, only list edits to redirect pages; if
- False (and not None), only list edits to non-redirect pages
- @param showPatrolled: if True, only list patrolled edits; if False
- (and not None), only list non-patrolled edits
-
- """
- if start and end:
- if reverse:
- if end < start:
- raise Error(
- "recentchanges: end must be later than start with reverse=True")
- else:
- if start < end:
- raise Error(
- "recentchanges: start must be later than end with reverse=False")
- rcgen = api.ListGenerator("recentchanges", site=self,
- rcprop="user|comment|timestamp|title|ids"
- "|redirect|patrolled|loginfo|flags")
- if start is not None:
- rcgen.request["rcstart"] = start
- if end is not None:
- rcgen.request["rcend"] = end
- if reverse:
- rcgen.request["rcdir"] = "newer"
- if isinstance(limit, int):
- rcgen.limit = limit
- if isinstance(namespaces, list):
- rcgen.request["rcnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- rcgen.request["rcnamespace"] = str(namespaces)
- if pagelist:
-            rcgen.request["rctitles"] = u"|".join(p.title(withSection=False)
- for p in pagelist)
- if changetype:
- rcgen.request["rctype"] = changetype
- filters = {'minor': showMinor,
- 'bot': showBot,
- 'anon': showAnon,
- 'redirect': showRedirects,
- 'patrolled': showPatrolled}
- rcshow = []
- for item in filters:
- if filters[item] is not None:
- rcshow.append(filters[item] and item or ("!"+item))
- if rcshow:
- rcgen.request["rcshow"] = "|".join(rcshow)
- return rcgen
-
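# Sketch: listing the 20 most recent non-bot edits in the main namespace; the
# Site arguments are placeholders and the printed keys follow rcprop above.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
for change in site.recentchanges(limit=20, namespaces=[0],
                                 changetype="edit", showBot=False):
    print change['timestamp'], change['title']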
- @deprecate_arg("number", "limit")
- def search(self, searchstring, namespaces=None, where="text",
- getredirects=False, limit=None):
- """Iterate Pages that contain the searchstring.
-
- Note that this may include non-existing Pages if the wiki's database
- table contains outdated entries.
-
- @param searchstring: the text to search for
- @type searchstring: unicode
-        @param where: Where to search; value must be "text" or "titles" (many
- wikis do not support title search)
- @param namespaces: search only in these namespaces (defaults to 0)
- @type namespaces: list of ints
- @param getredirects: if True, include redirects in results
- @param limit: maximum number of results to iterate
-
- """
- if not searchstring:
- raise Error("search: searchstring cannot be empty")
- if where not in ("text", "titles"):
-            raise Error("search: unrecognized 'where' value: %s" % where)
- srgen = api.PageGenerator("search", gsrsearch=searchstring,
- gsrwhat=where, site=self)
- if not namespaces:
- logger.warning("search: namespaces cannot be empty; using [0].")
- namespaces = [0]
- if isinstance(namespaces, list):
- srgen.request["gsrnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- else:
- srgen.request["gsrnamespace"] = str(namespaces)
- if getredirects:
- srgen.request["gsrredirects"] = ""
- if isinstance(limit, int):
- srgen.limit = limit
- return srgen
-
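# Sketch: a simple full-text search restricted to the main namespace; the
# search string and Site arguments are placeholders.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
for page in site.search(u"solar eclipse", namespaces=[0], limit=10):
    print page.title()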
- def usercontribs(self, user=None, userprefix=None, start=None, end=None,
- reverse=False, limit=None, namespaces=None,
- showMinor=None):
- """Iterate contributions by a particular user.
-
- Iterated values are in the same format as recentchanges.
-
- @param user: Iterate contributions by this user (name or IP)
- @param userprefix: Iterate contributions by all users whose names
- or IPs start with this substring
- @param start: Iterate contributions starting at this timestamp
- @param end: Iterate contributions ending at this timestamp
- @param reverse: Iterate oldest contributions first (default: newest)
- @param limit: Maximum number of contributions to iterate
- @param namespaces: Only iterate contributions in these namespaces
- @type namespaces: list of ints
- @param showMinor: if True, iterate only minor edits; if False and
- not None, iterate only non-minor edits (default: iterate both)
-
- """
- if not (user or userprefix):
- raise Error(
- "usercontribs: either user or userprefix must be non-empty")
- if start and end:
- if reverse:
- if end < start:
- raise Error(
-                        "usercontribs: end must be later than start with reverse=True")
- else:
- if start < end:
- raise Error(
-                        "usercontribs: start must be later than end with reverse=False")
- ucgen = api.ListGenerator("usercontribs", site=self,
- ucprop="ids|title|timestamp|comment|flags")
- if user:
- ucgen.request["ucuser"] = user
- if userprefix:
- ucgen.request["ucuserprefix"] = userprefix
- if start is not None:
- ucgen.request["ucstart"] = start
- if end is not None:
- ucgen.request["ucend"] = end
- if reverse:
- ucgen.request["ucdir"] = "newer"
- if isinstance(limit, int):
- ucgen.limit = limit
- if isinstance(namespaces, list):
- ucgen.request["ucnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- ucgen.request["ucnamespace"] = str(namespaces)
- if showMinor is not None:
-            ucgen.request["ucshow"] = showMinor and "minor" or "!minor"
- return ucgen
-
- def watchlist_revs(self, start=None, end=None, reverse=False,
- namespaces=None, showMinor=None, showBot=None,
- showAnon=None, limit=None):
- """Iterate revisions to pages on the bot user's watchlist.
-
- Iterated values will be in same format as recentchanges.
-
- @param start: Iterate revisions starting at this timestamp
- @param end: Iterate revisions ending at this timestamp
- @param reverse: Iterate oldest revisions first (default: newest)
- @param namespaces: only iterate revisions to pages in these
- namespaces (default: all)
- @type namespaces: list of ints
- @param showMinor: if True, only list minor edits; if False (and not
- None), only list non-minor edits
- @param showBot: if True, only list bot edits; if False (and not
- None), only list non-bot edits
- @param showAnon: if True, only list anon edits; if False (and not
- None), only list non-anon edits
- @param limit: Maximum number of revisions to iterate
-
- """
- if start and end:
- if reverse:
- if end < start:
- raise Error(
- "watchlist_revs: end must be later than start with reverse=True")
- else:
- if start < end:
- raise Error(
- "watchlist_revs: start must be later than end with reverse=False")
-        wlgen = api.ListGenerator("watchlist", wlallrev="", site=self,
- wlprop="user|comment|timestamp|title|ids|flags")
- #TODO: allow users to ask for "patrol" as well?
- if start is not None:
- wlgen.request["wlstart"] = start
- if end is not None:
- wlgen.request["wlend"] = end
- if reverse:
- wlgen.request["wldir"] = "newer"
- if isinstance(limit, int):
- wlgen.limit = limit
- if isinstance(namespaces, list):
- wlgen.request["wlnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- wlgen.request["wlnamespace"] = str(namespaces)
- filters = {'minor': showMinor,
- 'bot': showBot,
- 'anon': showAnon}
- wlshow = []
- for item in filters:
- if filters[item] is not None:
- wlshow.append(filters[item] and item or ("!"+item))
- if wlshow:
- wlgen.request["wlshow"] = "|".join(wlshow)
- return wlgen
-
- def deletedrevs(self, page, start=None, end=None, reverse=None, limit=None,
- get_text=False):
- """Iterate deleted revisions.
-
- Each value returned by the iterator will be a dict containing the
-        'title' and 'ns' keys for a particular Page and a 'revisions' key
- whose value is a list of revisions in the same format as
- recentchanges (plus a 'content' element if requested). If get_text
- is true, the toplevel dict will contain a 'token' key as well.
-
- @param page: The page to check for deleted revisions
- @param start: Iterate revisions starting at this timestamp
- @param end: Iterate revisions ending at this timestamp
- @param reverse: Iterate oldest revisions first (default: newest)
- @param limit: Iterate no more than this number of revisions.
- @param get_text: If True, retrieve the content of each revision and
- an undelete token
-
- """
- if start and end:
- if reverse:
- if end < start:
- raise Error(
-"deletedrevs: end must be later than start with reverse=True")
- else:
- if start < end:
- raise Error(
-"deletedrevs: start must be later than end with reverse=False")
- if not self.logged_in():
- self.login()
- if "deletedhistory" not in self.userinfo['rights']:
- try:
- self.login(True)
- except NoUsername:
- pass
- if "deletedhistory" not in self.userinfo['rights']:
- raise Error(
-"deletedrevs: User:%s not authorized to access deleted revisions."
- % self.user())
- if get_text:
- if "undelete" not in self.userinfo['rights']:
- try:
- self.login(True)
- except NoUsername:
- pass
- if "undelete" not in self.userinfo['rights']:
- raise Error(
-"deletedrevs: User:%s not authorized to view deleted content."
- % self.user())
-
- drgen = api.ListGenerator("deletedrevs", site=self,
- titles=page.title(withSection=False),
- drprop="revid|user|comment|minor")
- if get_text:
-            drgen.request['drprop'] = drgen.request['drprop'] + "|content|token"
- if start is not None:
- drgen.request["drstart"] = start
- if end is not None:
- drgen.request["drend"] = end
- if reverse:
- drgen.request["drdir"] = "newer"
- if isinstance(limit, int):
- drgen.limit = limit
- return drgen
-
- def users(self, usernames):
- """Iterate info about a list of users by name or IP.
-
- @param usernames: a list of user names
- @type usernames: list, or other iterable, of unicodes
-
- """
- if not isinstance(usernames, basestring):
- usernames = u"|".join(usernames)
- usgen = api.ListGenerator("users", ususers=usernames, site=self,
- usprop="blockinfo|groups|editcount|registration")
- return usgen
-
- def randompages(self, limit=1, namespaces=None, redirects=False):
- """Iterate a number of random pages.
-
- Pages are listed in a fixed sequence, only the starting point is
- random.
-
- @param limit: the maximum number of pages to iterate (default: 1)
- @param namespaces: only iterate pages in these namespaces.
- @param redirects: if True, include only redirect pages in results
- (default: include only non-redirects)
-
- """
- rngen = api.PageGenerator("random", site=self)
- rngen.limit = limit
- if isinstance(namespaces, list):
- rngen.request["grnnamespace"] = u"|".join(unicode(ns)
- for ns in namespaces)
- elif namespaces is not None:
- rngen.request["grnnamespace"] = str(namespaces)
- if redirects:
- rngen.request["grnredirect"] = ""
- return rngen
-
- # catalog of editpage error codes, for use in generating messages
- _ep_errors = {
- "noapiwrite": "API editing not enabled on %(site)s wiki",
- "writeapidenied":
-"User %(user)s is not authorized to edit on %(site)s wiki",
- "protectedtitle":
-"Title %(title)s is protected against creation on %(site)s",
- "cantcreate":
-"User %(user)s not authorized to create new pages on %(site)s wiki",
- "cantcreate-anon":
-"""Bot is not logged in, and anon users are not authorized to create new
pages
-on %(site)s wiki""",
- "articleexists": "Page %(title)s already exists on %(site)s
wiki",
- "noimageredirect-anon":
-"""Bot is not logged in, and anon users are not authorized to create
image
-redirects on %(site)s wiki""",
- "noimageredirect":
-"User %(user)s not authorized to create image redirects on %(site)s wiki",
- "spamdetected":
-"Edit to page %(title)s rejected by spam filter due to content:\n",
- "filtered": "%(info)s",
- "contenttoobig": "%(info)s",
- "noedit-anon":
-"""Bot is not logged in, and anon users are not authorized to edit on
-%(site)s wiki""",
- "noedit": "User %(user)s not authorized to edit pages on %(site)s
wiki",
- "pagedeleted":
-"Page %(title)s has been deleted since last retrieved from %(site)s wiki",
- "editconflict": "Page %(title)s not saved due to edit
conflict.",
- }
-
- def editpage(self, page, summary, minor=True, notminor=False,
- recreate=True, createonly=False, watch=False, unwatch=False):
- """Submit an edited Page object to be saved to the wiki.
-
- @param page: The Page to be saved; its .text property will be used
- as the new text to be saved to the wiki
- @param token: the edit token retrieved using Site.token()
- @param summary: the edit summary (required!)
- @param minor: if True (default), mark edit as minor
- @param notminor: if True, override account preferences to mark edit
- as non-minor
- @param recreate: if True (default), create new page even if this
- title has previously been deleted
- @param createonly: if True, raise an error if this title already
- exists on the wiki
- @param watch: if True, add this Page to bot's watchlist
- @param unwatch: if True, remove this Page from bot's watchlist if
- possible
- @return: True if edit succeeded, False if it failed
-
- """
- text = page.text
- if not text:
- raise Error("editpage: no text to be saved")
- try:
- lastrev = page.latestRevision()
- except NoPage:
- lastrev = None
- if not recreate:
- raise Error("Page %s does not exist on %s wiki."
- % (page.title(withSection=False), self))
- token = self.token(page, "edit")
- self.lock_page(page)
- if lastrev is not None and page.latestRevision() != lastrev:
- raise Error("editpage: Edit conflict detected; saving aborted.")
- req = api.Request(site=self, action="edit",
- title=page.title(withSection=False),
- text=text, token=token, summary=summary)
-## if lastrev is not None:
-## req["basetimestamp"] = page._revisions[lastrev].timestamp
- if minor:
- req['minor'] = ""
- elif notminor:
- req['notminor'] = ""
- if 'bot' in self.userinfo['groups']:
- req['bot'] = ""
- if recreate:
- req['recreate'] = ""
- if createonly:
- req['createonly'] = ""
- if watch:
- req['watch'] = ""
- elif unwatch:
- req['unwatch'] = ""
-## FIXME: API gives 'badmd5' error
-## md5hash = md5()
-## md5hash.update(urllib.quote_plus(text.encode(self.encoding())))
-## req['md5'] = md5hash.digest()
- while True:
- try:
- result = req.submit()
- logger.debug("editpage response: %s" % result)
- except api.APIError, err:
- self.unlock_page(page)
- if err.code.endswith("anon") and self.logged_in():
- logger.debug(
-"editpage: received '%s' even though bot is logged in" % err.code)
- errdata = {
- 'site': self,
- 'title': page.title(withSection=False),
- 'user': self.user(),
- 'info': err.info
- }
- if err.code == "spamdetected":
- raise SpamfilterError(self._ep_errors[err.code] % errdata
- + err.info[ err.info.index("fragment: ") + 9: ])
-
- if err.code == "editconflict":
- raise EditConflict(self._ep_errors[err.code] % errdata)
- if err.code in self._ep_errors:
- raise Error(self._ep_errors[err.code] % errdata)
-                logger.debug("editpage: Unexpected error code '%s' received."
- % err.code)
- raise
-            assert ("edit" in result and "result" in result["edit"]), result
- if result["edit"]["result"] == "Success":
- self.unlock_page(page)
- if "nochange" in result["edit"]:
- # null edit, page not changed
- # TODO: do we want to notify the user of this?
- return True
- page._revid = result["edit"]["newrevid"]
-                # see http://www.mediawiki.org/wiki/API:Wikimania_2006_API_discussion#Notes
- # not safe to assume that saved text is the same as sent
- self.loadrevisions(page, getText=True)
- return True
- elif result["edit"]["result"] == "Failure":
- if "captcha" in result["edit"]:
- captcha = result["edit"]["captcha"]
- req['captchaid'] = captcha['id']
- if captcha["type"] == "math":
-                        req['captchaword'] = input(captcha["question"])
- continue
- elif "url" in captcha:
-                        webbrowser.open(captcha["url"])
- req['captchaword'] = cap_answerwikipedia.input(
-"Please view CAPTCHA in your browser, then type answer here:")
- continue
- else:
- self.unlock_page(page)
- logger.error(
-"editpage: unknown CAPTCHA response %s, page not saved"
- % captcha)
- return False
- else:
- self.unlock_page(page)
- logger.error("editpage: unknown failure reason %s"
- % str(result))
- return False
- else:
- self.unlock_page(page)
- logger.error(
-"editpage: Unknown result code '%s' received; page not saved"
- % result["edit"]["result"])
- logger.error(str(result))
- return False
-
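# Sketch of the save path above: give the Page new text and submit it with an
# edit summary. Assigning to page.text is an assumption about the Page class
# (editpage() only reads it); the Site arguments, title and text are
# placeholders, and whether scripts call editpage() directly or go through a
# higher-level Page method is not shown in this changeset.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, u'Sandbox')
page.text = u'-- test edit --'          # assumed to be assignable
if site.editpage(page, summary=u'testing editpage()', minor=True):
    print 'saved'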
- # catalog of move errors for use in error messages
- _mv_errors = {
- "noapiwrite": "API editing not enabled on %(site)s wiki",
- "writeapidenied":
-"User %(user)s is not authorized to edit on %(site)s wiki",
- "nosuppress":
-"User %(user)s is not authorized to move pages without creating redirects",
- "cantmove-anon":
-"""Bot is not logged in, and anon users are not authorized to move pages
on
-%(site)s wiki""",
- "cantmove":
-"User %(user)s is not authorized to move pages on %(site)s wiki",
- "immobilenamespace":
-"Pages in %(oldnamespace)s namespace cannot be moved on %(site)s wiki",
- "articleexists":
-"Cannot move because page [[%(newtitle)s]] already exists on %(site)s wiki",
- "protectedpage":
-"Page [[%(oldtitle)s]] is protected against moving on %(site)s wiki",
- "protectedtitle":
-"Page [[%(newtitle)s]] is protected against creation on %(site)s wiki",
- "nonfilenamespace":
-"Cannot move a file to %(newnamespace)s namespace on %(site)s wiki",
- "filetypemismatch":
-"[[%(newtitle)s]] file extension does not match content of [[%(oldtitle)s]]"
- }
-
- def movepage(self, page, newtitle, summary, movetalk=True,
- noredirect=False):
- """Move a Page to a new title.
-
- @param page: the Page to be moved (must exist)
- @param newtitle: the new title for the Page
- @type newtitle: unicode
- @param summary: edit summary (required!)
- @param movetalk: if True (default), also move the talk page if possible
- @param noredirect: if True, suppress creation of a redirect from the
- old title to the new one
- @return: Page object with the new title
-
- """
- oldtitle = page.title(withSection=False)
- newlink = pywikibot.Link(newtitle, self)
- if newlink.namespace:
- newtitle = self.namespace(newlink.namespace) + ":" + newlink.title
- else:
- newtitle = newlink.title
- if oldtitle == newtitle:
- raise Error("Cannot move page %s to its own title."
- % oldtitle)
- if not page.exists():
- raise Error("Cannot move page %s because it does not exist on %s."
- % (oldtitle, self))
- token = self.token(page, "move")
- self.lock_page(page)
- req = api.Request(site=self, action="move", to=newtitle,
- token=token, reason=summary)
- req['from'] = oldtitle # "from" is a python keyword
- if movetalk:
- req['movetalk'] = ""
- if noredirect:
- req['noredirect'] = ""
- try:
- result = req.submit()
- logger.debug("movepage response: %s" % result)
- except api.APIError, err:
- if err.code.endswith("anon") and self.logged_in():
- logger.debug(
-"movepage: received '%s' even though bot is logged in" % err.code)
- errdata = {
- 'site': self,
- 'oldtitle': oldtitle,
- 'oldnamespace': self.namespace(page.namespace()),
- 'newtitle': newtitle,
- 'newnamespace': self.namespace(newlink.namespace),
- 'user': self.user(),
- }
- if err.code in self._mv_errors:
- raise Error(self._mv_errors[err.code] % errdata)
-            logger.debug("movepage: Unexpected error code '%s' received."
- % err.code)
- raise
- finally:
- self.unlock_page(page)
- if "move" not in result:
- logger.error("movepage: %s" % result)
- raise Error("movepage: unexpected response")
- # TODO: Check for talkmove-error messages
- if "talkmove-error-code" in result["move"]:
- logger.warning(u"movepage: Talk page %s not moved"
- % (page.toggleTalkPage().title(asLink=True)))
- return pywikibot.Page(page, newtitle)
-
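# Sketch: renaming a page with movepage(); suppressing the redirect requires
# the right described by the 'nosuppress' entry above. Titles, summary and
# Site arguments are placeholders.
import pywikibot

site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, u'Old title')
newpage = site.movepage(page, u'New title',
                        summary=u'renaming per naming convention',
                        movetalk=True, noredirect=False)
print newpage.title()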
- # catalog of rollback errors for use in error messages
- _rb_errors = {
- "noapiwrite":
- "API editing not enabled on %(site)s wiki",
- "writeapidenied":
- "User %(user)s not allowed to edit through the API",
- "alreadyrolled":
- "Page [[%(title)s]] already rolled back; action aborted.",
- } # other errors shouldn't arise because we check for those errors
-
- def rollbackpage(self, page, summary=u''):
- """Roll back page to version before last user's edits.
-
- As a precaution against errors, this method will fail unless
- the page history contains at least two revisions, and at least
- one that is not by the same user who made the last edit.
-
- @param page: the Page to be rolled back (must exist)
- @param summary: edit summary (defaults to a standardized message)
-
- """
- if len(page._revisions) < 2:
- raise pywikibot.Error(
- u"Rollback of %s aborted; load revision history first."
- % page.title(asLink=True))
- last_rev = page._revisions[page.latestRevision()]
- last_user = last_rev.user
- for rev in sorted(page._revisions.keys(), reverse=True):
- # start with most recent revision first
-            if page._revisions[rev].user != last_user:
-                prev_user = page._revisions[rev].user
- break
- else:
- raise pywikibot.Error(
-                u"Rollback of %s aborted; only one user in revision history."
- % page.title(asLink=True))
- summary = summary or (
-u"Reverted edits by [[Special:Contributions/%(last_user)s|%(last_user)s]] "
-u"([[User talk:%(last_user)s|Talk]]) to last version by %(prev_user)s"
- % locals())
- token = self.token(page, "rollback")
- self.lock_page(page)
- req = api.Request(site=self, action="rollback",
- title=page.title(withSection=False),
- user=last_user,
- token=token)
- try:
- result = req.submit()
- except api.APIError, err:
- errdata = {
- 'site': self,
- 'title': page.title(withSection=False),
- 'user': self.user(),
- }
- if err.code in self._rb_errors:
- raise Error(self._rb_errors[err.code] % errdata)
-            logger.debug("rollback: Unexpected error code '%s' received."
- % err.code)
- raise
- finally:
- self.unlock_page(page)
-
- # catalog of delete errors for use in error messages
- _dl_errors = {
- "noapiwrite":
- "API editing not enabled on %(site)s wiki",
- "writeapidenied":
- "User %(user)s not allowed to edit through the API",
- "permissiondenied":
- "User %(user)s not authorized to delete pages on %(site)s wiki.",
- "cantdelete":
- "Could not delete [[%(title)s]]. Maybe it was deleted already.",
- } # other errors shouldn't occur because of pre-submission checks
-
- def deletepage(self, page, summary):
- """Delete page from the wiki. Requires appropriate privilege
level.
-
- @param page: Page to be deleted.
- @param summary: Edit summary (required!).
-
- """
- try:
- self.login(sysop=True)
- except pywikibot.Error, e:
- raise Error("delete: Unable to login as sysop (%s)"
- % e.__class__.__name__)
- if not self.logged_in(sysop=True):
- raise Error("delete: Unable to login as sysop")
-        token = self.token(page, "delete")
- req = api.Request(site=self, action="delete", token=token,
- title=page.title(withSection=False),
- reason=summary)
- try:
- result = req.submit()
- except api.APIError, err:
- errdata = {
- 'site': self,
- 'title': page.title(withSection=False),
- 'user': self.user(),
- }
- if err.code in self._dl_errors:
- raise Error(self._dl_errors[err.code] % errdata)
-            logger.debug("delete: Unexpected error code '%s' received."
- % err.code)
- raise
- finally:
- self.unlock_page(page)
-
- # TODO: implement undelete
-
- # TODO: implement patrol
-
- def linksearch(self, siteurl, limit=500):
- """Backwards-compatible interface to
exturlusage()"""
- return self.exturlusage(siteurl, limit=limit)
-
- @deprecate_arg("repeat", None)
- def newimages(self, number=100, lestart=None, leend=None, leuser=None,
- letitle=None):
- """Yield ImagePages from most recent uploads"""
- return self.logevents(logtype="upload", limit=number, start=lestart,
- end=leend, user=leuser, title=letitle)
-
- def getImagesFromAnHash(self, hash_found=None):
- """Return all images that have the same hash.
-
- Useful to find duplicates or nowcommons.
-
- NOTE: it returns also the image itself, if you don't want it, just
- filter the list returned.
-
- NOTE 2: it returns the image title WITHOUT the image namespace.
-
- """
-        if hash_found == None: # if the hash is None, return None instead of continuing
- return None
- return [image.title(withNamespace=False)
- for image in self.allimages(sha1=hash_found)]
-
-
-#### METHODS NOT IMPLEMENTED YET ####
-class NotImplementedYet:
-
- # TODO: is this needed any more? can it be obtained from the http module?
- def cookies(self, sysop = False):
- """Return a string containing the user's current
cookies."""
- self._loadCookies(sysop = sysop)
- index = self._userIndex(sysop)
- return self._cookies[index]
-
- def _loadCookies(self, sysop = False):
- """Retrieve session cookies for login"""
- index = self._userIndex(sysop)
- if self._cookies[index] is not None:
- return
- try:
- if sysop:
- try:
- username = config.sysopnames[self.family.name
- ][self.code]
- except KeyError:
- raise NoUsername("""\
-You tried to perform an action that requires admin privileges, but you haven't
-entered your sysop name in your user-config.py. Please add
-sysopnames['%s']['%s']='name' to your user-config.py"""
- % (self.family.name, self.code))
- else:
-                username = pywikibot.config2.usernames[self.family.name
- ][self.code]
- except KeyError:
- self._cookies[index] = None
- self._isLoggedIn[index] = False
- else:
- tmp = '%s-%s-%s-login.data' % (
- self.family.name, self.code, username)
- fn = config.datafilepath('login-data', tmp)
- if not os.path.exists(fn):
- self._cookies[index] = None
- self._isLoggedIn[index] = False
- else:
- f = open(fn)
-                    self._cookies[index] = '; '.join([x.strip() for x in f.readlines()])
- f.close()
-
- # THESE ARE FUNCTIONS NOT YET IMPLEMENTED IN THE API
- # TODO: avoid code duplication for the following methods
- def newpages(self, number = 10, get_redirect = False, repeat = False):
- """Yield new articles (as Page objects) from Special:Newpages.
-
- Starts with the newest article and fetches the number of articles
- specified in the first argument. If repeat is True, it fetches
- Newpages again. If there is no new page, it blocks until there is
- one, sleeping between subsequent fetches of Newpages.
-
- The objects yielded are tuples composed of the Page object,
- timestamp (unicode), length (int), an empty unicode string, username
- or IP address (str), comment (unicode).
-
- """
- # TODO: in recent MW versions Special:Newpages takes a namespace parameter,
- # and defaults to 0 if not specified.
- # TODO: Detection of unregistered users is broken
- # TODO: Repeat mechanism doesn't make much sense as implemented;
- # should use both offset and limit parameters, and have an
- # option to fetch older rather than newer pages
- seen = set()
- while True:
- path = self.newpages_address(n=number)
- # The throttling is important here, so always enabled.
- get_throttle()
- html = self.getUrl(path)
-
- entryR = re.compile(
-'<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
-' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
-' .?<a href=".+?" title=".+?:(?P<username>.+?)">'
- )
- for m in entryR.finditer(html):
- date = m.group('date')
- title = m.group('title')
-                title = title.replace('&quot;', '"')
-                length = int(re.sub("[,.]", "", m.group('length')))
- loggedIn = u''
- username = m.group('username')
- comment = u''
-
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page, date, length, loggedIn, username, comment
- if not repeat:
- break
-
- def longpages(self, number = 10, repeat = False):
- """Yield Pages from Special:Longpages.
-
- Return values are a tuple of Page object, length(int).
-
- """
- #TODO: should use offset and limit parameters; 'repeat' as now
- # implemented is fairly useless
- # this comment applies to all the XXXXpages methods following, as well
- seen = set()
- while True:
- path = self.longpages_address(n=number)
- get_throttle()
- html = self.getUrl(path)
-            entryR = re.compile(ur'<li>\(<a href=".+?" title=".+?">hist</a>\) <a href=".+?" title="(?P<title>.+?)">.+?</a> \[(?P<length>\d+)(.+?)\]</li>')
- for m in entryR.finditer(html):
- title = m.group('title')
- length = int(m.group('length'))
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page, length
- if not repeat:
- break
-
- def shortpages(self, number = 10, repeat = False):
- """Yield Pages and lengths from
Special:Shortpages."""
- throttle = True
- seen = set()
- while True:
- path = self.shortpages_address(n = number)
- get_throttle()
- html = self.getUrl(path)
-            entryR = re.compile(ur'<li>\(<a href=".+?" title=".+?">hist</a>\) <a href=".+?" title="(?P<title>.+?)">.+?</a> \[(?P<length>\d+)(.+?)\]</li>')
- for m in entryR.finditer(html):
- title = m.group('title')
- length = int(m.group('length'))
-
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page, length
- if not repeat:
- break
-
- def deadendpages(self, number = 10, repeat = False):
- """Yield Page objects retrieved from
Special:Deadendpages."""
- seen = set()
- while True:
- path = self.deadendpages_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- entryR = re.compile(
-                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
- for m in entryR.finditer(html):
- title = m.group('title')
-
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page
- if not repeat:
- break
-
- def ancientpages(self, number = 10, repeat = False):
- """Yield Pages, datestamps from
Special:Ancientpages."""
- seen = set()
- while True:
- path = self.ancientpages_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- entryR = re.compile(
-'<li><a href=".+?"
title="(?P<title>.+?)">.+?</a>
(?P<date>.+?)</li>')
- for m in entryR.finditer(html):
- title = m.group('title')
- date = m.group('date')
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page, date
- if not repeat:
- break
-
- def lonelypages(self, number = 10, repeat = False):
- """Yield Pages retrieved from
Special:Lonelypages."""
- throttle = True
- seen = set()
- while True:
- path = self.lonelypages_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- entryR = re.compile(
-                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
- for m in entryR.finditer(html):
- title = m.group('title')
-
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page
- if not repeat:
- break
-
- def unwatchedpages(self, number = 10, repeat = False):
- """Yield Pages from Special:Unwatchedpages (requires Admin
privileges)."""
- seen = set()
- while True:
- path = self.unwatchedpages_address(n=number)
- get_throttle()
- html = self.getUrl(path, sysop = True)
- entryR = re.compile(
-                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.+?</li>')
- for m in entryR.finditer(html):
- title = m.group('title')
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page
- if not repeat:
- break
-
- def uncategorizedcategories(self, number = 10, repeat = False):
-        """Yield Categories from Special:Uncategorizedcategories."""
- import catlib
- seen = set()
- while True:
- path = self.uncategorizedcategories_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- entryR = re.compile(
-                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
- for m in entryR.finditer(html):
- title = m.group('title')
- if title not in seen:
- seen.add(title)
- page = catlib.Category(self, title)
- yield page
- if not repeat:
- break
-
- def newimages(self, number = 10, repeat = False):
-        """Yield ImagePages from Special:Log&type=upload"""
-
- seen = set()
-        regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+\(.+?</a>\).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?', re.UNICODE)
-
- while True:
- path = self.log_address(number, mode = 'upload')
- get_throttle()
- html = self.getUrl(path)
-
- for m in regexp.finditer(html):
- image = m.group('image')
-
- if image not in seen:
- seen.add(image)
-
- if m.group('new'):
-                        output(u"Image \'%s\' has been deleted." % image)
- continue
-
- date = m.group('date')
- user = m.group('user')
- comment = m.group('comment') or ''
-
- yield ImagePage(self, image), date, user, comment
- if not repeat:
- break
-
- def uncategorizedimages(self, number = 10, repeat = False):
-        """Yield ImagePages from Special:Uncategorizedimages."""
- seen = set()
- ns = self.image_namespace()
- entryR = re.compile(
-            '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns)
- while True:
- path = self.uncategorizedimages_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- for m in entryR.finditer(html):
- title = m.group('title')
- if title not in seen:
- seen.add(title)
- page = ImagePage(self, title)
- yield page
- if not repeat:
- break
-
- def uncategorizedpages(self, number = 10, repeat = False):
- """Yield Pages from Special:Uncategorizedpages."""
- seen = set()
- while True:
- path = self.uncategorizedpages_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- entryR = re.compile(
-                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
- for m in entryR.finditer(html):
- title = m.group('title')
-
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page
- if not repeat:
- break
-
- def unusedcategories(self, number = 10, repeat = False):
-        """Yield Category objects from Special:Unusedcategories."""
- import catlib
- seen = set()
- while True:
- path = self.unusedcategories_address(n=number)
- get_throttle()
- html = self.getUrl(path)
-            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
- for m in entryR.finditer(html):
- title = m.group('title')
-
- if title not in seen:
- seen.add(title)
- page = catlib.Category(self, title)
- yield page
- if not repeat:
- break
-
- def unusedfiles(self, number = 10, repeat = False, extension = None):
-        """Yield ImagePage objects from Special:Unusedimages."""
- seen = set()
- ns = self.image_namespace()
- entryR = re.compile(
-            '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns)
- while True:
- path = self.unusedfiles_address(n=number)
- get_throttle()
- html = self.getUrl(path)
- for m in entryR.finditer(html):
- fileext = None
- title = m.group('title')
- if extension:
- fileext = title[len(title)-3:]
- if title not in seen and fileext == extension:
- ## Check whether the media is used in a Proofread page
- # code disabled because it slows this method down, and
- # because it is unclear what it's supposed to do.
- #basename = title[6:]
- #page = Page(self, 'Page:' + basename)
-
- #if not page.exists():
- seen.add(title)
- image = ImagePage(self, title)
- yield image
- if not repeat:
- break
-
- def withoutinterwiki(self, number=10, repeat=False):
-        """Yield Pages without language links from Special:Withoutinterwiki."""
- seen = set()
- while True:
- path = self.withoutinterwiki_address(n=number)
- get_throttle()
- html = self.getUrl(path)
-            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
- for m in entryR.finditer(html):
- title = m.group('title')
- if title not in seen:
- seen.add(title)
- page = Page(self, title)
- yield page
- if not repeat:
- break
-
- def linksearch(self, siteurl):
-        """Yield Pages from results of Special:Linksearch for 'siteurl'."""
- if siteurl.startswith('*.'):
- siteurl = siteurl[2:]
- output(u'Querying [[Special:Linksearch]]...')
- cache = []
- for url in [siteurl, '*.' + siteurl]:
- path = self.linksearch_address(url)
- get_throttle()
- html = self.getUrl(path)
- loc = html.find('<div class="mw-spcontent">')
- if loc > -1:
- html = html[loc:]
- loc = html.find('<div class="printfooter">')
- if loc > -1:
- html = html[:loc]
- R = re.compile('title ?=\"(.*?)\"')
- for title in R.findall(html):
- if not siteurl in title:
- # the links themselves have similar form
- if title in cache:
- continue
- else:
- cache.append(title)
- yield Page(self, title)
-
+# -*- coding: utf-8 -*-
+"""
+Objects representing MediaWiki sites (wikis) and families (groups of wikis
+on the same topic in different languages).
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot import deprecate_arg
+from pywikibot import config
+from pywikibot.throttle import Throttle
+from pywikibot.data import api
+from pywikibot.exceptions import *
+
+try:
+ from hashlib import md5
+except ImportError:
+ from md5 import md5
+import logging
+import os
+import re
+import sys
+import threading
+import time
+import urllib
+
+logger = logging.getLogger("wiki")
+
+class PageInUse(pywikibot.Error):
+    """Page cannot be reserved for writing due to existing lock."""
+
+
+def Family(fam=None, fatal=True):
+ """Import the named family.
+
+ @param fam: family name (if omitted, uses the configured default)
+ @type fam: str
+ @param fatal: if True, the bot will stop running if the given family is
+ unknown. If False, it will only raise a ValueError exception.
+ @param fatal: bool
+ @return: a Family instance configured for the named family.
+
+ """
+ if fam == None:
+ fam = config.family
+ try:
+ # first try the built-in families
+ exec "import pywikibot.families.%s_family as myfamily" % fam
+ except ImportError:
+ # next see if user has defined a local family module
+ try:
+ sys.path.append(config.datafilepath('families'))
+ exec "import %s_family as myfamily" % fam
+ except ImportError:
+ if fatal:
+ logger.exception(u"""\
+Error importing the %s family. This probably means the family
+does not exist. Also check your configuration file."""
+ % fam)
+ sys.exit(1)
+ else:
+ raise Error("Family %s does not exist" % fam)
+ return myfamily.Family()
+
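
A rough sketch of how the loader above is meant to be exercised; it assumes the built-in wikipedia family module ships with the framework, and the failing name is made up:

    from pywikibot.site import Family
    from pywikibot.exceptions import Error

    fam = Family("wikipedia")           # resolved from the built-in families first
    try:
        Family("no_such_family", fatal=False)
    except Error:
        pass                            # unknown family raises Error instead of exiting
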
+
+class BaseSite(object):
+    """Site methods that are independent of the communication interface."""
+ # to implement a specific interface, define a Site class that inherits
+ # from this
+
+ def __init__(self, code, fam=None, user=None, sysop=None):
+ """
+ @param code: the site's language code
+ @type code: str
+ @param fam: wiki family name (optional)
+ @type fam: str or Family
+ @param user: bot user name (optional)
+ @type user: str
+ @param sysop: sysop account user name (optional)
+ @type sysop: str
+
+ """
+ self.__code = code.lower()
+ if isinstance(fam, basestring) or fam is None:
+ self.__family = Family(fam, fatal=False)
+ else:
+ self.__family = fam
+
+ # if we got an outdated language code, use the new one instead.
+ if self.__family.obsolete.has_key(self.__code):
+ if self.__family.obsolete[self.__code] is not None:
+ self.__code = self.__family.obsolete[self.__code]
+ else:
+ # no such language anymore
+ raise NoSuchSite("Language %s in family %s is obsolete"
+ % (self.__code, self.__family.name))
+ if self.__code not in self.languages():
+            if self.__code == 'zh-classic' and 'zh-classical' in self.languages():
+ self.__code = 'zh-classical'
+ # database hack (database is varchar[10] -> zh-classical
+ # is cut to zh-classic.
+ else:
+ raise NoSuchSite("Language %s does not exist in family %s"
+ % (self.__code, self.__family.name))
+
+ self._username = [user, sysop]
+
+ # following are for use with lock_page and unlock_page methods
+ self._pagemutex = threading.Lock()
+ self._locked_pages = []
+
+ @property
+ def throttle(self):
+        """Return this Site's throttle. Initialize a new one if needed."""
+
+ if not hasattr(self, "_throttle"):
+ self._throttle = Throttle(self, multiplydelay=True,
+ verbosedelay=True)
+ try:
+ self.login(False)
+ except pywikibot.NoUsername:
+ pass
+ return self._throttle
+
+ @property
+ def family(self):
+        """The Family object for this Site's wiki family."""
+
+ return self.__family
+
+ @property
+ def code(self):
+ """The identifying code for this Site."""
+
+ return self.__code
+
+ @property
+ def lang(self):
+ """The ISO language code for this Site.
+
+ Presumed to be equal to the wiki prefix, but this can be overridden.
+
+ """
+ return self.__code
+
+ def __cmp__(self, other):
+        """Perform equality and inequality tests on Site objects."""
+
+ if not isinstance(other, BaseSite):
+ return 1
+ if self.family == other.family:
+ return cmp(self.code, other.code)
+ return cmp(self.family.name, other.family.name)
+
+ def user(self):
+        """Return the currently-logged in bot user, or None."""
+
+ if self.logged_in(True):
+ return self._username[True]
+ elif self.logged_in(False):
+ return self._username[False]
+ return None
+
+ def username(self, sysop = False):
+ return self._username[sysop]
+
+ def __getattr__(self, attr):
+        """Calls to methods not defined in this object are passed to Family."""
+
+ if hasattr(self.__class__, attr):
+            return getattr(self.__class__, attr)
+ try:
+ method = getattr(self.family, attr)
+ f = lambda *args, **kwargs: \
+ method(self.code, *args, **kwargs)
+ if hasattr(method, "__doc__"):
+ f.__doc__ = method.__doc__
+ return f
+ except AttributeError:
+ raise AttributeError("%s instance has no attribute '%s'"
+ % (self.__class__.__name__, attr) )
+
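
Because of the delegation above, Family methods can be called directly on a Site object with the site's code filled in automatically. A small illustration (hostname() is assumed here to be one of the methods defined in the family files, and the Site() factory to accept a code and family name):

    site = pywikibot.Site("en", "wikipedia")
    # the proxy created by __getattr__ inserts site.code as the first argument
    assert site.hostname() == site.family.hostname(site.code)
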
+ def sitename(self):
+        """Return string representing this Site's name and language."""
+
+ return self.family.name+':'+self.code
+
+ __str__ = sitename
+
+ def __repr__(self):
+        return 'Site("%s", "%s")' % (self.code, self.family.name)
+
+ def __hash__(self):
+ return hash(repr(self))
+
+ def linktrail(self):
+ """Return regex for trailing chars displayed as part of a link.
+
+ Returns a string, not a compiled regular expression object.
+
+ This reads from the family file, and ''not'' from
+ [[MediaWiki:Linktrail]], because the MW software currently uses a
+ built-in linktrail from its message files and ignores the wiki
+ value.
+
+ """
+ return self.family.linktrail(self.code)
+
+ def languages(self):
+        """Return list of all valid language codes for this site's Family."""
+
+ return self.family.langs.keys()
+
+ def validLanguageLinks(self):
+        """Return list of language codes that can be used in interwiki links."""
+
+ nsnames = sum(self.namespaces().values(), [])
+ return [l for l in self.languages()
+                if l[:1].upper() + l[1:] not in nsnames]
+
+ def ns_index(self, namespace):
+        """Given a namespace name, return its int index, or None if invalid."""
+
+ for ns in self.namespaces():
+ if namespace.lower() in [name.lower()
+ for name in self.namespaces()[ns]]:
+ return ns
+ return None
+
+ getNamespaceIndex = ns_index # for backwards-compatibility
+
+ def namespaces(self):
+        """Return dict of valid namespaces on this wiki."""
+
+ return self._namespaces
+
+ def ns_normalize(self, value):
+ """Return canonical local form of namespace name.
+
+ @param value: A namespace name
+ @type value: unicode
+
+ """
+ index = self.ns_index(value)
+ return self.namespace(index)
+
+ normalizeNamespace = ns_normalize # for backwards-compatibility
+
+ def redirect(self, default=True):
+ """Return the localized redirect tag for the site.
+
+ If default is True, falls back to 'REDIRECT' if the site has no
+ special redirect tag.
+
+ """
+ if default:
+ return self.family.redirect.get(self.code, [u"REDIRECT"])[0]
+ else:
+ return self.family.redirect.get(self.code, None)
+
+ def lock_page(self, page, block=True):
+        """Lock page for writing. Must be called before writing any page.
+
+ We don't want different threads trying to write to the same page
+ at the same time, even to different sections.
+
+ @param page: the page to be locked
+ @type page: pywikibot.Page
+ @param block: if true, wait until the page is available to be locked;
+ otherwise, raise an exception if page can't be locked
+
+ """
+ self._pagemutex.acquire()
+ try:
+            while page.title(withSection=False) in self._locked_pages:
+ if not block:
+ raise PageInUse
+ time.sleep(.25)
+ self._locked_pages.append(page.title(withSection=False))
+ finally:
+ self._pagemutex.release()
+
+ def unlock_page(self, page):
+ """Unlock page. Call as soon as a write operation has completed.
+
+        @param page: the page to be unlocked
+ @type page: pywikibot.Page
+
+ """
+ self._pagemutex.acquire()
+ try:
+ self._locked_pages.remove(page.title(withSection=False))
+ finally:
+ self._pagemutex.release()
+
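
A sketch of the locking pattern these two methods are designed for, so that concurrent threads serialize their writes; page, newtext, and the put() call are placeholders for whatever write the bot performs:

    site = page.site
    site.lock_page(page, block=True)
    try:
        page.put(newtext, comment=u"bot edit")   # any write operation
    finally:
        site.unlock_page(page)
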
+ def disambcategory(self):
+        """Return Category in which disambig pages are listed."""
+
+ try:
+ name = self.namespace(14)+':'+self.family.disambcatname[self.code]
+ except KeyError:
+ raise Error(u"No disambiguation category name found for %(site)s"
+ % {'site': self})
+ return pywikibot.Category(pywikibot.Link(name, self))
+
+ def linkto(self, title, othersite = None):
+        """Return unicode string in the form of a wikilink to 'title'
+
+ Use optional Site argument 'othersite' to generate an interwiki link.
+
+ """
+ logger.debug("Site.linkto() method is deprecated; use pywikibot.Link")
+ return pywikibot.Link(title, self).astext(othersite)
+
+ def isInterwikiLink(self, s):
+ """Return True if s is in the form of an interwiki link.
+
+ If a link object constructed using "s" as the link text parses as
+ belonging to a different site, this method returns True.
+
+ """
+ return (pywikibot.Link(s, self).site != self)
+
+ def redirectRegex(self):
+        """Return a compiled regular expression matching on redirect pages.
+
+ Group 1 in the regex match object will be the target title.
+
+ """
+ #TODO: is this needed, since the API identifies redirects?
+ # (maybe, the API can give false positives)
+ default = 'REDIRECT'
+ try:
+ keywords = set(self.family.redirect[self.code])
+ keywords.add(default)
+ pattern = r'(?:' + '|'.join(keywords) + ')'
+ except KeyError:
+ # no localized keyword for redirects
+ pattern = r'%s' % default
+ # A redirect starts with hash (#), followed by a keyword, then
+ # arbitrary stuff, then a wikilink. The wikilink may contain
+ # a label, although this is not useful.
+ return re.compile(r'\s*#%(pattern)s\s*:?\s*\[\[(.+?)(?:\|.*?)?\]\]'
+ % locals(),
+ re.IGNORECASE | re.UNICODE | re.DOTALL)
+
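
A usage sketch for the compiled pattern above; the sample wikitext is invented:

    text = u"#REDIRECT [[Main Page|old label]]"
    m = site.redirectRegex().match(text)
    if m:
        target_title = m.group(1)    # u"Main Page"
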
+ # namespace shortcuts for backwards-compatibility
+
+ def special_namespace(self):
+ return self.namespace(-1)
+
+ def image_namespace(self):
+ return self.namespace(6)
+
+ def mediawiki_namespace(self):
+ return self.namespace(8)
+
+ def template_namespace(self):
+ return self.namespace(10)
+
+ def category_namespace(self):
+ return self.namespace(14)
+
+ def category_namespaces(self):
+ return self.namespace(14, all=True)
+
+ # site-specific formatting preferences
+
+ def category_on_one_line(self):
+        """Return True if this site wants all category links on one line."""
+
+ return self.code in self.family.category_on_one_line
+
+ def interwiki_putfirst(self):
+        """Return list of language codes for ordering of interwiki links."""
+
+ return self.family.interwiki_putfirst.get(self.code, None)
+
+ def interwiki_putfirst_doubled(self, list_of_links):
+ # TODO: is this even needed? No family in the framework has this
+ # dictionary defined!
+ if self.lang in self.family.interwiki_putfirst_doubled:
+ if len(list_of_links) >= \
+ self.family.interwiki_putfirst_doubled[self.lang][0]:
+ links2 = [lang.language() for lang in list_of_links]
+ result = []
+ for lang in self.family.interwiki_putfirst_doubled[self.lang][1]:
+ try:
+ result.append(list_of_links[links2.index(lang)])
+ except ValueError:
+ pass
+ return result
+ else:
+ return False
+ else:
+ return False
+
+ def getSite(self, code):
+        """Return Site object for language 'code' in this Family."""
+
+ return pywikibot.Site(code=code, fam=self.family, user=self.user)
+
+ # deprecated methods for backwards-compatibility
+
+ def fam(self):
+ """Return Family object for this Site."""
+ return self.family
+
+ def urlEncode(self, query):
+ """DEPRECATED"""
+ return urllib.urlencode(query)
+
+ def getUrl(self, path, retry=True, sysop=False, data=None,
+ compress=True, no_hostname=False, cookie_only=False):
+ """DEPRECATED.
+
+ Retained for compatibility only. All arguments except path and data
+ are ignored.
+
+ """
+ if data:
+ if not isinstance(data, basestring):
+ data = urllib.urlencode(data)
+ return pywikibot.comms.data.request(self, path, method="PUT",
+ body=data)
+ else:
+ return pywikibot.comms.data.request(self, path)
+
+ def postForm(self, address, predata, sysop=False, cookies=None):
+ """DEPRECATED"""
+ return self.getUrl(address, data=predata)
+
+ def postData(self, address, data, contentType=None, sysop=False,
+ compress=True, cookies=None):
+ """DEPRECATED"""
+ return self.getUrl(address, data=data)
+
+ # unsupported methods from version 1
+
+ def checkCharset(self, charset):
+ raise NotImplementedError
+ def getToken(self, getalways=True, getagain=False, sysop=False):
+ raise NotImplementedError
+ def export_address(self):
+ raise NotImplementedError
+ def move_address(self):
+ raise NotImplementedError
+ def delete_address(self, s):
+ raise NotImplementedError
+ def undelete_view_address(self, s, ts=''):
+ raise NotImplementedError
+ def undelete_address(self):
+ raise NotImplementedError
+ def protect_address(self, s):
+ raise NotImplementedError
+ def unprotect_address(self, s):
+ raise NotImplementedError
+ def put_address(self, s):
+ raise NotImplementedError
+ def get_address(self, s):
+ raise NotImplementedError
+ def nice_get_address(self, s):
+ raise NotImplementedError
+ def edit_address(self, s):
+ raise NotImplementedError
+ def purge_address(self, s):
+ raise NotImplementedError
+ def block_address(self):
+ raise NotImplementedError
+ def unblock_address(self):
+ raise NotImplementedError
+ def blocksearch_address(self, s):
+ raise NotImplementedError
+ def linksearch_address(self, s, limit=500, offset=0):
+ raise NotImplementedError
+ def search_address(self, q, n=50, ns=0):
+ raise NotImplementedError
+ def allpages_address(self, s, ns = 0):
+ raise NotImplementedError
+ def log_address(self, n=50, mode = ''):
+ raise NotImplementedError
+ def newpages_address(self, n=50):
+ raise NotImplementedError
+ def longpages_address(self, n=500):
+ raise NotImplementedError
+ def shortpages_address(self, n=500):
+ raise NotImplementedError
+ def unusedfiles_address(self, n=500):
+ raise NotImplementedError
+ def categories_address(self, n=500):
+ raise NotImplementedError
+ def deadendpages_address(self, n=500):
+ raise NotImplementedError
+ def ancientpages_address(self, n=500):
+ raise NotImplementedError
+ def lonelypages_address(self, n=500):
+ raise NotImplementedError
+ def protectedpages_address(self, n=500):
+ raise NotImplementedError
+ def unwatchedpages_address(self, n=500):
+ raise NotImplementedError
+ def uncategorizedcategories_address(self, n=500):
+ raise NotImplementedError
+ def uncategorizedimages_address(self, n=500):
+ raise NotImplementedError
+ def uncategorizedpages_address(self, n=500):
+ raise NotImplementedError
+ def unusedcategories_address(self, n=500):
+ raise NotImplementedError
+ def withoutinterwiki_address(self, n=500):
+ raise NotImplementedError
+ def references_address(self, s):
+ raise NotImplementedError
+ def allmessages_address(self):
+ raise NotImplementedError
+ def upload_address(self):
+ raise NotImplementedError
+ def double_redirects_address(self, default_limit = True):
+ raise NotImplementedError
+ def broken_redirects_address(self, default_limit = True):
+ raise NotImplementedError
+ def login_address(self):
+ raise NotImplementedError
+ def captcha_image_address(self, id):
+ raise NotImplementedError
+ def watchlist_address(self):
+ raise NotImplementedError
+ def contribs_address(self, target, limit=500, offset=''):
+ raise NotImplementedError
+
+
+class APISite(BaseSite):
+ """API interface to MediaWiki site.
+
+ Do not use directly; use pywikibot.Site function.
+
+ """
+## Site methods from version 1.0 (as these are implemented in this file,
+## or declared deprecated/obsolete, they will be removed from this list)
+##########
+## cookies: return user's cookies as a string
+##
+## urlEncode: Encode a query to be sent using an http POST request.
+## postForm: Post form data to an address at this site.
+## postData: Post encoded form data to an http address at this site.
+##
+## shared_image_repository: Return tuple of image repositories used by this
+## site.
+## version: Return MediaWiki version string from Family file.
+## versionnumber: Return int identifying the MediaWiki version.
+## live_version: Return version number read from Special:Version.
+## checkCharset(charset): Warn if charset doesn't match family file.
+##
+## linktrail: Return regex for trailing chars displayed as part of a link.
+## disambcategory: Category in which disambiguation pages are listed.
+##
+## Methods that yield Page objects derived from a wiki's Special: pages
+## (note, some methods yield other information in a tuple along with the
+## Pages; see method docs for details) --
+##
+## newpages(): Special:Newpages
+## newimages(): Special:Log&type=upload
+## longpages(): Special:Longpages
+## shortpages(): Special:Shortpages
+## deadendpages(): Special:Deadendpages
+## ancientpages(): Special:Ancientpages
+## lonelypages(): Special:Lonelypages
+## unwatchedpages(): Special:Unwatchedpages (sysop accounts only)
+## uncategorizedcategories(): Special:Uncategorizedcategories (yields
+## Category objects)
+## uncategorizedpages(): Special:Uncategorizedpages
+## uncategorizedimages(): Special:Uncategorizedimages (yields
+## ImagePage objects)
+##    unusedcategories(): Special:Unusedcategories (yields Category)
+## unusedfiles(): Special:Unusedimages (yields ImagePage)
+## withoutinterwiki: Special:Withoutinterwiki
+## linksearch: Special:Linksearch
+
+ def __init__(self, code, fam=None, user=None, sysop=None):
+ BaseSite.__init__(self, code, fam, user, sysop)
+ self._namespaces = {
+ # these are the MediaWiki built-in names, which always work
+ # localized names are loaded later upon accessing the wiki
+ # namespace prefixes are always case-insensitive, but the
+ # canonical forms are capitalized
+ -2: [u"Media"],
+ -1: [u"Special"],
+ 0: [u""],
+ 1: [u"Talk"],
+ 2: [u"User"],
+ 3: [u"User talk"],
+ 4: [u"Project"],
+ 5: [u"Project talk"],
+ 6: [u"Image"],
+ 7: [u"Image talk"],
+ 8: [u"MediaWiki"],
+ 9: [u"MediaWiki talk"],
+ 10: [u"Template"],
+ 11: [u"Template talk"],
+ 12: [u"Help"],
+ 13: [u"Help talk"],
+ 14: [u"Category"],
+ 15: [u"Category talk"],
+ }
+ self.sitelock = threading.Lock()
+ self._msgcache = {}
+ return
+
+# ANYTHING BELOW THIS POINT IS NOT YET IMPLEMENTED IN __init__()
+ self.nocapitalize = self.__code in self.family.nocapitalize
+ # Calculating valid languages took quite long, so we calculate it once
+ # in initialization instead of each time it is used.
+ self._validlanguages = []
+ for language in self.languages():
+ if not language[:1].upper() + language[1:] in self.namespaces():
+ self._validlanguages.append(language)
+
+ def logged_in(self, sysop=False):
+        """Return True if logged in with specified privileges, otherwise False.
+
+ @param sysop: if True, require sysop privileges.
+
+ """
+ if self.userinfo['name'] != self._username[sysop]:
+ return False
+ return (not sysop) or 'sysop' in self.userinfo['groups']
+
+ def loggedInAs(self, sysop = False):
+        """Return the current username if logged in, otherwise return None.
+
+ DEPRECATED (use .user() method instead)
+
+ """
+ logger.debug("Site.loggedInAs() method is deprecated.")
+ return self.logged_in(sysop) and self.user()
+
+ def login(self, sysop=False):
+ """Log the user in if not already logged in."""
+ if not hasattr(self, "_siteinfo"):
+ self._getsiteinfo()
+ # check whether a login cookie already exists for this user
+ if hasattr(self, "_userinfo"):
+ if self.userinfo['name'] == self._username[sysop]:
+ return
+ if not self.logged_in(sysop):
+ loginMan = api.LoginManager(site=self, sysop=sysop,
+ user=self._username[sysop])
+ if loginMan.login(retry = True):
+ self._username[sysop] = loginMan.username
+ if hasattr(self, "_userinfo"):
+ del self._userinfo
+ self.getuserinfo()
+
+ forceLogin = login # alias for backward-compatibility
+
+ def getuserinfo(self):
+ """Retrieve userinfo from site and store in _userinfo attribute.
+
+ self._userinfo will be a dict with the following keys and values:
+
+ - id: user id (numeric str)
+ - name: username (if user is logged in)
+ - anon: present if user is not logged in
+ - groups: list of groups (could be empty)
+ - rights: list of rights (could be empty)
+ - message: present if user has a new message on talk page
+ - blockinfo: present if user is blocked (dict)
+
+ """
+ if (not hasattr(self, "_userinfo")
+ or "rights" not in self._userinfo
+ or self._userinfo['name']
+                   != self._username["sysop" in self._userinfo["groups"]]):
+ uirequest = api.Request(
+ site=self,
+ action="query",
+ meta="userinfo",
+ uiprop="blockinfo|hasmsg|groups|rights"
+ )
+ uidata = uirequest.submit()
+ assert 'query' in uidata, \
+ "API userinfo response lacks 'query' key"
+ assert 'userinfo' in uidata['query'], \
+ "API userinfo response lacks 'userinfo' key"
+ self._userinfo = uidata['query']['userinfo']
+ return self._userinfo
+
+ userinfo = property(fget=getuserinfo, doc=getuserinfo.__doc__)
+
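
A short sketch of reading the cached userinfo once it has been fetched; the key names are those listed in the docstring above:

    info = site.userinfo                  # first access triggers the API query
    print u"logged in as %s" % info.get('name')
    if 'bot' in info.get('groups', []):
        print u"account has the bot flag"
    if 'blockinfo' in info:
        print u"account is currently blocked"
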
+ def is_blocked(self, sysop=False):
+ """Return true if and only if user is blocked.
+
+ @param sysop: If true, log in to sysop account (if available)
+
+ """
+ if not self.logged_in(sysop):
+ self.login(sysop)
+ return 'blockinfo' in self._userinfo
+
+ def isBlocked(self, sysop=False):
+ """Deprecated synonym for is_blocked"""
+ logger.debug(
+            "Site method 'isBlocked' should be changed to 'is_blocked'")
+ return self.is_blocked(sysop)
+
+ def checkBlocks(self, sysop = False):
+        """Check if the user is blocked, and raise an exception if so."""
+ if self.is_blocked(sysop):
+ # User blocked
+ raise UserBlocked('User is blocked in site %s' % self)
+
+ def has_right(self, right, sysop=False):
+ """Return true if and only if the user has a specific right.
+
+ Possible values of 'right' may vary depending on wiki settings,
+ but will usually include:
+
+ * Actions: edit, move, delete, protect, upload
+ * User levels: autoconfirmed, sysop, bot
+
+ """
+ if not self.logged_in(sysop):
+ self.login(sysop)
+ return right.lower() in self._userinfo['rights']
+
+ def isAllowed(self, right, sysop=False):
+        """Deprecated; retained for backwards-compatibility"""
+ logger.debug("Site.isAllowed() method is deprecated; use has_right()")
+ return self.has_right(right, sysop)
+
+ def has_group(self, group, sysop=False):
+        """Return true if and only if the user is a member of specified group.
+
+ Possible values of 'group' may vary depending on wiki settings,
+ but will usually include bot.
+
+ """
+ if not self.logged_in(sysop):
+ self.login(sysop)
+ return group.lower() in self._userinfo['groups']
+
+ def messages(self, sysop=False):
+        """Returns true if the user has new messages, and false otherwise."""
+ if not self.logged_in(sysop):
+ self.login(sysop)
+ return 'hasmsg' in self._userinfo
+
+ def mediawiki_message(self, key):
+        """Return the MediaWiki message text for key "key" """
+ if not key in self._msgcache:
+ msg_query = api.QueryGenerator(site=self, meta="allmessages",
+ amfilter=key)
+ for msg in msg_query:
+ if msg['name'] == key and not 'missing' in msg:
+ self._msgcache[key] = msg['*']
+ break
+ else:
+                raise KeyError("Site %(self)s has no message '%(key)s'"
+ % locals())
+ return self._msgcache[key]
+
+ def has_mediawiki_message(self, key):
+        """Return True iff this site defines a MediaWiki message for 'key'."""
+ try:
+ v = self.mediawiki_message(key)
+ return True
+ except KeyError:
+ return False
+
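
An illustrative lookup against the message cache; 'mainpage' is a standard MediaWiki message key, while the missing key is made up:

    if site.has_mediawiki_message("mainpage"):
        main_page_name = site.mediawiki_message("mainpage")
    try:
        site.mediawiki_message("no-such-message")
    except KeyError:
        pass    # undefined messages raise KeyError
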
+ def getcurrenttimestamp(self):
+        """Return (MediaWiki) timestamp, {{CURRENTTIMESTAMP}}, the server time.
+
+ Format is yyyymmddhhmmss
+
+ """
+ r = api.Request(site=self,
+ action="parse",
+ text="{{CURRENTTIMESTAMP}}")
+ result = r.submit()
+        return re.search('\d+', result['parse']['text']['*']).group()
+
+ def _getsiteinfo(self):
+ """Retrieve siteinfo and namespaces from site."""
+ sirequest = api.Request(
+ site=self,
+ action="query",
+ meta="siteinfo",
+ siprop="general|namespaces|namespacealiases"
+ )
+ try:
+ sidata = sirequest.submit()
+ except api.APIError:
+ # hack for older sites that don't support 1.12 properties
+ # probably should delete if we're not going to support pre-1.12
+ sirequest = api.Request(
+ site=self,
+ action="query",
+ meta="siteinfo",
+ siprop="general|namespaces"
+ )
+ sidata = sirequest.submit()
+
+ assert 'query' in sidata, \
+ "API siteinfo response lacks 'query' key"
+ sidata = sidata['query']
+ assert 'general' in sidata, \
+ "API siteinfo response lacks 'general' key"
+ assert 'namespaces' in sidata, \
+ "API siteinfo response lacks 'namespaces' key"
+ self._siteinfo = sidata['general']
+ nsdata = sidata['namespaces']
+ for nskey in nsdata:
+ ns = int(nskey)
+ if ns in self._namespaces:
+ if nsdata[nskey]["*"] in self._namespaces[ns]:
+ continue
+ # this is the preferred form so it goes at front of list
+ self._namespaces[ns].insert(0, nsdata[nskey]["*"])
+ else:
+ self._namespaces[ns] = [nsdata[nskey]["*"]]
+ if 'namespacealiases' in sidata:
+ aliasdata = sidata['namespacealiases']
+ for item in aliasdata:
+ if item["*"] in self._namespaces[int(item['id'])]:
+ continue
+ # this is a less preferred form so it goes at the end
+ self._namespaces[int(item['id'])].append(item["*"])
+
+ @property
+ def siteinfo(self):
+ """Site information dict."""
+
+ if not hasattr(self, "_siteinfo"):
+ self._getsiteinfo()
+ return self._siteinfo
+
+ def case(self):
+ """Return this site's capitalization rule."""
+
+ return self.siteinfo['case']
+
+ def language(self):
+        """Return the code for the language of this Site."""
+
+ return self.siteinfo['lang']
+
+ lang = property(fget=language, doc=language.__doc__)
+
+ def namespaces(self):
+        """Return dict of valid namespaces on this wiki."""
+
+ if not hasattr(self, "_siteinfo"):
+ self._getsiteinfo()
+ return self._namespaces
+
+ def namespace(self, num, all=False):
+        """Return string containing local name of namespace 'num'.
+
+ If optional argument 'all' is true, return a list of all recognized
+ values for this namespace.
+
+ """
+ if all:
+ return self.namespaces()[num]
+ return self.namespaces()[num][0]
+
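
A quick sketch of the namespace helpers, assuming the default English names before any localized siteinfo has been merged in:

    site.namespace(10)             # u"Template"
    site.namespace(14, all=True)   # [u"Category"] plus any localized aliases
    site.ns_index(u"category")     # 14 (lookup is case-insensitive)
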
+ def live_version(self):
+        """Return the 'real' version number found on [[Special:Version]]
+
+ Return value is a tuple (int, int, str) of the major and minor
+ version numbers and any other text contained in the version.
+
+ """
+ versionstring = self.siteinfo['generator']
+ m = re.match(r"^MediaWiki ([0-9]+)\.([0-9]+)(.*)$", versionstring)
+ if m:
+ return (int(m.group(1)), int(m.group(2)), m.group(3))
+ else:
+ return None
+
+ def loadpageinfo(self, page):
+        """Load page info from api and save in page attributes"""
+ title = page.title(withSection=False)
+ query = api.PropertyGenerator("info", site=self,
+ titles=title.encode(self.encoding()),
+ inprop="protection")
+ for pageitem in query:
+ if pageitem['title'] != title:
+ raise Error(
+ u"loadpageinfo: Query on %s returned data on '%s'"
+ % (page, pageitem['title']))
+ api.update_page(page, pageitem)
+
+ def loadimageinfo(self, page, history=False):
+ """Load image info from api and save in page attributes
+
+ @param history: if true, return the image's version history
+
+ """
+ title = page.title(withSection=False)
+ query = api.PropertyGenerator("imageinfo", site=self,
+ titles=title.encode(self.encoding()),
+                                      iiprop=["timestamp", "user", "comment",
+                                              "url", "size", "sha1", "mime",
+                                              "metadata", "archivename"])
+ if history:
+ query.request["iilimit"] = "max"
+ for pageitem in query:
+ if pageitem['title'] != title:
+ raise Error(
+ u"loadpageinfo: Query on %s returned data on '%s'"
+ % (page, pageitem['title']))
+ api.update_page(page, pageitem)
+ if history:
+ return pageitem['imageinfo']
+
+ def page_exists(self, page):
+        """Return True if and only if page is an existing page on site."""
+ if not hasattr(page, "_pageid"):
+ self.loadpageinfo(page)
+ return page._pageid > 0
+
+ def page_restrictions(self, page):
+        """Returns a dictionary reflecting page protections"""
+ if not self.page_exists(page):
+ raise NoPage(u'No page %s.' % page)
+ if not hasattr(page, "_protection"):
+ self.loadpageinfo(page)
+ return page._protection
+
+ def page_can_be_edited(self, page):
+ """
+ Returns True if and only if:
+ - page is unprotected, and bot has an account for this site, or
+ - page is protected, and bot has a sysop account for this site.
+
+ """
+ rest = self.page_restrictions(page)
+        sysop_protected = rest.has_key('edit') and rest['edit'][0] == 'sysop'
+ try:
+ api.LoginManager(site=self, sysop=sysop_protected)
+ except NoUsername:
+ return False
+ return True
+
+ def page_isredirect(self, page):
+        """Return True if and only if page is a redirect."""
+ if not hasattr(page, "_redir"):
+ self.loadpageinfo(page)
+ return bool(page._redir)
+
+ def getredirtarget(self, page):
+        """Return Page object for the redirect target of page."""
+ if not hasattr(page, "_redir"):
+ self.loadpageinfo(page)
+ if not page._redir:
+ raise pywikibot.IsNotRedirectPage(page.title())
+ title = page.title(withSection=False)
+        query = api.Request(site=self, action="query", property="info",
+ inprop="protection|talkid|subjectid",
+ titles=title.encode(self.encoding()),
+ redirects="")
+ result = query.submit()
+        if "query" not in result or "redirects" not in result["query"]:
+ raise RuntimeError(
+ "getredirtarget: No 'redirects' found for page %s."
+ % title)
+ redirmap = dict((item['from'], item['to'])
+ for item in result['query']['redirects'])
+ if title not in redirmap:
+ raise RuntimeError(
+                "getredirtarget: 'redirects' contains no key for page %s."
+ % title)
+ if "pages" not in result['query']:
+ # no "pages" element indicates a circular redirect
+ raise pywikibot.CircularRedirect(redirmap[title])
+ for pagedata in result['query']['pages'].values():
+ # there should be only one value in 'pages', and it is the target
+ if pagedata['title'] not in redirmap.values():
+ raise RuntimeError(
+                    "getredirtarget: target page '%s' not found in 'redirects'"
+ % pagedata['title'])
+            target = pywikibot.Page(self, pagedata['title'], pagedata['ns'])
+ api.update_page(target, pagedata)
+ page._redir = target
+
+ def preloadpages(self, pagelist, groupsize=60):
+ """Return a generator to a list of preloaded pages.
+
+ Note that [at least in current implementation] pages may be iterated
+ in a different order than in the underlying pagelist.
+
+ @param pagelist: an iterable that returns Page objects
+ @param groupsize: how many Pages to query at a time
+ @type groupsize: int
+
+ """
+ from pywikibot.tools import itergroup
+ for sublist in itergroup(pagelist, groupsize):
+ pageids = [str(p._pageid) for p in sublist
+ if hasattr(p, "_pageid")
+ and p._pageid > 0]
+ cache = dict((p.title(withSection=False), p) for p in sublist)
+ rvgen = api.PropertyGenerator("revisions|info", site=self)
+ rvgen.limit = -1
+ if len(pageids) == len(sublist):
+ # only use pageids if all pages have them
+ rvgen.request["pageids"] = "|".join(pageids)
+ else:
+ rvgen.request["titles"] = "|".join(cache.keys())
+ rvgen.request[u"rvprop"] = \
+ u"ids|flags|timestamp|user|comment|content"
+ logger.info(u"Retrieving %s pages from %s."
+ % (len(cache), self)
+ )
+ for pagedata in rvgen:
+ logger.debug("Preloading %s" % pagedata)
+ try:
+ if pagedata['title'] not in cache:
+ raise Error(
+                            u"preloadpages: Query returned unexpected title '%s'"
+ % pagedata['title']
+ )
+ except KeyError:
+ logger.debug("No 'title' in %s" % pagedata)
+ logger.debug("pageids=%s" % pageids)
+ logger.debug("titles=%s" % cache.keys())
+ continue
+ page = cache[pagedata['title']]
+ api.update_page(page, pagedata)
+ yield page
+
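
A sketch of batching pages through preloadpages; the titles and the per-page handler are placeholders:

    pages = [pywikibot.Page(site, t)
             for t in (u"Example A", u"Example B", u"Example C")]
    for page in site.preloadpages(pages, groupsize=50):
        # revision text and page info are now cached on the Page object,
        # so handling it should not need another API round-trip per page
        handle(page)
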
+ def token(self, page, tokentype):
+        """Return token retrieved from wiki to allow changing page content.
+
+ @param page: the Page for which a token should be retrieved
+        @param tokentype: the type of token (e.g., "edit", "move", "delete");
+ see API documentation for full list of types
+
+ """
+ query = api.PropertyGenerator("info|revisions", site=self,
+ titles=page.title(withSection=False),
+ intoken=tokentype)
+ for item in query:
+ if item['title'] != page.title(withSection=False):
+ raise Error(
+ u"token: Query on page %s returned data on page [[%s]]"
+ % (page.title(withSection=False, asLink=True),
+ item['title']))
+ api.update_page(page, item)
+ logging.debug(str(item))
+ return item[tokentype + "token"]
+
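
A minimal sketch of fetching tokens ahead of write operations (page is a placeholder):

    edittoken = site.token(page, "edit")
    movetoken = site.token(page, "move")
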
+ # following group of methods map more-or-less directly to API queries
+
+ def pagebacklinks(self, page, followRedirects=False, filterRedirects=None,
+ namespaces=None):
+ """Iterate all pages that link to the given page.
+
+ @param page: The Page to get links to.
+ @param followRedirects: Also return links to redirects pointing to
+ the given page.
+ @param filterRedirects: If True, only return redirects to the given
+ page. If False, only return non-redirect links. If None, return
+ both (no filtering).
+ @param namespaces: If present, only return links from the namespaces
+ in this list.
+
+ """
+ bltitle = page.title(withSection=False).encode(self.encoding())
+ blgen = api.PageGenerator("backlinks", gbltitle=bltitle, site=self)
+ if isinstance(namespaces, list):
+ blgen.request["gblnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ blgen.request["gblnamespace"] = str(namespaces)
+ if filterRedirects is not None:
+            blgen.request["gblfilterredir"] = filterRedirects and "redirects"\
+                                                              or "nonredirects"
+ if followRedirects:
+            # bug: see http://bugzilla.wikimedia.org/show_bug.cgi?id=16218
+ # links identified by MediaWiki as redirects may not really be,
+ # so we have to check each "redirect" page and see if it
+ # really redirects to this page
+ blgen.request["gblfilterredir"] = "nonredirects"
+ redirgen = api.PageGenerator("backlinks", gbltitle=bltitle,
+                                         site=self, gblfilterredir="redirects")
+ if "gblnamespace" in blgen.request:
+                redirgen.request["gblnamespace"] = blgen.request["gblnamespace"]
+ genlist = [blgen]
+ for redir in redirgen:
+ if redir.getRedirectTarget() == page:
+ genlist.append(
+ self.pagebacklinks(
+ redir, True, None, namespaces))
+ import itertools
+ return itertools.chain(*genlist)
+ return blgen
+
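
A sketch combining the backlink options documented above; it lists main-namespace pages that link to the given page, including links that arrive through redirects:

    for linking in site.pagebacklinks(page, followRedirects=True,
                                      namespaces=[0]):
        print linking.title()
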
+ def page_embeddedin(self, page, filterRedirects=None, namespaces=None):
+ """Iterate all pages that embedded the given page as a template.
+
+ @param page: The Page to get inclusions for.
+ @param filterRedirects: If True, only return redirects that embed
+ the given page. If False, only return non-redirect links. If
+ None, return both (no filtering).
+ @param namespaces: If present, only return links from the namespaces
+ in this list.
+
+ """
+ eititle = page.title(withSection=False).encode(self.encoding())
+ eigen = api.PageGenerator("embeddedin", geititle=eititle, site=self)
+ if isinstance(namespaces, list):
+ eigen.request["geinamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ eigen.request["geinamespace"] = str(namespaces)
+ if filterRedirects is not None:
+            eigen.request["geifilterredir"] = filterRedirects and "redirects"\
+                                                              or "nonredirects"
+ return eigen
+
+ def pagereferences(self, page, followRedirects=False, filterRedirects=None,
+ withTemplateInclusion=True, onlyTemplateInclusion=False,
+ namespaces=None):
+        """Convenience method combining pagebacklinks and page_embeddedin."""
+
+ if onlyTemplateInclusion:
+ return self.page_embeddedin(page, namespaces=namespaces)
+ if not withTemplateInclusion:
+ return self.pagebacklinks(page, followRedirects,
+ namespaces=namespaces)
+ import itertools
+ return itertools.chain(
+ self.pagebacklinks(page, followRedirects,
+ filterRedirects, namespaces=namespaces),
+ self.page_embeddedin(page, filterRedirects,
+ namespaces=namespaces)
+ )
+
+ def pagelinks(self, page, namespaces=None, follow_redirects=False,
+ limit=None):
+ """Iterate internal wikilinks contained (or transcluded) on page.
+
+ @param namespaces: Only iterate pages in these namespaces (default: all)
+ @type namespaces: list of ints
+ @param follow_redirects: if True, yields the target of any redirects,
+ rather than the redirect page
+
+ """
+ plgen = api.PageGenerator("links", site=self)
+ if isinstance(limit, int):
+ plgen.limit = limit
+ if hasattr(page, "_pageid"):
+ plgen.request['pageids'] = str(page._pageid)
+ else:
+ pltitle = page.title(withSection=False).encode(self.encoding())
+ plgen.request['titles'] = pltitle
+ if follow_redirects:
+ plgen.request['redirects'] = ''
+ if isinstance(namespaces, list):
+ plgen.request["gplnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ plgen.request["gplnamespace"] = str(namespaces)
+ return plgen
+
+    @deprecate_arg("withSortKey", None) # Sortkey doesn't work with generator
+ def pagecategories(self, page, withSortKey=None):
+ """Iterate categories to which page belongs."""
+
+ clgen = api.CategoryPageGenerator("categories", site=self)
+ if hasattr(page, "_pageid"):
+ clgen.request['pageids'] = str(page._pageid)
+ else:
+ cltitle = page.title(withSection=False).encode(self.encoding())
+ clgen.request['titles'] = cltitle
+ return clgen
+
+ def pageimages(self, page):
+        """Iterate images used (not just linked) on the page."""
+
+ imtitle = page.title(withSection=False).encode(self.encoding())
+ imgen = api.ImagePageGenerator("images", titles=imtitle, site=self)
+ return imgen
+
+ def pagetemplates(self, page, namespaces=None):
+        """Iterate templates transcluded (not just linked) on the page."""
+
+ tltitle = page.title(withSection=False).encode(self.encoding())
+ tlgen = api.PageGenerator("templates", titles=tltitle, site=self)
+ if isinstance(namespaces, list):
+ tlgen.request["gtlnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ tlgen.request["gtlnamespace"] = str(namespaces)
+ return tlgen
+
+ def categorymembers(self, category, namespaces=None, limit=None):
+ """Iterate members of specified category.
+
+ @param category: The Category to iterate.
+ @param namespaces: If present, only return category members from
+ these namespaces. For example, use namespaces=[14] to yield
+ subcategories, use namespaces=[6] to yield image files, etc. Note,
+ however, that the iterated values are always Page objects, even
+ if in the Category or Image namespace.
+ @type namespaces: list of ints
+ @param limit: maximum number of pages to iterate (default: all)
+ @type limit: int
+
+ """
+ if category.namespace() != 14:
+ raise Error(
+ u"categorymembers: non-Category page '%s' specified"
+ % category.title())
+ cmtitle = category.title(withSection=False).encode(self.encoding())
+ cmgen = api.PageGenerator("categorymembers", gcmtitle=cmtitle,
+ gcmprop="ids|title|sortkey", site=self)
+ if isinstance(namespaces, list):
+ cmgen.request["gcmnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ cmgen.request["gcmnamespace"] = str(namespaces)
+ if isinstance(limit, int):
+ cmgen.limit = limit
+ return cmgen
+
+ def loadrevisions(self, page=None, getText=False, revids=None,
+ limit=None, startid=None, endid=None, starttime=None,
+ endtime=None, rvdir=None, user=None, excludeuser=None,
+ section=None, sysop=False):
+ """Retrieve and store revision information.
+
+ By default, retrieves the last (current) revision of the page,
+ I{unless} any of the optional parameters revids, startid, endid,
+ starttime, endtime, rvdir, user, excludeuser, or limit are
+ specified. Unless noted below, all parameters not specified
+ default to False.
+
+ If rvdir is False or not specified, startid must be greater than
+ endid if both are specified; likewise, starttime must be greater
+ than endtime. If rvdir is True, these relationships are reversed.
+
+ @param page: retrieve revisions of this Page (required unless ids
+ is specified)
+ @param getText: if True, retrieve the wiki-text of each revision;
+ otherwise, only retrieve the revision metadata (default)
+ @param section: if specified, retrieve only this section of the text
+ (getText must be True); section must be given by number (top of
+ the article is section 0), not name
+ @type section: int
+ @param revids: retrieve only the specified revision ids (required
+ unless page is specified)
+ @type revids: list of ints
+ @param limit: Retrieve no more than this number of revisions
+ @type limit: int
+ @param startid: retrieve revisions starting with this revid
+ @param endid: stop upon retrieving this revid
+ @param starttime: retrieve revisions starting at this timestamp
+ @param endtime: stop upon reaching this timestamp
+ @param rvdir: if false, retrieve newest revisions first (default);
+ if true, retrieve earliest first
+ @param user: retrieve only revisions authored by this user
+ @param excludeuser: retrieve all revisions not authored by this user
+ @param sysop: if True, switch to sysop account (if available) to
+ retrieve this page
+
+ """
+ latest = (revids is None and
+ startid is None and
+ endid is None and
+ starttime is None and
+ endtime is None and
+ rvdir is None and
+ user is None and
+ excludeuser is None and
+ limit is None) # if True, we are retrieving current revision
+
+ # check for invalid argument combinations
+ if page is None and revids is None:
+ raise ValueError(
+ "loadrevisions: either page or revids argument required")
+ if (startid is not None or endid is not None) and \
+ (starttime is not None or endtime is not None):
+ raise ValueError(
+                "loadrevisions: startid/endid combined with starttime/endtime")
+ if starttime is not None and endtime is not None:
+ if rvdir and starttime >= endtime:
+ raise ValueError(
+ "loadrevisions: starttime > endtime with rvdir=True")
+ if (not rvdir) and endtime >= starttime:
+ raise ValueError(
+ "loadrevisions: endtime > starttime with rvdir=False")
+ if startid is not None and endid is not None:
+ if rvdir and startid >= endid:
+ raise ValueError(
+ "loadrevisions: startid > endid with rvdir=True")
+ if (not rvdir) and endid >= startid:
+ raise ValueError(
+ "loadrevisions: endid > startid with rvdir=False")
+
+ # assemble API request
+ if revids is None:
+ rvtitle = page.title(withSection=False).encode(self.encoding())
+ rvgen = api.PropertyGenerator(u"info|revisions", titles=rvtitle,
+ site=self)
+ else:
+ if isinstance(revids, (int, basestring)):
+ ids = unicode(revids)
+ else:
+ ids = u"|".join(unicode(r) for r in revids)
+ rvgen = api.PropertyGenerator(u"info|revisions", revids=ids,
+ site=self)
+ if getText:
+ rvgen.request[u"rvprop"] = \
+ u"ids|flags|timestamp|user|comment|content"
+ if section is not None:
+ rvgen.request[u"rvsection"] = unicode(section)
+ if latest or "revids" in rvgen.request:
+ rvgen.limit = -1 # suppress use of rvlimit parameter
+ elif isinstance(limit, int):
+ rvgen.limit = limit
+ if rvdir:
+ rvgen.request[u"rvdir"] = u"newer"
+ elif rvdir is not None:
+ rvgen.request[u"rvdir"] = u"older"
+ if startid:
+ rvgen.request[u"rvstartid"] = startid
+ if endid:
+ rvgen.request[u"rvendid"] = endid
+ if starttime:
+ rvgen.request[u"rvstart"] = starttime
+ if endtime:
+ rvgen.request[u"rvend"] = endtime
+ if user:
+ rvgen.request[u"rvuser"] = user
+ elif excludeuser:
+ rvgen.request[u"rvexcludeuser"] = excludeuser
+ # TODO if sysop: something
+ rvgen.continuekey = "revisions"
+ for pagedata in rvgen:
+ if page is not None:
+ if pagedata['title'] != page.title(withSection=False):
+ raise Error(
+                        u"loadrevisions: Query on %s returned data on '%s'"
+ % (page, pagedata['title']))
+ if pagedata.has_key('missing'):
+ raise NoPage(u'Page %s does not exist'
+ % page.title(asLink=True))
+ else:
+                page = pywikibot.Page(self, pagedata['title'])
+ api.update_page(page, pagedata)
+
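
Two illustrative calls showing how the option combinations above are intended to be used (the username is made up); the retrieved revisions end up cached on the Page object via api.update_page(), as in the loop above:

    site.loadrevisions(page, limit=20)                # newest 20, metadata only
    site.loadrevisions(page, getText=True, rvdir=True,
                       user=u"ExampleUser", limit=5)  # earliest 5 by one author
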
+ def pageinterwiki(self, page):
+ # No such function in the API (this method isn't called anywhere)
+ raise NotImplementedError
+
+ def pagelanglinks(self, page):
+ """Iterate all interlanguage links on page, yielding Link
objects."""
+ lltitle = page.title(withSection=False)
+ llquery = api.PropertyGenerator("langlinks",
+ titles=lltitle.encode(self.encoding()),
+ site=self)
+ for pageitem in llquery:
+ if pageitem['title'] != lltitle:
+ raise Error(
+ u"getlanglinks: Query on %s returned data on '%s'"
+ % (page, pageitem['title']))
+ if 'langlinks' not in pageitem:
+ continue
+ for linkdata in pageitem['langlinks']:
+ yield pywikibot.Link(linkdata['*'],
+ source=pywikibot.Site(linkdata['lang']))
+
+ def page_extlinks(self, page):
+ """Iterate all external links on page, yielding URL
strings."""
+ eltitle = page.title(withSection=False)
+ elquery = api.PropertyGenerator("extlinks",
+ titles=eltitle.encode(self.encoding()),
+ site=self)
+ for pageitem in elquery:
+ if pageitem['title'] != eltitle:
+ raise RuntimeError(
+ "getlanglinks: Query on %s returned data on '%s'"
+ % (page, pageitem['title']))
+ if 'extlinks' not in pageitem:
+ continue
+ for linkdata in pageitem['extlinks']:
+ yield linkdata['*']
+
+ @deprecate_arg("throttle", None)
+ @deprecate_arg("includeredirects", "filterredir")
+    def allpages(self, start="!", prefix="", namespace=0, filterredir=None,
+ filterlanglinks=None, minsize=None, maxsize=None,
+ protect_type=None, protect_level=None, limit=None,
+ reverse=False, includeredirects=None):
+ """Iterate pages in a single namespace.
+
+ Note: parameters includeRedirects and throttle are deprecated and
+ included only for backwards compatibility.
+
+ @param start: Start at this title (page need not exist).
+ @param prefix: Only yield pages starting with this string.
+ @param namespace: Iterate pages from this (single) namespace
+ (default: 0)
+ @param filterredir: if True, only yield redirects; if False (and not
+ None), only yield non-redirects (default: yield both)
+ @param filterlanglinks: if True, only yield pages with language links;
+ if False (and not None), only yield pages without language links
+ (default: yield both)
+ @param minsize: if present, only yield pages at least this many
+ bytes in size
+ @param maxsize: if present, only yield pages at most this many bytes
+ in size
+ @param protect_type: only yield pages that have a protection of the
+ specified type
+ @type protect_type: str
+ @param protect_level: only yield pages that have protection at this
+ level; can only be used if protect_type is specified
+ @param limit: maximum number of pages to iterate (default: iterate
+ all pages in namespace)
+        @param reverse: if True, iterate in reverse Unicode lexicographic
+            order (default: iterate in forward order)
+        @param includeredirects: DEPRECATED, use filterredir instead
+
+ """
+ if not isinstance(namespace, int):
+ raise Error("allpages: only one namespace permitted.")
+ if includeredirects is not None:
+ logger.debug(
+"allpages: 'includeRedirects' argument is deprecated; use
'filterredirs'.")
+            if includeredirects:
+                if includeredirects == "only":
+                    filterredir = True
+                else:
+                    filterredir = None
+            else:
+                filterredir = False
+
+ apgen = api.PageGenerator("allpages", gapnamespace=str(namespace),
+ gapfrom=start, site=self)
+ if prefix:
+ apgen.request["gapprefix"] = prefix
+ if filterredir is not None:
+ apgen.request["gapfilterredir"] = (filterredir
+ and "redirects"
+ or "nonredirects")
+ if filterlanglinks is not None:
+ apgen.request["gapfilterlanglinks"] = (filterlanglinks
+ and "withlanglinks"
+ or "withoutlanglinks")
+ if isinstance(minsize, int):
+ apgen.request["gapminsize"] = str(minsize)
+ if isinstance(maxsize, int):
+ apgen.request["gapmaxsize"] = str(maxsize)
+ if isinstance(protect_type, basestring):
+ apgen.request["gapprtype"] = protect_type
+ if isinstance(protect_level, basestring):
+ apgen.request["gapprlevel"] = protect_level
+ if isinstance(limit, int):
+ apgen.limit = limit
+ if reverse:
+ apgen.request["gapdir"] = "descending"
+ return apgen
+
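
A sketch of the allpages generator with a few of the filters described above: the first 100 non-redirect Template pages of at least 2048 bytes:

    for page in site.allpages(namespace=10, filterredir=False,
                              minsize=2048, limit=100):
        print page.title()
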
+ def prefixindex(self, prefix, namespace=0, includeredirects=True):
+ """Yield all pages with a given prefix. Deprecated.
+
+ Use allpages() with the prefix= parameter instead of this method.
+
+ """
+        logger.debug("Site.prefixindex() is deprecated; use allpages instead.")
+ return self.allpages(prefix=prefix, namespace=namespace,
+ includeredirects=includeredirects)
+
+
+    def alllinks(self, start="!", prefix="", namespace=0, unique=False,
+ limit=None, fromids=False):
+        """Iterate all links to pages (which need not exist) in one namespace.
+
+ Note that, in practice, links that were found on pages that have
+ been deleted may not have been removed from the links table, so this
+ method can return false positives.
+
+ @param start: Start at this title (page need not exist).
+ @param prefix: Only yield pages starting with this string.
+ @param namespace: Iterate pages from this (single) namespace
+ (default: 0)
+ @param unique: If True, only iterate each link title once (default:
+ iterate once for each linking page)
+ @param limit: maximum number of pages to iterate (default: iterate
+ all pages in namespace)
+ @param fromids: if True, include the pageid of the page containing
+ each link (default: False) as the '_fromid' attribute of the Page;
+ cannot be combined with unique
+
+ """
+ if unique and fromids:
+ raise Error("alllinks: unique and fromids cannot both be True.")
+ if not isinstance(namespace, int):
+ raise Error("alllinks: only one namespace permitted.")
+ algen = api.ListGenerator("alllinks", alnamespace=str(namespace),
+ alfrom=start, site=self)
+ if prefix:
+ algen.request["alprefix"] = prefix
+ if isinstance(limit, int):
+ algen.limit = limit
+ if unique:
+ algen.request["alunique"] = ""
+ if fromids:
+ algen.request["alprop"] = "title|ids"
+ for link in algen:
+ p = pywikibot.Page(self, link['title'], link['ns'])
+ if fromids:
+ p._fromid = link['fromid']
+ yield p
+
+ def allcategories(self, start="!", prefix="", limit=None,
+ reverse=False):
+ """Iterate categories used (which need not have a Category page).
+
+ Iterator yields Category objects. Note that, in practice, links that
+ were found on pages that have been deleted may not have been removed
+ from the database table, so this method can return false positives.
+
+ @param start: Start at this category title (category need not exist).
+ @param prefix: Only yield categories starting with this string.
+ @param limit: maximum number of categories to iterate (default:
+ iterate all)
+        @param reverse: if True, iterate in reverse Unicode lexicographic
+ order (default: iterate in forward order)
+
+ """
+ acgen = api.CategoryPageGenerator("allcategories",
+ gacfrom=start, site=self)
+ if prefix:
+ acgen.request["gacprefix"] = prefix
+ if isinstance(limit, int):
+ acgen.limit = limit
+ if reverse:
+ acgen.request["gacdir"] = "descending"
+ return acgen
+
+ def categories(self, number=10, repeat=False):
+        """Deprecated; retained for backwards-compatibility"""
+ logger.debug(
+ "Site.categories() method is deprecated; use .allcategories()")
+ if repeat:
+ limit = None
+ else:
+ limit = number
+ return self.allcategories(limit=limit)
+
+    def allusers(self, start="!", prefix="", limit=None, group=None):
+ """Iterate registered users, ordered by username.
+
+ Iterated values are dicts containing 'name', 'editcount',
+        'registration', and (sometimes) 'groups' keys. 'groups' will be
+ present only if the user is a member of at least 1 group, and will
+ be a list of unicodes; all the other values are unicodes and should
+ always be present.
+
+ @param start: start at this username (name need not exist)
+ @param prefix: only iterate usernames starting with this substring
+ @param limit: maximum number of users to iterate (default: all)
+ @param group: only iterate users that are members of this group
+ @type group: str
+
+ """
+ augen = api.ListGenerator("allusers", aufrom=start,
+ auprop="editcount|groups|registration",
+ site=self)
+ if prefix:
+ augen.request["auprefix"] = prefix
+ if group:
+ augen.request["augroup"] = group
+ if isinstance(limit, int):
+ augen.limit = limit
+ return augen
+
+    def allimages(self, start="!", prefix="", minsize=None, maxsize=None,
+ limit=None, reverse=False, sha1=None, sha1base36=None):
+ """Iterate all images, ordered by image title.
+
+ Yields ImagePages, but these pages need not exist on the wiki.
+
+ @param start: start at this title (name need not exist)
+ @param prefix: only iterate titles starting with this substring
+ @param limit: maximum number of titles to iterate (default: all)
+ @param minsize: only iterate images of at least this many bytes
+ @param maxsize: only iterate images of no more than this many bytes
+        @param reverse: if True, iterate in reverse lexicographic order
+ @param sha1: only iterate image (it is theoretically possible there
+ could be more than one) with this sha1 hash
+ @param sha1base36: same as sha1 but in base 36
+
+ """
+ aigen = api.ImagePageGenerator("allimages", gaifrom=start,
+ site=self)
+ if prefix:
+ aigen.request["gaiprefix"] = prefix
+ if isinstance(limit, int):
+ aigen.limit = limit
+ if isinstance(minsize, int):
+ aigen.request["gaiminsize"] = str(minsize)
+ if isinstance(maxsize, int):
+ aigen.request["gaimaxsize"] = str(maxsize)
+ if reverse:
+ aigen.request["gaidir"] = "descending"
+ if sha1:
+ aigen.request["gaisha1"] = sha1
+ if sha1base36:
+ aigen.request["gaisha1base36"] = sha1base36
+ return aigen
+
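Illustrative only, assuming an existing APISite object named site: iterate a few large images by title.

    # ImagePages need not exist on the wiki; minsize is in bytes
    for image in site.allimages(minsize=1000000, limit=5):
        print image.title()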
+ def blocks(self, starttime=None, endtime=None, reverse=False,
+ blockids=None, users=None, limit=None):
+ """Iterate all current blocks, in order of creation.
+
+ Note that logevents only logs user blocks, while this method
+ iterates all blocks including IP ranges. The iterator yields dicts
+ containing keys corresponding to the block properties (see
+        http://www.mediawiki.org/wiki/API:Query_-_Lists for documentation).
+
+ @param starttime: start iterating at this timestamp
+ @param endtime: stop iterating at this timestamp
+ @param reverse: if True, iterate oldest blocks first (default: newest)
+ @param blockids: only iterate blocks with these id numbers
+ @param users: only iterate blocks affecting these usernames or IPs
+ @param limit: maximum number of blocks to iterate (default: all)
+
+ """
+ if starttime and endtime:
+ if reverse:
+ if starttime > endtime:
+ raise pywikibot.Error(
+ "blocks: starttime must be before endtime with reverse=True")
+ else:
+ if endtime > starttime:
+ raise pywikibot.Error(
+ "blocks: endtime must be before starttime with reverse=False")
+ bkgen = api.ListGenerator("blocks", site=self)
+ bkgen.request["bkprop"] = \
+ "id|user|by|timestamp|expiry|reason|range|flags"
+ if starttime:
+ bkgen.request["bkstart"] = starttime
+ if endtime:
+ bkgen.request["bkend"] = endtime
+ if reverse:
+ bkgen.request["bkdir"] = "newer"
+ if blockids:
+ bkgen.request["bkids"] = blockids
+ if users:
+ bkgen.request["bkusers"] = users
+ if isinstance(limit, int):
+ bkgen.limit = limit
+ return bkgen
+
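A quick sketch of blocks(), assuming an existing APISite object named site; keys follow the bkprop list requested above, so .get() is used defensively:

    for block in site.blocks(limit=5):
        print block.get('user'), block.get('expiry')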
+ def exturlusage(self, url, protocol="http", namespaces=None,
+ limit=None):
+ """Iterate Pages that contain links to the given URL.
+
+ @param url: The URL to search for (without the protocol prefix);
+            this may include a '*' as a wildcard, only at the start of the
+ hostname
+ @param protocol: The protocol prefix (default: "http")
+ @param namespaces: Only iterate pages in these namespaces (default: all)
+ @type namespaces: list of ints
+ @param limit: Only iterate this many linking pages (default: all)
+
+ """
+ eugen = api.PageGenerator("exturlusage", geuquery=url,
+ geuprotocol=protocol, site=self)
+ if isinstance(namespaces, list):
+ eugen.request["geunamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ eugen.request["geunamespace"] = str(namespaces)
+ if isinstance(limit, int):
+ eugen.limit = limit
+ return eugen
+
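Usage sketch for exturlusage(); the hostname is hypothetical and site is an existing APISite object:

    # the wildcard is only allowed at the start of the hostname
    for page in site.exturlusage(u"*.example.org", namespaces=[0], limit=5):
        print page.title()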
+ def imageusage(self, image, namespaces=None, filterredir=None,
+ limit=None):
+ """Iterate Pages that contain links to the given ImagePage.
+
+ @param image: the image to search for (ImagePage need not exist on the wiki)
+ @type image: ImagePage
+ @param namespaces: Only iterate pages in these namespaces (default: all)
+ @type namespaces: list of ints
+ @param filterredir: if True, only yield redirects; if False (and not
+ None), only yield non-redirects (default: yield both)
+ @param limit: Only iterate this many linking pages (default: all)
+
+ """
+ iugen = api.PageGenerator("imageusage", site=self,
+ giutitle=image.title(withSection=False))
+ if isinstance(namespaces, list):
+ iugen.request["giunamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ iugen.request["giunamespace"] = str(namespaces)
+ if isinstance(limit, int):
+ iugen.limit = limit
+ if filterredir is not None:
+ iugen.request["giufilterredir"] = (filterredir and
"redirects"
+ or "nonredirects")
+ return iugen
+
+ def logevents(self, logtype=None, user=None, page=None,
+ start=None, end=None, reverse=False, limit=None):
+ """Iterate all log entries.
+
+ @param logtype: only iterate entries of this type (see wiki
+ documentation for available types, which will include "block",
+ "protect", "rights", "delete",
"upload", "move", "import",
+ "patrol", "merge")
+ @param user: only iterate entries that match this user name
+ @param page: only iterate entries affecting this page
+ @param start: only iterate entries from and after this timestamp
+ @param end: only iterate entries up to and through this timestamp
+ @param reverse: if True, iterate oldest entries first (default: newest)
+ @param limit: only iterate up to this many entries
+
+ """
+ if start and end:
+ if reverse:
+ if end < start:
+ raise Error(
+ "logevents: end must be later than start with reverse=True")
+ else:
+ if start < end:
+ raise Error(
+ "logevents: start must be later than end with
reverse=False")
+ legen = api.ListGenerator("logevents", site=self)
+ if logtype is not None:
+ legen.request["letype"] = logtype
+ if user is not None:
+ legen.request["leuser"] = user
+ if page is not None:
+ legen.request["letitle"] = page.title(withSection=False)
+ if start is not None:
+ legen.request["lestart"] = start
+ if end is not None:
+ legen.request["leend"] = end
+ if reverse:
+ legen.request["ledir"] = "newer"
+ if isinstance(limit, int):
+ legen.limit = limit
+ return legen
+
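A sketch of logevents(), assuming an existing APISite object named site; entries are the raw dicts yielded by the underlying ListGenerator:

    for entry in site.logevents(logtype="delete", limit=5):
        print entry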
+ def recentchanges(self, start=None, end=None, reverse=False, limit=None,
+ namespaces=None, pagelist=None, changetype=None,
+ showMinor=None, showBot=None, showAnon=None,
+ showRedirects=None, showPatrolled=None):
+ """Iterate recent changes.
+
+ @param start: timestamp to start listing from
+ @param end: timestamp to end listing at
+ @param reverse: if True, start with oldest changes (default: newest)
+ @param limit: iterate no more than this number of entries
+ @param namespaces: iterate changes to pages in these namespaces only
+ @type namespaces: list of ints
+ @param pagelist: iterate changes to pages in this list only
+        @type pagelist: list of Pages
+ @param changetype: only iterate changes of this type ("edit" for
+ edits to existing pages, "new" for new pages, "log" for
log
+ entries)
+ @param showMinor: if True, only list minor edits; if False (and not
+ None), only list non-minor edits
+ @param showBot: if True, only list bot edits; if False (and not
+ None), only list non-bot edits
+ @param showAnon: if True, only list anon edits; if False (and not
+ None), only list non-anon edits
+ @param showRedirects: if True, only list edits to redirect pages; if
+ False (and not None), only list edits to non-redirect pages
+ @param showPatrolled: if True, only list patrolled edits; if False
+ (and not None), only list non-patrolled edits
+
+ """
+ if start and end:
+ if reverse:
+ if end < start:
+ raise Error(
+ "recentchanges: end must be later than start with reverse=True")
+ else:
+ if start < end:
+ raise Error(
+ "recentchanges: start must be later than end with reverse=False")
+ rcgen = api.ListGenerator("recentchanges", site=self,
+ rcprop="user|comment|timestamp|title|ids"
+ "|redirect|patrolled|loginfo|flags")
+ if start is not None:
+ rcgen.request["rcstart"] = start
+ if end is not None:
+ rcgen.request["rcend"] = end
+ if reverse:
+ rcgen.request["rcdir"] = "newer"
+ if isinstance(limit, int):
+ rcgen.limit = limit
+ if isinstance(namespaces, list):
+ rcgen.request["rcnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ rcgen.request["rcnamespace"] = str(namespaces)
+ if pagelist:
+ rcgen.request["rctitles"] =
u"|".join(p.title(withSection=False)
+ for p in pagelist)
+ if changetype:
+ rcgen.request["rctype"] = changetype
+ filters = {'minor': showMinor,
+ 'bot': showBot,
+ 'anon': showAnon,
+ 'redirect': showRedirects,
+ 'patrolled': showPatrolled}
+ rcshow = []
+ for item in filters:
+ if filters[item] is not None:
+ rcshow.append(filters[item] and item or ("!"+item))
+ if rcshow:
+ rcgen.request["rcshow"] = "|".join(rcshow)
+ return rcgen
+
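Illustrative usage of recentchanges(), assuming an existing APISite object named site; keys follow the rcprop list requested above:

    # newest changes first, skipping bot edits
    for change in site.recentchanges(limit=5, showBot=False):
        print change.get('title'), change.get('user')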
+ @deprecate_arg("number", "limit")
+ def search(self, searchstring, namespaces=None, where="text",
+ getredirects=False, limit=None):
+ """Iterate Pages that contain the searchstring.
+
+ Note that this may include non-existing Pages if the wiki's database
+ table contains outdated entries.
+
+ @param searchstring: the text to search for
+ @type searchstring: unicode
+ @param where: Where to search; value must be "text" or
"titles" (many
+ wikis do not support title search)
+ @param namespaces: search only in these namespaces (defaults to 0)
+ @type namespaces: list of ints
+ @param getredirects: if True, include redirects in results
+ @param limit: maximum number of results to iterate
+
+ """
+ if not searchstring:
+ raise Error("search: searchstring cannot be empty")
+ if where not in ("text", "titles"):
+ raise Error("search: unrecognized 'where' value: %s" %
where)
+ srgen = api.PageGenerator("search", gsrsearch=searchstring,
+ gsrwhat=where, site=self)
+ if not namespaces:
+ logger.warning("search: namespaces cannot be empty; using [0].")
+ namespaces = [0]
+ if isinstance(namespaces, list):
+ srgen.request["gsrnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ else:
+ srgen.request["gsrnamespace"] = str(namespaces)
+ if getredirects:
+ srgen.request["gsrredirects"] = ""
+ if isinstance(limit, int):
+ srgen.limit = limit
+ return srgen
+
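A minimal sketch of search(), with a hypothetical query string and an existing APISite object named site:

    for page in site.search(u"example query", namespaces=[0], limit=5):
        print page.title()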
+ def usercontribs(self, user=None, userprefix=None, start=None, end=None,
+ reverse=False, limit=None, namespaces=None,
+ showMinor=None):
+ """Iterate contributions by a particular user.
+
+ Iterated values are in the same format as recentchanges.
+
+ @param user: Iterate contributions by this user (name or IP)
+ @param userprefix: Iterate contributions by all users whose names
+ or IPs start with this substring
+ @param start: Iterate contributions starting at this timestamp
+ @param end: Iterate contributions ending at this timestamp
+ @param reverse: Iterate oldest contributions first (default: newest)
+ @param limit: Maximum number of contributions to iterate
+ @param namespaces: Only iterate contributions in these namespaces
+ @type namespaces: list of ints
+ @param showMinor: if True, iterate only minor edits; if False and
+ not None, iterate only non-minor edits (default: iterate both)
+
+ """
+ if not (user or userprefix):
+ raise Error(
+ "usercontribs: either user or userprefix must be non-empty")
+ if start and end:
+ if reverse:
+ if end < start:
+ raise Error(
+ "usercontribs: end must be later than start with
reverse=True")
+ else:
+ if start < end:
+ raise Error(
+ "usercontribs: start must be later than end with
reverse=False")
+ ucgen = api.ListGenerator("usercontribs", site=self,
+ ucprop="ids|title|timestamp|comment|flags")
+ if user:
+ ucgen.request["ucuser"] = user
+ if userprefix:
+ ucgen.request["ucuserprefix"] = userprefix
+ if start is not None:
+ ucgen.request["ucstart"] = start
+ if end is not None:
+ ucgen.request["ucend"] = end
+ if reverse:
+ ucgen.request["ucdir"] = "newer"
+ if isinstance(limit, int):
+ ucgen.limit = limit
+ if isinstance(namespaces, list):
+ ucgen.request["ucnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ ucgen.request["ucnamespace"] = str(namespaces)
+ if showMinor is not None:
+ ucgen.request["ucshow"] = showMinor and "minor" or
"!minor"
+ return ucgen
+
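Usage sketch for usercontribs(); the username is hypothetical and site is an existing APISite object:

    for contrib in site.usercontribs(user=u"ExampleUser", limit=5):
        print contrib.get('title'), contrib.get('timestamp')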
+ def watchlist_revs(self, start=None, end=None, reverse=False,
+ namespaces=None, showMinor=None, showBot=None,
+ showAnon=None, limit=None):
+ """Iterate revisions to pages on the bot user's watchlist.
+
+ Iterated values will be in same format as recentchanges.
+
+ @param start: Iterate revisions starting at this timestamp
+ @param end: Iterate revisions ending at this timestamp
+ @param reverse: Iterate oldest revisions first (default: newest)
+ @param namespaces: only iterate revisions to pages in these
+ namespaces (default: all)
+ @type namespaces: list of ints
+ @param showMinor: if True, only list minor edits; if False (and not
+ None), only list non-minor edits
+ @param showBot: if True, only list bot edits; if False (and not
+ None), only list non-bot edits
+ @param showAnon: if True, only list anon edits; if False (and not
+ None), only list non-anon edits
+ @param limit: Maximum number of revisions to iterate
+
+ """
+ if start and end:
+ if reverse:
+ if end < start:
+ raise Error(
+ "watchlist_revs: end must be later than start with reverse=True")
+ else:
+ if start < end:
+ raise Error(
+ "watchlist_revs: start must be later than end with reverse=False")
+ wlgen = api.ListGenerator("watchlist", wlallrev="",
site=self,
+ wlprop="user|comment|timestamp|title|ids|flags")
+ #TODO: allow users to ask for "patrol" as well?
+ if start is not None:
+ wlgen.request["wlstart"] = start
+ if end is not None:
+ wlgen.request["wlend"] = end
+ if reverse:
+ wlgen.request["wldir"] = "newer"
+ if isinstance(limit, int):
+ wlgen.limit = limit
+ if isinstance(namespaces, list):
+ wlgen.request["wlnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ wlgen.request["wlnamespace"] = str(namespaces)
+ filters = {'minor': showMinor,
+ 'bot': showBot,
+ 'anon': showAnon}
+ wlshow = []
+ for item in filters:
+ if filters[item] is not None:
+ wlshow.append(filters[item] and item or ("!"+item))
+ if wlshow:
+ wlgen.request["wlshow"] = "|".join(wlshow)
+ return wlgen
+
+ def deletedrevs(self, page, start=None, end=None, reverse=None, limit=None,
+ get_text=False):
+ """Iterate deleted revisions.
+
+ Each value returned by the iterator will be a dict containing the
+        'title' and 'ns' keys for a particular Page and a 'revisions' key
+ whose value is a list of revisions in the same format as
+ recentchanges (plus a 'content' element if requested). If get_text
+ is true, the toplevel dict will contain a 'token' key as well.
+
+ @param page: The page to check for deleted revisions
+ @param start: Iterate revisions starting at this timestamp
+ @param end: Iterate revisions ending at this timestamp
+ @param reverse: Iterate oldest revisions first (default: newest)
+ @param limit: Iterate no more than this number of revisions.
+ @param get_text: If True, retrieve the content of each revision and
+ an undelete token
+
+ """
+ if start and end:
+ if reverse:
+ if end < start:
+ raise Error(
+"deletedrevs: end must be later than start with reverse=True")
+ else:
+ if start < end:
+ raise Error(
+"deletedrevs: start must be later than end with reverse=False")
+ if not self.logged_in():
+ self.login()
+ if "deletedhistory" not in self.userinfo['rights']:
+ try:
+ self.login(True)
+ except NoUsername:
+ pass
+ if "deletedhistory" not in self.userinfo['rights']:
+ raise Error(
+"deletedrevs: User:%s not authorized to access deleted revisions."
+ % self.user())
+ if get_text:
+ if "undelete" not in self.userinfo['rights']:
+ try:
+ self.login(True)
+ except NoUsername:
+ pass
+ if "undelete" not in self.userinfo['rights']:
+ raise Error(
+"deletedrevs: User:%s not authorized to view deleted content."
+ % self.user())
+
+ drgen = api.ListGenerator("deletedrevs", site=self,
+ titles=page.title(withSection=False),
+ drprop="revid|user|comment|minor")
+ if get_text:
+            drgen.request['drprop'] = drgen.request['drprop'] + "|content|token"
+ if start is not None:
+ drgen.request["drstart"] = start
+ if end is not None:
+ drgen.request["drend"] = end
+ if reverse:
+ drgen.request["drdir"] = "newer"
+ if isinstance(limit, int):
+ drgen.limit = limit
+ return drgen
+
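A sketch of deletedrevs(), assuming the bot account has the 'deletedhistory' right and that pywikibot and an APISite object named site are available; the page title is hypothetical:

    page = pywikibot.Page(site, u"Example page")
    for item in site.deletedrevs(page, limit=5):
        for rev in item.get('revisions', []):
            print rev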
+ def users(self, usernames):
+ """Iterate info about a list of users by name or IP.
+
+ @param usernames: a list of user names
+ @type usernames: list, or other iterable, of unicodes
+
+ """
+ if not isinstance(usernames, basestring):
+ usernames = u"|".join(usernames)
+ usgen = api.ListGenerator("users", ususers=usernames, site=self,
+ usprop="blockinfo|groups|editcount|registration")
+ return usgen
+
+ def randompages(self, limit=1, namespaces=None, redirects=False):
+ """Iterate a number of random pages.
+
+ Pages are listed in a fixed sequence, only the starting point is
+ random.
+
+ @param limit: the maximum number of pages to iterate (default: 1)
+ @param namespaces: only iterate pages in these namespaces.
+ @param redirects: if True, include only redirect pages in results
+ (default: include only non-redirects)
+
+ """
+ rngen = api.PageGenerator("random", site=self)
+ rngen.limit = limit
+ if isinstance(namespaces, list):
+ rngen.request["grnnamespace"] = u"|".join(unicode(ns)
+ for ns in namespaces)
+ elif namespaces is not None:
+ rngen.request["grnnamespace"] = str(namespaces)
+ if redirects:
+ rngen.request["grnredirect"] = ""
+ return rngen
+
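Illustrative only, assuming an existing APISite object named site:

    for page in site.randompages(limit=3):
        print page.title()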
+ # catalog of editpage error codes, for use in generating messages
+ _ep_errors = {
+ "noapiwrite": "API editing not enabled on %(site)s wiki",
+ "writeapidenied":
+"User %(user)s is not authorized to edit on %(site)s wiki",
+ "protectedtitle":
+"Title %(title)s is protected against creation on %(site)s",
+ "cantcreate":
+"User %(user)s not authorized to create new pages on %(site)s wiki",
+ "cantcreate-anon":
+"""Bot is not logged in, and anon users are not authorized to create new
pages
+on %(site)s wiki""",
+ "articleexists": "Page %(title)s already exists on %(site)s
wiki",
+ "noimageredirect-anon":
+"""Bot is not logged in, and anon users are not authorized to create
image
+redirects on %(site)s wiki""",
+ "noimageredirect":
+"User %(user)s not authorized to create image redirects on %(site)s wiki",
+ "spamdetected":
+"Edit to page %(title)s rejected by spam filter due to content:\n",
+ "filtered": "%(info)s",
+ "contenttoobig": "%(info)s",
+ "noedit-anon":
+"""Bot is not logged in, and anon users are not authorized to edit on
+%(site)s wiki""",
+ "noedit": "User %(user)s not authorized to edit pages on %(site)s
wiki",
+ "pagedeleted":
+"Page %(title)s has been deleted since last retrieved from %(site)s wiki",
+ "editconflict": "Page %(title)s not saved due to edit
conflict.",
+ }
+
+ def editpage(self, page, summary, minor=True, notminor=False,
+ recreate=True, createonly=False, watch=False, unwatch=False):
+ """Submit an edited Page object to be saved to the wiki.
+
+ @param page: The Page to be saved; its .text property will be used
+ as the new text to be saved to the wiki
+ @param token: the edit token retrieved using Site.token()
+ @param summary: the edit summary (required!)
+ @param minor: if True (default), mark edit as minor
+ @param notminor: if True, override account preferences to mark edit
+ as non-minor
+ @param recreate: if True (default), create new page even if this
+ title has previously been deleted
+ @param createonly: if True, raise an error if this title already
+ exists on the wiki
+ @param watch: if True, add this Page to bot's watchlist
+ @param unwatch: if True, remove this Page from bot's watchlist if
+ possible
+ @return: True if edit succeeded, False if it failed
+
+ """
+ text = page.text
+ if not text:
+ raise Error("editpage: no text to be saved")
+ try:
+ lastrev = page.latestRevision()
+ except NoPage:
+ lastrev = None
+ if not recreate:
+ raise Error("Page %s does not exist on %s wiki."
+ % (page.title(withSection=False), self))
+ token = self.token(page, "edit")
+ self.lock_page(page)
+ if lastrev is not None and page.latestRevision() != lastrev:
+ raise Error("editpage: Edit conflict detected; saving aborted.")
+ req = api.Request(site=self, action="edit",
+ title=page.title(withSection=False),
+ text=text, token=token, summary=summary)
+## if lastrev is not None:
+## req["basetimestamp"] = page._revisions[lastrev].timestamp
+ if minor:
+ req['minor'] = ""
+ elif notminor:
+ req['notminor'] = ""
+ if 'bot' in self.userinfo['groups']:
+ req['bot'] = ""
+ if recreate:
+ req['recreate'] = ""
+ if createonly:
+ req['createonly'] = ""
+ if watch:
+ req['watch'] = ""
+ elif unwatch:
+ req['unwatch'] = ""
+## FIXME: API gives 'badmd5' error
+## md5hash = md5()
+## md5hash.update(urllib.quote_plus(text.encode(self.encoding())))
+## req['md5'] = md5hash.digest()
+ while True:
+ try:
+ result = req.submit()
+ logger.debug("editpage response: %s" % result)
+ except api.APIError, err:
+ self.unlock_page(page)
+ if err.code.endswith("anon") and self.logged_in():
+ logger.debug(
+"editpage: received '%s' even though bot is logged in" % err.code)
+ errdata = {
+ 'site': self,
+ 'title': page.title(withSection=False),
+ 'user': self.user(),
+ 'info': err.info
+ }
+ if err.code == "spamdetected":
+ raise SpamfilterError(self._ep_errors[err.code] % errdata
+ + err.info[ err.info.index("fragment: ") + 9: ])
+
+ if err.code == "editconflict":
+ raise EditConflict(self._ep_errors[err.code] % errdata)
+ if err.code in self._ep_errors:
+ raise Error(self._ep_errors[err.code] % errdata)
+ logger.debug("editpage: Unexpected error code '%s'
received."
+ % err.code)
+ raise
+ assert ("edit" in result and "result" in
result["edit"]), result
+ if result["edit"]["result"] == "Success":
+ self.unlock_page(page)
+ if "nochange" in result["edit"]:
+ # null edit, page not changed
+ # TODO: do we want to notify the user of this?
+ return True
+ page._revid = result["edit"]["newrevid"]
+                # see http://www.mediawiki.org/wiki/API:Wikimania_2006_API_discussion#Notes
+ # not safe to assume that saved text is the same as sent
+ self.loadrevisions(page, getText=True)
+ return True
+ elif result["edit"]["result"] == "Failure":
+ if "captcha" in result["edit"]:
+ captcha = result["edit"]["captcha"]
+ req['captchaid'] = captcha['id']
+ if captcha["type"] == "math":
+                        req['captchaword'] = input(captcha["question"])
+ continue
+ elif "url" in captcha:
+                        webbrowser.open(captcha["url"])
+ req['captchaword'] = cap_answerwikipedia.input(
+"Please view CAPTCHA in your browser, then type answer here:")
+ continue
+ else:
+ self.unlock_page(page)
+ logger.error(
+"editpage: unknown CAPTCHA response %s, page not saved"
+ % captcha)
+ return False
+ else:
+ self.unlock_page(page)
+ logger.error("editpage: unknown failure reason %s"
+ % str(result))
+ return False
+ else:
+ self.unlock_page(page)
+ logger.error(
+"editpage: Unknown result code '%s' received; page not saved"
+ % result["edit"]["result"])
+ logger.error(str(result))
+ return False
+
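A hedged sketch of editpage(); the page title and summary are hypothetical, and since this performs a live write it is only meant to illustrate the call signature (run it against a test wiki, if at all):

    page = pywikibot.Page(site, u"Project:Sandbox")
    page.text = u"Testing the editpage() method."
    # returns True on success, False on an unhandled failure
    site.editpage(page, summary=u"test edit via editpage()", minor=True)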
+ # catalog of move errors for use in error messages
+ _mv_errors = {
+ "noapiwrite": "API editing not enabled on %(site)s wiki",
+ "writeapidenied":
+"User %(user)s is not authorized to edit on %(site)s wiki",
+ "nosuppress":
+"User %(user)s is not authorized to move pages without creating redirects",
+ "cantmove-anon":
+"""Bot is not logged in, and anon users are not authorized to move pages
on
+%(site)s wiki""",
+ "cantmove":
+"User %(user)s is not authorized to move pages on %(site)s wiki",
+ "immobilenamespace":
+"Pages in %(oldnamespace)s namespace cannot be moved on %(site)s wiki",
+ "articleexists":
+"Cannot move because page [[%(newtitle)s]] already exists on %(site)s wiki",
+ "protectedpage":
+"Page [[%(oldtitle)s]] is protected against moving on %(site)s wiki",
+ "protectedtitle":
+"Page [[%(newtitle)s]] is protected against creation on %(site)s wiki",
+ "nonfilenamespace":
+"Cannot move a file to %(newnamespace)s namespace on %(site)s wiki",
+ "filetypemismatch":
+"[[%(newtitle)s]] file extension does not match content of [[%(oldtitle)s]]"
+ }
+
+ def movepage(self, page, newtitle, summary, movetalk=True,
+ noredirect=False):
+ """Move a Page to a new title.
+
+ @param page: the Page to be moved (must exist)
+ @param newtitle: the new title for the Page
+ @type newtitle: unicode
+ @param summary: edit summary (required!)
+ @param movetalk: if True (default), also move the talk page if possible
+ @param noredirect: if True, suppress creation of a redirect from the
+ old title to the new one
+ @return: Page object with the new title
+
+ """
+ oldtitle = page.title(withSection=False)
+ newlink = pywikibot.Link(newtitle, self)
+ if newlink.namespace:
+ newtitle = self.namespace(newlink.namespace) + ":" + newlink.title
+ else:
+ newtitle = newlink.title
+ if oldtitle == newtitle:
+ raise Error("Cannot move page %s to its own title."
+ % oldtitle)
+ if not page.exists():
+ raise Error("Cannot move page %s because it does not exist on %s."
+ % (oldtitle, self))
+ token = self.token(page, "move")
+ self.lock_page(page)
+ req = api.Request(site=self, action="move", to=newtitle,
+ token=token, reason=summary)
+ req['from'] = oldtitle # "from" is a python keyword
+ if movetalk:
+ req['movetalk'] = ""
+ if noredirect:
+ req['noredirect'] = ""
+ try:
+ result = req.submit()
+ logger.debug("movepage response: %s" % result)
+ except api.APIError, err:
+ if err.code.endswith("anon") and self.logged_in():
+ logger.debug(
+"movepage: received '%s' even though bot is logged in" % err.code)
+ errdata = {
+ 'site': self,
+ 'oldtitle': oldtitle,
+ 'oldnamespace': self.namespace(page.namespace()),
+ 'newtitle': newtitle,
+ 'newnamespace': self.namespace(newlink.namespace),
+ 'user': self.user(),
+ }
+ if err.code in self._mv_errors:
+ raise Error(self._mv_errors[err.code] % errdata)
+ logger.debug("movepage: Unexpected error code '%s'
received."
+ % err.code)
+ raise
+ finally:
+ self.unlock_page(page)
+ if "move" not in result:
+ logger.error("movepage: %s" % result)
+ raise Error("movepage: unexpected response")
+ # TODO: Check for talkmove-error messages
+ if "talkmove-error-code" in result["move"]:
+ logger.warning(u"movepage: Talk page %s not moved"
+ % (page.toggleTalkPage().title(asLink=True)))
+ return pywikibot.Page(page, newtitle)
+
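A sketch of movepage(); both titles are hypothetical and the source page must exist, so this too is only an illustration of the call signature:

    oldpage = pywikibot.Page(site, u"Example old title")
    newpage = site.movepage(oldpage, u"Example new title",
                            summary=u"test move", movetalk=True)
    print newpage.title()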
+ # catalog of rollback errors for use in error messages
+ _rb_errors = {
+ "noapiwrite":
+ "API editing not enabled on %(site)s wiki",
+ "writeapidenied":
+ "User %(user)s not allowed to edit through the API",
+ "alreadyrolled":
+ "Page [[%(title)s]] already rolled back; action aborted.",
+ } # other errors shouldn't arise because we check for those errors
+
+ def rollbackpage(self, page, summary=u''):
+ """Roll back page to version before last user's edits.
+
+ As a precaution against errors, this method will fail unless
+ the page history contains at least two revisions, and at least
+ one that is not by the same user who made the last edit.
+
+ @param page: the Page to be rolled back (must exist)
+ @param summary: edit summary (defaults to a standardized message)
+
+ """
+ if len(page._revisions) < 2:
+ raise pywikibot.Error(
+ u"Rollback of %s aborted; load revision history first."
+ % page.title(asLink=True))
+ last_rev = page._revisions[page.latestRevision()]
+ last_user = last_rev.user
+ for rev in sorted(page._revisions.keys(), reverse=True):
+ # start with most recent revision first
+            if page._revisions[rev].user != last_user:
+                prev_user = page._revisions[rev].user
+ break
+ else:
+ raise pywikibot.Error(
+ u"Rollback of %s aborted; only one user in revision
history."
+ % page.title(asLink=True))
+ summary = summary or (
+u"Reverted edits by [[Special:Contributions/%(last_user)s|%(last_user)s]] "
+u"([[User talk:%(last_user)s|Talk]]) to last version by %(prev_user)s"
+ % locals())
+ token = self.token(page, "rollback")
+ self.lock_page(page)
+ req = api.Request(site=self, action="rollback",
+ title=page.title(withSection=False),
+ user=last_user,
+ token=token)
+ try:
+ result = req.submit()
+ except api.APIError, err:
+ errdata = {
+ 'site': self,
+ 'title': page.title(withSection=False),
+ 'user': self.user(),
+ }
+ if err.code in self._rb_errors:
+ raise Error(self._rb_errors[err.code] % errdata)
+ logger.debug("rollback: Unexpected error code '%s'
received."
+ % err.code)
+ raise
+ finally:
+ self.unlock_page(page)
+
+ # catalog of delete errors for use in error messages
+ _dl_errors = {
+ "noapiwrite":
+ "API editing not enabled on %(site)s wiki",
+ "writeapidenied":
+ "User %(user)s not allowed to edit through the API",
+ "permissiondenied":
+ "User %(user)s not authorized to delete pages on %(site)s wiki.",
+ "cantdelete":
+ "Could not delete [[%(title)s]]. Maybe it was deleted already.",
+ } # other errors shouldn't occur because of pre-submission checks
+
+ def deletepage(self, page, summary):
+ """Delete page from the wiki. Requires appropriate privilege
level.
+
+ @param page: Page to be deleted.
+ @param summary: Edit summary (required!).
+
+ """
+ try:
+ self.login(sysop=True)
+ except pywikibot.Error, e:
+ raise Error("delete: Unable to login as sysop (%s)"
+ % e.__class__.__name__)
+ if not self.logged_in(sysop=True):
+ raise Error("delete: Unable to login as sysop")
+ token = self.token("delete")
+ req = api.Request(site=self, action="delete", token=token,
+ title=page.title(withSection=False),
+ reason=summary)
+ try:
+ result = req.submit()
+ except api.APIError, err:
+ errdata = {
+ 'site': self,
+ 'title': page.title(withSection=False),
+ 'user': self.user(),
+ }
+ if err.code in self._dl_errors:
+ raise Error(self._dl_errors[err.code] % errdata)
+ logger.debug("delete: Unexpected error code '%s'
received."
+ % err.code)
+ raise
+ finally:
+ self.unlock_page(page)
+
+ # TODO: implement undelete
+
+ # TODO: implement patrol
+
+ def linksearch(self, siteurl, limit=500):
+ """Backwards-compatible interface to
exturlusage()"""
+ return self.exturlusage(siteurl, limit=limit)
+
+ @deprecate_arg("repeat", None)
+ def newimages(self, number=100, lestart=None, leend=None, leuser=None,
+ letitle=None):
+ """Yield ImagePages from most recent uploads"""
+ return self.logevents(logtype="upload", limit=number, start=lestart,
+ end=leend, user=leuser, title=letitle)
+
+ def getImagesFromAnHash(self, hash_found=None):
+ """Return all images that have the same hash.
+
+ Useful to find duplicates or nowcommons.
+
+ NOTE: it returns also the image itself, if you don't want it, just
+ filter the list returned.
+
+ NOTE 2: it returns the image title WITHOUT the image namespace.
+
+ """
+        if hash_found is None:  # no hash given, so return None rather than iterating
+ return None
+ return [image.title(withNamespace=False)
+ for image in self.allimages(sha1=hash_found)]
+
+
+#### METHODS NOT IMPLEMENTED YET ####
+class NotImplementedYet:
+
+ # TODO: is this needed any more? can it be obtained from the http module?
+ def cookies(self, sysop = False):
+ """Return a string containing the user's current
cookies."""
+ self._loadCookies(sysop = sysop)
+ index = self._userIndex(sysop)
+ return self._cookies[index]
+
+ def _loadCookies(self, sysop = False):
+ """Retrieve session cookies for login"""
+ index = self._userIndex(sysop)
+ if self._cookies[index] is not None:
+ return
+ try:
+ if sysop:
+ try:
+ username = config.sysopnames[self.family.name
+ ][self.code]
+ except KeyError:
+ raise NoUsername("""\
+You tried to perform an action that requires admin privileges, but you haven't
+entered your sysop name in your user-config.py. Please add
+sysopnames['%s']['%s']='name' to your user-config.py"""
+ % (self.family.name, self.code))
+ else:
+                username = pywikibot.config2.usernames[self.family.name
+ ][self.code]
+ except KeyError:
+ self._cookies[index] = None
+ self._isLoggedIn[index] = False
+ else:
+ tmp = '%s-%s-%s-login.data' % (
+ self.family.name, self.code, username)
+ fn = config.datafilepath('login-data', tmp)
+ if not os.path.exists(fn):
+ self._cookies[index] = None
+ self._isLoggedIn[index] = False
+ else:
+ f = open(fn)
+                    self._cookies[index] = '; '.join([x.strip() for x in f.readlines()])
+ f.close()
+
+ # THESE ARE FUNCTIONS NOT YET IMPLEMENTED IN THE API
+ # TODO: avoid code duplication for the following methods
+ def newpages(self, number = 10, get_redirect = False, repeat = False):
+ """Yield new articles (as Page objects) from Special:Newpages.
+
+ Starts with the newest article and fetches the number of articles
+ specified in the first argument. If repeat is True, it fetches
+ Newpages again. If there is no new page, it blocks until there is
+ one, sleeping between subsequent fetches of Newpages.
+
+ The objects yielded are tuples composed of the Page object,
+ timestamp (unicode), length (int), an empty unicode string, username
+ or IP address (str), comment (unicode).
+
+ """
+ # TODO: in recent MW versions Special:Newpages takes a namespace parameter,
+ # and defaults to 0 if not specified.
+ # TODO: Detection of unregistered users is broken
+ # TODO: Repeat mechanism doesn't make much sense as implemented;
+ # should use both offset and limit parameters, and have an
+ # option to fetch older rather than newer pages
+ seen = set()
+ while True:
+ path = self.newpages_address(n=number)
+ # The throttling is important here, so always enabled.
+ get_throttle()
+ html = self.getUrl(path)
+
+ entryR = re.compile(
+'<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
+' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
+' .?<a href=".+?" title=".+?:(?P<username>.+?)">'
+ )
+ for m in entryR.finditer(html):
+ date = m.group('date')
+ title = m.group('title')
+ title = title.replace('"', '"')
+ length = int(re.sub("[,.]", "",
m.group('length')))
+ loggedIn = u''
+ username = m.group('username')
+ comment = u''
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, date, length, loggedIn, username, comment
+ if not repeat:
+ break
+
+ def longpages(self, number = 10, repeat = False):
+ """Yield Pages from Special:Longpages.
+
+ Return values are a tuple of Page object, length(int).
+
+ """
+ #TODO: should use offset and limit parameters; 'repeat' as now
+ # implemented is fairly useless
+ # this comment applies to all the XXXXpages methods following, as well
+ seen = set()
+ while True:
+ path = self.longpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(ur'<li>\(<a href=".+?"
title=".+?">hist</a>\) <a href=".+?"
title="(?P<title>.+?)">.+?</a>
\[(?P<length>\d+)(.+?)\]</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ length = int(m.group('length'))
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, length
+ if not repeat:
+ break
+
+ def shortpages(self, number = 10, repeat = False):
+ """Yield Pages and lengths from
Special:Shortpages."""
+ throttle = True
+ seen = set()
+ while True:
+ path = self.shortpages_address(n = number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(ur'<li>\(<a href=".+?"
title=".+?">hist</a>\) <a href=".+?"
title="(?P<title>.+?)">.+?</a>
\[(?P<length>\d+)(.+?)\]</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ length = int(m.group('length'))
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, length
+ if not repeat:
+ break
+
+ def deadendpages(self, number = 10, repeat = False):
+ """Yield Page objects retrieved from
Special:Deadendpages."""
+ seen = set()
+ while True:
+ path = self.deadendpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+ '<li><a href=".+?"
title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def ancientpages(self, number = 10, repeat = False):
+ """Yield Pages, datestamps from
Special:Ancientpages."""
+ seen = set()
+ while True:
+ path = self.ancientpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+'<li><a href=".+?"
title="(?P<title>.+?)">.+?</a>
(?P<date>.+?)</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ date = m.group('date')
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, date
+ if not repeat:
+ break
+
+ def lonelypages(self, number = 10, repeat = False):
+ """Yield Pages retrieved from
Special:Lonelypages."""
+ throttle = True
+ seen = set()
+ while True:
+ path = self.lonelypages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+ '<li><a href=".+?"
title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def unwatchedpages(self, number = 10, repeat = False):
+ """Yield Pages from Special:Unwatchedpages (requires Admin
privileges)."""
+ seen = set()
+ while True:
+ path = self.unwatchedpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path, sysop = True)
+ entryR = re.compile(
+ '<li><a href=".+?"
title="(?P<title>.+?)">.+?</a>.+?</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def uncategorizedcategories(self, number = 10, repeat = False):
+ """Yield Categories from
Special:Uncategorizedcategories."""
+ import catlib
+ seen = set()
+ while True:
+ path = self.uncategorizedcategories_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+ '<li><a href=".+?"
title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = catlib.Category(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def newimages(self, number = 10, repeat = False):
+ """Yield ImagePages from
Special:Log&type=upload"""
+
+ seen = set()
+        regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+\(.+?</a>\).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?', re.UNICODE)
+
+ while True:
+ path = self.log_address(number, mode = 'upload')
+ get_throttle()
+ html = self.getUrl(path)
+
+ for m in regexp.finditer(html):
+ image = m.group('image')
+
+ if image not in seen:
+ seen.add(image)
+
+ if m.group('new'):
+ output(u"Image \'%s\' has been deleted." %
image)
+ continue
+
+ date = m.group('date')
+ user = m.group('user')
+ comment = m.group('comment') or ''
+
+ yield ImagePage(self, image), date, user, comment
+ if not repeat:
+ break
+
+ def uncategorizedimages(self, number = 10, repeat = False):
+ """Yield ImagePages from
Special:Uncategorizedimages."""
+ seen = set()
+ ns = self.image_namespace()
+ entryR = re.compile(
+ '<a href=".+?"
title="(?P<title>%s:.+?)">.+?</a>' % ns)
+ while True:
+ path = self.uncategorizedimages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = ImagePage(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def uncategorizedpages(self, number = 10, repeat = False):
+ """Yield Pages from Special:Uncategorizedpages."""
+ seen = set()
+ while True:
+ path = self.uncategorizedpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+ '<li><a href=".+?"
title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def unusedcategories(self, number = 10, repeat = False):
+ """Yield Category objects from
Special:Unusedcategories."""
+ import catlib
+ seen = set()
+ while True:
+ path = self.unusedcategories_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile('<li><a href=".+?"
title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = catlib.Category(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def unusedfiles(self, number = 10, repeat = False, extension = None):
+ """Yield ImagePage objects from
Special:Unusedimages."""
+ seen = set()
+ ns = self.image_namespace()
+ entryR = re.compile(
+ '<a href=".+?"
title="(?P<title>%s:.+?)">.+?</a>' % ns)
+ while True:
+ path = self.unusedfiles_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ for m in entryR.finditer(html):
+ fileext = None
+ title = m.group('title')
+ if extension:
+ fileext = title[len(title)-3:]
+ if title not in seen and fileext == extension:
+ ## Check whether the media is used in a Proofread page
+ # code disabled because it slows this method down, and
+ # because it is unclear what it's supposed to do.
+ #basename = title[6:]
+ #page = Page(self, 'Page:' + basename)
+
+ #if not page.exists():
+ seen.add(title)
+ image = ImagePage(self, title)
+ yield image
+ if not repeat:
+ break
+
+ def withoutinterwiki(self, number=10, repeat=False):
+ """Yield Pages without language links from
Special:Withoutinterwiki."""
+ seen = set()
+ while True:
+ path = self.withoutinterwiki_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile('<li><a href=".+?"
title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def linksearch(self, siteurl):
+ """Yield Pages from results of Special:Linksearch for
'siteurl'."""
+ if siteurl.startswith('*.'):
+ siteurl = siteurl[2:]
+ output(u'Querying [[Special:Linksearch]]...')
+ cache = []
+ for url in [siteurl, '*.' + siteurl]:
+ path = self.linksearch_address(url)
+ get_throttle()
+ html = self.getUrl(path)
+ loc = html.find('<div class="mw-spcontent">')
+ if loc > -1:
+ html = html[loc:]
+ loc = html.find('<div class="printfooter">')
+ if loc > -1:
+ html = html[:loc]
+ R = re.compile('title ?=\"(.*?)\"')
+ for title in R.findall(html):
+ if not siteurl in title:
+ # the links themselves have similar form
+ if title in cache:
+ continue
+ else:
+ cache.append(title)
+ yield Page(self, title)
+
Property changes on: branches/rewrite/pywikibot/site.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/textlib.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,675 +1,675 @@
-# -*- coding: utf-8 -*-
-"""
-Functions for manipulating wiki-text.
-
-Unless otherwise noted, all functions take a unicode string as the argument
-and return a unicode string.
-
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-
-import pywikibot
-import re
-
-
-def unescape(s):
- """Replace escaped HTML-special characters by their
originals"""
- if '&' not in s:
- return s
- s = s.replace("<", "<")
- s = s.replace(">", ">")
- s = s.replace("'", "'")
- s = s.replace(""", '"')
- s = s.replace("&", "&") # Must be last
- return s
-
-
-def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
- allowoverlap=False, marker = '', site = None):
- """
-    Return text with 'old' replaced by 'new', ignoring specified types of text.
-
- Skips occurences of 'old' within exceptions; e.g., within nowiki tags or
- HTML comments. If caseInsensitive is true, then use case insensitive
- regex matching. If allowoverlap is true, overlapping occurences are all
- replaced (watch out when using this, it might lead to infinite loops!).
-
- Parameters:
- text - a unicode string
- old - a compiled regular expression
- new - a unicode string (which can contain regular
- expression references), or a function which takes
- a match object as parameter. See parameter repl of
- re.sub().
- exceptions - a list of strings which signal what to leave out,
- e.g. ['math', 'table', 'template']
- caseInsensitive - a boolean
- marker - a string that will be added to the last replacement;
- if nothing is changed, it is added at the end
-
- """
- if site is None:
- site = pywikibot.getSite()
-
- exceptionRegexes = {
- 'comment': re.compile(r'(?s)<!--.*?-->'),
- # section headers
- 'header': re.compile(r'\r\n=+.+=+ *\r\n'),
-        'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
- 'math': re.compile(r'(?is)<math>.*?</math>'),
-        'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'),
- # wiki tags are ignored inside nowiki tags.
-        'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'),
- # preformatted text
- 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
-        'source': re.compile(r'(?is)<source .*?</source>'),
- # inline references
-        'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
-        'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'),
- # lines that start with a space are shown in a monospace font and
- # have whitespace preserved.
- 'startspace': re.compile(r'(?m)^ (.*?)$'),
- # tables often have whitespace that is used to improve wiki
- # source code readability.
- # TODO: handle nested tables.
-        'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
- # templates with parameters often have whitespace that is used to
- # improve wiki source code readability.
- # 'template': re.compile(r'(?s){{.*?}}'),
- # The regex above fails on nested templates. This regex can handle
- # templates cascaded up to level 3, but no deeper. For arbitrary
- # depth, we'd need recursion which can't be done in Python's re.
- # After all, the language of correct parenthesis words is not regular.
- 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
- 'hyperlink': compileLinkR(),
-        'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
- # this matches internal wikilinks, but also interwiki, categories, and
- # images.
- 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
- 'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
-                                 % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())),
-
- }
-
- # if we got a string, compile it as a regular expression
- if type(old) is str or type(old) is unicode:
- if caseInsensitive:
- old = re.compile(old, re.IGNORECASE | re.UNICODE)
- else:
- old = re.compile(old)
-
- dontTouchRegexes = []
- for exc in exceptions:
- if isinstance(exc, str) or isinstance(exc, unicode):
- # assume it's a reference to the exceptionRegexes dictionary
- # defined above.
- if not exceptionRegexes.has_key(exc):
- raise ValueError("Unknown tag type: " + exc)
- dontTouchRegexes.append(exceptionRegexes[exc])
- else:
- # assume it's a regular expression
- dontTouchRegexes.append(exc)
- index = 0
- markerpos = len(text)
- while True:
- match = old.search(text, index)
- if not match:
- # nothing left to replace
- break
-
- # check which exception will occur next.
- nextExceptionMatch = None
- for dontTouchR in dontTouchRegexes:
- excMatch = dontTouchR.search(text, index)
- if excMatch and (
- nextExceptionMatch is None or
- excMatch.start() < nextExceptionMatch.start()):
- nextExceptionMatch = excMatch
-
-        if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start():
-            # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
- index = nextExceptionMatch.end()
- else:
- # We found a valid match. Replace it.
- if callable(new):
-                # the parameter new can be a function which takes the match as a parameter.
- replacement = new(match)
- else:
- # it is not a function, but a string.
-
- # it is a little hack to make \n work. It would be better to fix it
- # previously, but better than nothing.
- new = new.replace('\\n', '\n')
-
- # We cannot just insert the new string, as it may contain regex
- # group references such as \2 or \g<name>.
- # On the other hand, this approach does not work because it can't
- # handle lookahead or lookbehind (see bug #1731008):
- #replacement = old.sub(new, text[match.start():match.end()])
- #text = text[:match.start()] + replacement + text[match.end():]
-
- # So we have to process the group references manually.
- replacement = new
-
-                groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
- while True:
- groupMatch = groupR.search(replacement)
- if not groupMatch:
- break
-                    groupID = groupMatch.group('name') or int(groupMatch.group('number'))
-                    replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():]
- text = text[:match.start()] + replacement + text[match.end():]
-
- # continue the search on the remaining text
- if allowoverlap:
- index = match.start() + 1
- else:
- index = match.start() + len(replacement)
- markerpos = match.start() + len(replacement)
- text = text[:markerpos] + marker + text[markerpos:]
- return text
-
-
-def removeDisabledParts(text, tags = ['*']):
- """
- Return text without portions where wiki markup is disabled
-
- Parts that can/will be removed are --
- * HTML comments
- * nowiki tags
- * pre tags
- * includeonly tags
-
- The exact set of parts which should be removed can be passed as the
- 'parts' parameter, which defaults to all.
- """
- regexes = {
- 'comments' : r'<!--.*?-->',
-        'includeonly': r'<includeonly>.*?</includeonly>',
- 'nowiki': r'<nowiki>.*?</nowiki>',
- 'pre': r'<pre>.*?</pre>',
- 'source': r'<source .*?</source>',
- }
- if '*' in tags:
- tags = regexes.keys()
- toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
- re.IGNORECASE | re.DOTALL)
- return toRemoveR.sub('', text)
-
-
-def isDisabled(text, index, tags = ['*']):
- """
- Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
-
- For the tags parameter, see removeDisabledParts() above.
- """
- # Find a marker that is not already in the text.
- marker = '@@'
- while marker in text:
- marker += '@'
- text = text[:index] + marker + text[index:]
- text = removeDisabledParts(text, tags)
- return (marker not in text)
-
-
-# Functions dealing with interwiki language links
-
-# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
-# interproject. These functions only deal with links to a
-# corresponding page in another language on the same project (e.g.,
-# Wikipedia, Wiktionary, etc.) in another language. They do not find
-# or change links to a different project, or any that are formatted
-# as in-line interwiki links (e.g., "[[:es:Articulo]]". (CONFIRM)
-
-def getLanguageLinks(text, insite = None, pageLink = "[[]]"):
- """
- Return a dict of interlanguage links found in text.
-
- Dict uses language codes as keys and Page objects as values.
- Do not call this routine directly, use Page.interwiki() method
- instead.
-
- """
- if insite == None:
- insite = pywikibot.getSite()
- result = {}
- # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
- # and HTML comments
- text = removeDisabledParts(text)
-
- # This regular expression will find every link that is possibly an
- # interwiki link.
- # NOTE: language codes are case-insensitive and only consist of basic latin
- # letters and hyphens.
- interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
- for lang, pagetitle in interwikiR.findall(text):
- lang = lang.lower()
- # Check if it really is in fact an interwiki link to a known
- # language, or if it's e.g. a category tag or an internal link
- if lang in insite.family.obsolete:
- lang = insite.family.obsolete[lang]
- if lang in insite.validLanguageLinks():
- if '|' in pagetitle:
- # ignore text after the pipe
- pagetitle = pagetitle[:pagetitle.index('|')]
- # we want the actual page objects rather than the titles
- site = insite.getSite(code = lang)
- try:
- result[site] = pywikibot.Page(site, pagetitle, insite = insite)
- except InvalidTitle:
- output(
- u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
- % (lang, pagetitle))
- continue
- return result
-
-
-def removeLanguageLinks(text, site = None, marker = ''):
- """Return text with all interlanguage links removed.
-
- If a link to an unknown language is encountered, a warning is printed.
- If a marker is defined, that string is placed at the location of the
- last occurence of an interwiki link (at the end if there are no
- interwiki links).
-
- """
- if site == None:
- site = pywikibot.getSite()
- if not site.validLanguageLinks():
- return text
- # This regular expression will find every interwiki link, plus trailing
- # whitespace.
-    languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
- interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*'
- % languages, re.IGNORECASE)
- text = replaceExcept(text, interwikiR, '',
- ['nowiki', 'comment', 'math',
'pre', 'source'], marker=marker)
- return text.strip()
-
-
-def replaceLanguageLinks(oldtext, new, site = None):
- """Replace interlanguage links in the text with a new set of links.
-
- 'new' should be a dict with the Site objects as keys, and Page objects
- as values (i.e., just like the dict returned by getLanguageLinks
- function).
-
- """
- # Find a marker that is not already in the text.
- marker = '@@'
- while marker in oldtext:
- marker += '@'
- if site == None:
- site = pywikibot.getSite()
- s = interwikiFormat(new, insite = site)
- s2 = removeLanguageLinks(oldtext, site = site, marker = marker)
- if s:
- if site.language() in site.family.interwiki_attop:
-            newtext = s + site.family.interwiki_text_separator + s2.replace(marker,'').strip()
- else:
- # calculate what was after the language links on the page
- firstafter = s2.find(marker) + len(marker)
-            # Is there any text in the 'after' part that means we should keep it after?
- if "</noinclude>" in s2[firstafter:]:
- newtext = s2[:firstafter] + s + s2[firstafter:]
- elif site.language() in site.family.categories_last:
- cats = getCategoryLinks(s2, site = site)
-                s2 = removeCategoryLinks(s2.replace(marker,'').strip(), site) + site.family.interwiki_text_separator + s
- newtext = replaceCategoryLinks(s2, cats, site=site)
- else:
-                newtext = s2.replace(marker,'').strip() + site.family.interwiki_text_separator + s
- newtext = newtext.replace(marker,'')
- else:
- newtext = s2.replace(marker,'')
- return newtext
-
-
-def interwikiFormat(links, insite = None):
- """Convert interwiki link dict into a wikitext string.
-
- 'links' should be a dict with the Site objects as keys, and Page
- objects as values.
-
- Return a unicode string that is formatted for inclusion in insite
- (defaulting to the current site).
- """
- if insite is None:
- insite = pywikibot.getSite()
- if not links:
- return ''
-
- ar = interwikiSort(links.keys(), insite)
- s = []
- for site in ar:
- try:
- link = links[site].aslink(forceInterwiki=True)
- s.append(link)
- except AttributeError:
- s.append(pywikibot.getSite(site).linkto(links[site],
- othersite=insite))
- if insite.lang in insite.family.interwiki_on_one_line:
- sep = u' '
- else:
- sep = u'\r\n'
- s=sep.join(s) + u'\r\n'
- return s
-
-
-# Sort sites according to local interwiki sort logic
-def interwikiSort(sites, insite = None):
- if insite is None:
- insite = pywikibot.getSite()
- if not sites:
- return []
-
- sites.sort()
- putfirst = insite.interwiki_putfirst()
- if putfirst:
- #In this case I might have to change the order
- firstsites = []
- for code in putfirst:
- # The code may not exist in this family?
- if code in insite.family.obsolete:
- code = insite.family.obsolete[code]
- if code in insite.validLanguageLinks():
- site = insite.getSite(code = code)
- if site in sites:
- del sites[sites.index(site)]
- firstsites = firstsites + [site]
- sites = firstsites + sites
- if insite.interwiki_putfirst_doubled(sites): #some implementations return False
- sites = insite.interwiki_putfirst_doubled(sites) + sites
- return sites
-
-
-# Functions dealing with category links
-
-def getCategoryLinks(text, site):
- """Return a list of category links found in text.
-
- List contains Category objects.
- Do not call this routine directly, use Page.categories() instead.
-
- """
- result = []
- # Ignore category links within nowiki tags, pre tags, includeonly tags,
- # and HTML comments
- text = removeDisabledParts(text)
- catNamespace = '|'.join(site.category_namespaces())
-    R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)'
- r'(?:\|(?P<sortKey>.+?))?\s*\]\]'
- % catNamespace, re.I)
- for match in R.finditer(text):
- cat = pywikibot.Category(site,
- '%s:%s' % (match.group('namespace'),
- match.group('catName')),
- sortKey = match.group('sortKey'))
- result.append(cat)
- return result
-
-
-def removeCategoryLinks(text, site, marker = ''):
- """Return text with all category links removed.
-
- Put the string marker after the last replacement (at the end of the text
- if there is no replacement).
-
- """
- # This regular expression will find every link that is possibly an
- # interwiki link, plus trailing whitespace. The language code is grouped.
- # NOTE: This assumes that language codes only consist of non-capital
- # ASCII letters and hyphens.
- catNamespace = '|'.join(site.category_namespaces())
- categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
- text = replaceExcept(text, categoryR, '', ['nowiki', 'comment', 'math', 'pre', 'source'], marker = marker)
- if marker:
- #avoid having multiple linefeeds at the end of the text
- text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, text.strip())
- return text.strip()
-
-
-def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
- """Replace the category oldcat with the category newcat and return
- the modified text.
-
- """
- if site is None:
- site = pywikibot.getSite()
-
- catNamespace = '|'.join(site.category_namespaces())
- title = oldcat.titleWithoutNamespace()
- if not title:
- return
- # title might contain regex special characters
- title = re.escape(title)
- # title might not be capitalized correctly on the wiki
- if title[0].isalpha() and not site.nocapitalize:
- title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
- # spaces and underscores in page titles are interchangeable, and collapsible
- title = title.replace(r"\ ", "[ _]+").replace(r"\_",
"[ _]+")
- categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
- % (catNamespace, title), re.I)
- if newcat is None:
- text = replaceExcept(oldtext, categoryR, '',
- ['nowiki', 'comment', 'math', 'pre', 'source'])
- else:
- text = replaceExcept(oldtext, categoryR,
- '[[%s:%s\\2' % (site.namespace(14),
- newcat.titleWithoutNamespace()),
- ['nowiki', 'comment', 'math', 'pre', 'source'])
- return text
-
-
-def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
- """Replace the category links given in the wikitext given
- in oldtext by the new links given in new.
-
- 'new' should be a list of Category objects.
-
- If addOnly is True, the old category won't be deleted and
- the category(s) given will be added
- (and so they won't replace anything).
- """
-
- # Find a marker that is not already in the text.
- marker = '@@'
- while marker in oldtext:
- marker += '@'
-
- if site is None:
- site = pywikibot.getSite()
- if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
- raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…)
-
- s = categoryFormat(new, insite = site)
- if addOnly:
- s2 = oldtext
- else:
- s2 = removeCategoryLinks(oldtext, site = site, marker = marker)
-
- if s:
- if site.language() in site.family.category_attop:
- newtext = s + site.family.category_text_separator + s2
- else:
- # calculate what was after the categories links on the page
- firstafter = s2.find(marker)
- # Is there any text in the 'after' part that means we should keep it
after?
- if "</noinclude>" in s2[firstafter:]:
- newtext = s2[:firstafter] + s + s2[firstafter:]
- elif site.language() in site.family.categories_last:
- newtext = s2.replace(marker,'').strip() + site.family.category_text_separator + s
- else:
- interwiki = getLanguageLinks(s2)
- s2 = removeLanguageLinks(s2.replace(marker,''), site) + site.family.category_text_separator + s
- newtext = replaceLanguageLinks(s2, interwiki, site)
- newtext = newtext.replace(marker,'')
- else:
- s2 = s2.replace(marker,'')
- return s2
- return newtext.strip()
-
-
-def categoryFormat(categories, insite = None):
- """Return a string containing links to all categories in a list.
-
- 'categories' should be a list of Category objects.
-
- The string is formatted for inclusion in insite.
-
- """
- if not categories:
- return ''
- if insite is None:
- insite = pywikibot.getSite()
- catLinks = [category.aslink(noInterwiki = True) for category in categories]
- if insite.category_on_one_line():
- sep = ' '
- else:
- sep = '\r\n'
- # Some people don't like the categories sorted
- #catLinks.sort()
- return sep.join(catLinks) + '\r\n'
-
-
-def compileLinkR(withoutBracketed=False, onlyBracketed=False):
- """Return a regex that matches external links."""
- # RFC 2396 says that URLs may only contain certain characters.
- # For this regex we also accept non-allowed characters, so that the bot
- # will later show these links as broken ('Non-ASCII Characters in URL').
- # Note: While allowing parenthesis inside URLs, MediaWiki will regard
- # right parenthesis at the end of the URL as not part of that URL.
- # The same applies to dot, comma, colon and some other characters.
- notAtEnd = '\]\s\)\.:;,<>"'
- # So characters inside the URL can be anything except whitespace,
- # closing squared brackets, quotation marks, greater than and less
- # than, and the last character also can't be parenthesis or another
- # character disallowed by MediaWiki.
- notInside = '\]\s<>"'
- # The first half of this regular expression is required because '' is
- # not allowed inside links. For example, in this wiki text:
- # ''Please see http://www.example.org.''
- # .'' shouldn't be considered as part of the link.
- regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])'
-
- if withoutBracketed:
- regex = r'(?<!\[)' + regex
- elif onlyBracketed:
- regex = r'\[' + regex
- linkR = re.compile(regex)
- return linkR
-
-def extract_templates_and_params(text, get_redirect=False):
- """Return list of template calls found in text.
-
- Return value is a list of tuples. There is one tuple for each use of a
- template in the page, with the template title as the first entry and a
- dict of parameters as the second entry. Parameters are indexed by
- strings; as in MediaWiki, an unnamed parameter is given a parameter name
- with an integer value corresponding to its position among the unnamed
- parameters, and if this results in multiple parameters with the same name
- only the last value provided will be returned.
-
- """
- # remove commented-out stuff etc.
- thistxt = removeDisabledParts(text)
-
- # marker for inside templates or parameters
- marker = u'@@'
- while marker in thistxt:
- marker += u'@'
-
- # marker for links
- marker2 = u'##'
- while marker2 in thistxt:
- marker2 += u'#'
-
- # marker for math
- marker3 = u'%%'
- while marker2 in thistxt:
- marker3 += u'%'
-
- result = []
- inside = {}
- count = 0
- Rtemplate = re.compile(
- ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
- Rmath = re.compile(ur'<math>[^<]+</math>')
- Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
- Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
- Rmarker3 = re.compile(ur'%s(\d+)%s' % (marker3, marker3))
-
- # Replace math with markers
- maths = {}
- count = 0
- for m in Rmath.finditer(thistxt):
- count += 1
- text = m.group()
- thistxt = thistxt.replace(text, '%s%d%s' % (marker3, count, marker3))
- maths[count] = text
-
- while Rtemplate.search(thistxt) is not None:
- for m in Rtemplate.finditer(thistxt):
- # Make sure it is not detected again
- count += 1
- text = m.group()
- thistxt = thistxt.replace(text,
- '%s%d%s' % (marker, count, marker))
- # Make sure stored templates don't contain markers
- for m2 in Rmarker.finditer(text):
- text = text.replace(m2.group(), inside[int(m2.group(1))])
- for m2 in Rmarker3.finditer(text):
- text = text.replace(m2.group(), maths[int(m2.group(1))])
- inside[count] = text
-
- # Name
- name = m.group('name').strip()
- m2 = Rmarker.search(name) or Rmath.search(name)
- if m2 is not None:
- # Doesn't detect templates whose name changes,
- # or templates whose name contains math tags
- continue
- # Parameters
- paramString = m.group('params')
- params = {}
- numbered_param = 1
- if paramString:
- # Replace wikilinks with markers
- links = {}
- count2 = 0
- for m2 in pywikibot.link_regex.finditer(paramString):
- count2 += 1
- text = m2.group(0)
- paramString = paramString.replace(text,
- '%s%d%s' % (marker2, count2, marker2))
- links[count2] = text
- # Parse string
- markedParams = paramString.split('|')
- # Replace markers
- for param in markedParams:
- if "=" in param:
- param_name, param_val = param.split("=", 1)
- else:
- param_name = unicode(numbered_param)
- param_val = param
- numbered_param += 1
- for m2 in Rmarker.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- inside[int(m2.group(1))])
- for m2 in Rmarker2.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- links[int(m2.group(1))])
- for m2 in Rmarker3.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- maths[int(m2.group(1))])
- params[param_name] = param_val
-
- # Add it to the result
- result.append((name, params))
- return result
-
+# -*- coding: utf-8 -*-
+"""
+Functions for manipulating wiki-text.
+
+Unless otherwise noted, all functions take a unicode string as the argument
+and return a unicode string.
+
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+
+import pywikibot
+import re
+
+
+def unescape(s):
+ """Replace escaped HTML-special characters by their
originals"""
+ if '&' not in s:
+ return s
+ s = s.replace("<", "<")
+ s = s.replace(">", ">")
+ s = s.replace("'", "'")
+ s = s.replace(""", '"')
+ s = s.replace("&", "&") # Must be last
+ return s
+
+
+def replaceExcept(text, old, new, exceptions, caseInsensitive=False,
+ allowoverlap=False, marker = '', site = None):
+ """
+ Return text with 'old' replaced by 'new', ignoring specified types of text.
+
+ Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
+ HTML comments. If caseInsensitive is true, then use case-insensitive
+ regex matching. If allowoverlap is true, overlapping occurrences are all
+ replaced (watch out when using this, it might lead to infinite loops!).
+
+ Parameters:
+ text - a unicode string
+ old - a compiled regular expression
+ new - a unicode string (which can contain regular
+ expression references), or a function which takes
+ a match object as parameter. See parameter repl of
+ re.sub().
+ exceptions - a list of strings which signal what to leave out,
+ e.g. ['math', 'table', 'template']
+ caseInsensitive - a boolean
+ marker - a string that will be added to the last replacement;
+ if nothing is changed, it is added at the end
+
+ """
+ if site is None:
+ site = pywikibot.getSite()
+
+ exceptionRegexes = {
+ 'comment': re.compile(r'(?s)<!--.*?-->'),
+ # section headers
+ 'header': re.compile(r'\r\n=+.+=+ *\r\n'),
+ 'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
+ 'math': re.compile(r'(?is)<math>.*?</math>'),
+ 'noinclude': re.compile(r'(?is)<noinclude>.*?</noinclude>'),
+ # wiki tags are ignored inside nowiki tags.
+ 'nowiki': re.compile(r'(?is)<nowiki>.*?</nowiki>'),
+ # preformatted text
+ 'pre': re.compile(r'(?ism)<pre>.*?</pre>'),
+ 'source': re.compile(r'(?is)<source .*?</source>'),
+ # inline references
+ 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'),
+ 'timeline': re.compile(r'(?is)<timeline>.*?</timeline>'),
+ # lines that start with a space are shown in a monospace font and
+ # have whitespace preserved.
+ 'startspace': re.compile(r'(?m)^ (.*?)$'),
+ # tables often have whitespace that is used to improve wiki
+ # source code readability.
+ # TODO: handle nested tables.
+ 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
+ # templates with parameters often have whitespace that is used to
+ # improve wiki source code readability.
+ # 'template': re.compile(r'(?s){{.*?}}'),
+ # The regex above fails on nested templates. This regex can handle
+ # templates cascaded up to level 3, but no deeper. For arbitrary
+ # depth, we'd need recursion which can't be done in Python's re.
+ # After all, the language of correct parenthesis words is not regular.
+ 'template': re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
+ 'hyperlink': compileLinkR(),
+ 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+ # this matches internal wikilinks, but also interwiki, categories, and
+ # images.
+ 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+ 'interwiki': re.compile(r'(?i)\[\[(%s)\s?:[^\]]*\]\][\s]*'
+ % '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())),
+
+ }
+
+ # if we got a string, compile it as a regular expression
+ if type(old) is str or type(old) is unicode:
+ if caseInsensitive:
+ old = re.compile(old, re.IGNORECASE | re.UNICODE)
+ else:
+ old = re.compile(old)
+
+ dontTouchRegexes = []
+ for exc in exceptions:
+ if isinstance(exc, str) or isinstance(exc, unicode):
+ # assume it's a reference to the exceptionRegexes dictionary
+ # defined above.
+ if not exceptionRegexes.has_key(exc):
+ raise ValueError("Unknown tag type: " + exc)
+ dontTouchRegexes.append(exceptionRegexes[exc])
+ else:
+ # assume it's a regular expression
+ dontTouchRegexes.append(exc)
+ index = 0
+ markerpos = len(text)
+ while True:
+ match = old.search(text, index)
+ if not match:
+ # nothing left to replace
+ break
+
+ # check which exception will occur next.
+ nextExceptionMatch = None
+ for dontTouchR in dontTouchRegexes:
+ excMatch = dontTouchR.search(text, index)
+ if excMatch and (
+ nextExceptionMatch is None or
+ excMatch.start() < nextExceptionMatch.start()):
+ nextExceptionMatch = excMatch
+
+ if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start():
+ # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
+ index = nextExceptionMatch.end()
+ else:
+ # We found a valid match. Replace it.
+ if callable(new):
+ # the parameter new can be a function which takes the match as a parameter.
+ replacement = new(match)
+ else:
+ # it is not a function, but a string.
+
+ # it is a little hack to make \n work. It would be better to fix it
+ # previously, but better than nothing.
+ new = new.replace('\\n', '\n')
+
+ # We cannot just insert the new string, as it may contain regex
+ # group references such as \2 or \g<name>.
+ # On the other hand, this approach does not work because it can't
+ # handle lookahead or lookbehind (see bug #1731008):
+ #replacement = old.sub(new, text[match.start():match.end()])
+ #text = text[:match.start()] + replacement + text[match.end():]
+
+ # So we have to process the group references manually.
+ replacement = new
+
+ groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
+ while True:
+ groupMatch = groupR.search(replacement)
+ if not groupMatch:
+ break
+ groupID = groupMatch.group('name') or int(groupMatch.group('number'))
+ replacement = replacement[:groupMatch.start()] + match.group(groupID) + replacement[groupMatch.end():]
+ text = text[:match.start()] + replacement + text[match.end():]
+
+ # continue the search on the remaining text
+ if allowoverlap:
+ index = match.start() + 1
+ else:
+ index = match.start() + len(replacement)
+ markerpos = match.start() + len(replacement)
+ text = text[:markerpos] + marker + text[markerpos:]
+ return text
+
+
+def removeDisabledParts(text, tags = ['*']):
+ """
+ Return text without portions where wiki markup is disabled
+
+ Parts that can/will be removed are --
+ * HTML comments
+ * nowiki tags
+ * pre tags
+ * includeonly tags
+
+ The exact set of parts which should be removed can be passed as the
+ 'tags' parameter, which defaults to all.
+ """
+ regexes = {
+ 'comments' : r'<!--.*?-->',
+ 'includeonly': r'<includeonly>.*?</includeonly>',
+ 'nowiki': r'<nowiki>.*?</nowiki>',
+ 'pre': r'<pre>.*?</pre>',
+ 'source': r'<source .*?</source>',
+ }
+ if '*' in tags:
+ tags = regexes.keys()
+ toRemoveR = re.compile('|'.join([regexes[tag] for tag in tags]),
+ re.IGNORECASE | re.DOTALL)
+ return toRemoveR.sub('', text)
+
+
+def isDisabled(text, index, tags = ['*']):
+ """
+ Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.
+
+ For the tags parameter, see removeDisabledParts() above.
+ """
+ # Find a marker that is not already in the text.
+ marker = '@@'
+ while marker in text:
+ marker += '@'
+ text = text[:index] + marker + text[index:]
+ text = removeDisabledParts(text, tags)
+ return (marker not in text)
+
+
+# Functions dealing with interwiki language links
+
+# Note - MediaWiki supports two kinds of interwiki links; interlanguage and
+# interproject. These functions only deal with links to a
+# corresponding page in another language on the same project (e.g.,
+# Wikipedia, Wiktionary, etc.). They do not find or change links to a
+# different project, or any that are formatted as in-line interwiki
+# links (e.g., "[[:es:Articulo]]"). (CONFIRM)
+
+def getLanguageLinks(text, insite = None, pageLink = "[[]]"):
+ """
+ Return a dict of interlanguage links found in text.
+
+ Dict uses language codes as keys and Page objects as values.
+ Do not call this routine directly, use Page.interwiki() method
+ instead.
+
+ """
+ if insite == None:
+ insite = pywikibot.getSite()
+ result = {}
+ # Ignore interwiki links within nowiki tags, includeonly tags, pre tags,
+ # and HTML comments
+ text = removeDisabledParts(text)
+
+ # This regular expression will find every link that is possibly an
+ # interwiki link.
+ # NOTE: language codes are case-insensitive and only consist of basic latin
+ # letters and hyphens.
+ interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
+ for lang, pagetitle in interwikiR.findall(text):
+ lang = lang.lower()
+ # Check if it really is in fact an interwiki link to a known
+ # language, or if it's e.g. a category tag or an internal link
+ if lang in insite.family.obsolete:
+ lang = insite.family.obsolete[lang]
+ if lang in insite.validLanguageLinks():
+ if '|' in pagetitle:
+ # ignore text after the pipe
+ pagetitle = pagetitle[:pagetitle.index('|')]
+ # we want the actual page objects rather than the titles
+ site = insite.getSite(code = lang)
+ try:
+ result[site] = pywikibot.Page(site, pagetitle, insite = insite)
+ except InvalidTitle:
+ output(
+ u"[getLanguageLinks] Text contains invalid interwiki link [[%s:%s]]."
+ % (lang, pagetitle))
+ continue
+ return result
+
+
+def removeLanguageLinks(text, site = None, marker = ''):
+ """Return text with all interlanguage links removed.
+
+ If a link to an unknown language is encountered, a warning is printed.
+ If a marker is defined, that string is placed at the location of the
+ last occurrence of an interwiki link (at the end if there are no
+ interwiki links).
+
+ """
+ if site == None:
+ site = pywikibot.getSite()
+ if not site.validLanguageLinks():
+ return text
+ # This regular expression will find every interwiki link, plus trailing
+ # whitespace.
+ languages = '|'.join(site.validLanguageLinks() + site.family.obsolete.keys())
+ interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*'
+ % languages, re.IGNORECASE)
+ text = replaceExcept(text, interwikiR, '',
+ ['nowiki', 'comment', 'math', 'pre', 'source'], marker=marker)
+ return text.strip()
+
+
+def replaceLanguageLinks(oldtext, new, site = None):
+ """Replace interlanguage links in the text with a new set of links.
+
+ 'new' should be a dict with the Site objects as keys, and Page objects
+ as values (i.e., just like the dict returned by getLanguageLinks
+ function).
+
+ """
+ # Find a marker that is not already in the text.
+ marker = '@@'
+ while marker in oldtext:
+ marker += '@'
+ if site == None:
+ site = pywikibot.getSite()
+ s = interwikiFormat(new, insite = site)
+ s2 = removeLanguageLinks(oldtext, site = site, marker = marker)
+ if s:
+ if site.language() in site.family.interwiki_attop:
+ newtext = s + site.family.interwiki_text_separator + s2.replace(marker,'').strip()
+ else:
+ # calculate what was after the language links on the page
+ firstafter = s2.find(marker) + len(marker)
+ # Is there any text in the 'after' part that means we should keep it after?
+ if "</noinclude>" in s2[firstafter:]:
+ newtext = s2[:firstafter] + s + s2[firstafter:]
+ elif site.language() in site.family.categories_last:
+ cats = getCategoryLinks(s2, site = site)
+ s2 = removeCategoryLinks(s2.replace(marker,'').strip(), site) + site.family.interwiki_text_separator + s
+ newtext = replaceCategoryLinks(s2, cats, site=site)
+ else:
+ newtext = s2.replace(marker,'').strip() + site.family.interwiki_text_separator + s
+ newtext = newtext.replace(marker,'')
+ else:
+ newtext = s2.replace(marker,'')
+ return newtext
+
+
+def interwikiFormat(links, insite = None):
+ """Convert interwiki link dict into a wikitext string.
+
+ 'links' should be a dict with the Site objects as keys, and Page
+ objects as values.
+
+ Return a unicode string that is formatted for inclusion in insite
+ (defaulting to the current site).
+ """
+ if insite is None:
+ insite = pywikibot.getSite()
+ if not links:
+ return ''
+
+ ar = interwikiSort(links.keys(), insite)
+ s = []
+ for site in ar:
+ try:
+ link = links[site].aslink(forceInterwiki=True)
+ s.append(link)
+ except AttributeError:
+ s.append(pywikibot.getSite(site).linkto(links[site],
+ othersite=insite))
+ if insite.lang in insite.family.interwiki_on_one_line:
+ sep = u' '
+ else:
+ sep = u'\r\n'
+ s=sep.join(s) + u'\r\n'
+ return s
+
+
+# Sort sites according to local interwiki sort logic
+def interwikiSort(sites, insite = None):
+ if insite is None:
+ insite = pywikibot.getSite()
+ if not sites:
+ return []
+
+ sites.sort()
+ putfirst = insite.interwiki_putfirst()
+ if putfirst:
+ #In this case I might have to change the order
+ firstsites = []
+ for code in putfirst:
+ # The code may not exist in this family?
+ if code in insite.family.obsolete:
+ code = insite.family.obsolete[code]
+ if code in insite.validLanguageLinks():
+ site = insite.getSite(code = code)
+ if site in sites:
+ del sites[sites.index(site)]
+ firstsites = firstsites + [site]
+ sites = firstsites + sites
+ if insite.interwiki_putfirst_doubled(sites): #some implementations return False
+ sites = insite.interwiki_putfirst_doubled(sites) + sites
+ return sites
+
+
+# Functions dealing with category links
+
+def getCategoryLinks(text, site):
+ """Return a list of category links found in text.
+
+ List contains Category objects.
+ Do not call this routine directly, use Page.categories() instead.
+
+ """
+ result = []
+ # Ignore category links within nowiki tags, pre tags, includeonly tags,
+ # and HTML comments
+ text = removeDisabledParts(text)
+ catNamespace = '|'.join(site.category_namespaces())
+ R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)'
+ r'(?:\|(?P<sortKey>.+?))?\s*\]\]'
+ % catNamespace, re.I)
+ for match in R.finditer(text):
+ cat = pywikibot.Category(site,
+ '%s:%s' % (match.group('namespace'),
+ match.group('catName')),
+ sortKey = match.group('sortKey'))
+ result.append(cat)
+ return result
+
+
+def removeCategoryLinks(text, site, marker = ''):
+ """Return text with all category links removed.
+
+ Put the string marker after the last replacement (at the end of the text
+ if there is no replacement).
+
+ """
+ # This regular expression will find every link that is possibly an
+ # interwiki link, plus trailing whitespace. The language code is grouped.
+ # NOTE: This assumes that language codes only consist of non-capital
+ # ASCII letters and hyphens.
+ catNamespace = '|'.join(site.category_namespaces())
+ categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\]\s*' % catNamespace, re.I)
+ text = replaceExcept(text, categoryR, '', ['nowiki', 'comment', 'math', 'pre', 'source'], marker = marker)
+ if marker:
+ #avoid having multiple linefeeds at the end of the text
+ text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, text.strip())
+ return text.strip()
+
+
+def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
+ """Replace the category oldcat with the category newcat and return
+ the modified text.
+
+ """
+ if site is None:
+ site = pywikibot.getSite()
+
+ catNamespace = '|'.join(site.category_namespaces())
+ title = oldcat.titleWithoutNamespace()
+ if not title:
+ return
+ # title might contain regex special characters
+ title = re.escape(title)
+ # title might not be capitalized correctly on the wiki
+ if title[0].isalpha() and not site.nocapitalize:
+ title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
+ # spaces and underscores in page titles are interchangeable, and collapsible
+ title = title.replace(r"\ ", "[ _]+").replace(r"\_",
"[ _]+")
+ categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
+ % (catNamespace, title), re.I)
+ if newcat is None:
+ text = replaceExcept(oldtext, categoryR, '',
+ ['nowiki', 'comment', 'math',
'pre', 'source'])
+ else:
+ text = replaceExcept(oldtext, categoryR,
+ '[[%s:%s\\2' % (site.namespace(14),
+ newcat.titleWithoutNamespace()),
+ ['nowiki', 'comment', 'math',
'pre', 'source'])
+ return text
+
+
+def replaceCategoryLinks(oldtext, new, site = None, addOnly = False):
+ """Replace the category links given in the wikitext given
+ in oldtext by the new links given in new.
+
+ 'new' should be a list of Category objects.
+
+ If addOnly is True, the old category won't be deleted and
+ the category(s) given will be added
+ (and so they won't replace anything).
+ """
+
+ # Find a marker that is not already in the text.
+ marker = '@@'
+ while marker in oldtext:
+ marker += '@'
+
+ if site is None:
+ site = pywikibot.getSite()
+ if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
+ raise Error('The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia on pages that contain the person data template because of the non-standard placement of that template. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006…)
+
+ s = categoryFormat(new, insite = site)
+ if addOnly:
+ s2 = oldtext
+ else:
+ s2 = removeCategoryLinks(oldtext, site = site, marker = marker)
+
+ if s:
+ if site.language() in site.family.category_attop:
+ newtext = s + site.family.category_text_separator + s2
+ else:
+ # calculate what was after the categories links on the page
+ firstafter = s2.find(marker)
+ # Is there any text in the 'after' part that means we should keep it after?
+ if "</noinclude>" in s2[firstafter:]:
+ newtext = s2[:firstafter] + s + s2[firstafter:]
+ elif site.language() in site.family.categories_last:
+ newtext = s2.replace(marker,'').strip() + site.family.category_text_separator + s
+ else:
+ interwiki = getLanguageLinks(s2)
+ s2 = removeLanguageLinks(s2.replace(marker,''), site) + site.family.category_text_separator + s
+ newtext = replaceLanguageLinks(s2, interwiki, site)
+ newtext = newtext.replace(marker,'')
+ else:
+ s2 = s2.replace(marker,'')
+ return s2
+ return newtext.strip()
+
+
+def categoryFormat(categories, insite = None):
+ """Return a string containing links to all categories in a list.
+
+ 'categories' should be a list of Category objects.
+
+ The string is formatted for inclusion in insite.
+
+ """
+ if not categories:
+ return ''
+ if insite is None:
+ insite = pywikibot.getSite()
+ catLinks = [category.aslink(noInterwiki = True) for category in categories]
+ if insite.category_on_one_line():
+ sep = ' '
+ else:
+ sep = '\r\n'
+ # Some people don't like the categories sorted
+ #catLinks.sort()
+ return sep.join(catLinks) + '\r\n'
+
+
+def compileLinkR(withoutBracketed=False, onlyBracketed=False):
+ """Return a regex that matches external links."""
+ # RFC 2396 says that URLs may only contain certain characters.
+ # For this regex we also accept non-allowed characters, so that the bot
+ # will later show these links as broken ('Non-ASCII Characters in URL').
+ # Note: While allowing parenthesis inside URLs, MediaWiki will regard
+ # right parenthesis at the end of the URL as not part of that URL.
+ # The same applies to dot, comma, colon and some other characters.
+ notAtEnd = '\]\s\)\.:;,<>"'
+ # So characters inside the URL can be anything except whitespace,
+ # closing squared brackets, quotation marks, greater than and less
+ # than, and the last character also can't be parenthesis or another
+ # character disallowed by MediaWiki.
+ notInside = '\]\s<>"'
+ # The first half of this regular expression is required because '' is
+ # not allowed inside links. For example, in this wiki text:
+ # ''Please see http://www.example.org.''
+ # .'' shouldn't be considered as part of the link.
+ regex = r'(?P<url>http[s]?://[^' + notInside + ']*?[^' + notAtEnd + '](?=[' + notAtEnd+ ']*\'\')|http[s]?://[^' + notInside + ']*[^' + notAtEnd + '])'
+
+ if withoutBracketed:
+ regex = r'(?<!\[)' + regex
+ elif onlyBracketed:
+ regex = r'\[' + regex
+ linkR = re.compile(regex)
+ return linkR
+
+def extract_templates_and_params(text, get_redirect=False):
+ """Return list of template calls found in text.
+
+ Return value is a list of tuples. There is one tuple for each use of a
+ template in the page, with the template title as the first entry and a
+ dict of parameters as the second entry. Parameters are indexed by
+ strings; as in MediaWiki, an unnamed parameter is given a parameter name
+ with an integer value corresponding to its position among the unnamed
+ parameters, and if this results in multiple parameters with the same name
+ only the last value provided will be returned.
+
+ """
+ # remove commented-out stuff etc.
+ thistxt = removeDisabledParts(text)
+
+ # marker for inside templates or parameters
+ marker = u'@@'
+ while marker in thistxt:
+ marker += u'@'
+
+ # marker for links
+ marker2 = u'##'
+ while marker2 in thistxt:
+ marker2 += u'#'
+
+ # marker for math
+ marker3 = u'%%'
+ while marker2 in thistxt:
+ marker3 += u'%'
+
+ result = []
+ inside = {}
+ count = 0
+ Rtemplate = re.compile(
+ ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
+ Rmath = re.compile(ur'<math>[^<]+</math>')
+ Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
+ Rmarker2 = re.compile(ur'%s(\d+)%s' % (marker2, marker2))
+ Rmarker3 = re.compile(ur'%s(\d+)%s' % (marker3, marker3))
+
+ # Replace math with markers
+ maths = {}
+ count = 0
+ for m in Rmath.finditer(thistxt):
+ count += 1
+ text = m.group()
+ thistxt = thistxt.replace(text, '%s%d%s' % (marker3, count, marker3))
+ maths[count] = text
+
+ while Rtemplate.search(thistxt) is not None:
+ for m in Rtemplate.finditer(thistxt):
+ # Make sure it is not detected again
+ count += 1
+ text = m.group()
+ thistxt = thistxt.replace(text,
+ '%s%d%s' % (marker, count, marker))
+ # Make sure stored templates don't contain markers
+ for m2 in Rmarker.finditer(text):
+ text = text.replace(m2.group(), inside[int(m2.group(1))])
+ for m2 in Rmarker3.finditer(text):
+ text = text.replace(m2.group(), maths[int(m2.group(1))])
+ inside[count] = text
+
+ # Name
+ name = m.group('name').strip()
+ m2 = Rmarker.search(name) or Rmath.search(name)
+ if m2 is not None:
+ # Doesn't detect templates whose name changes,
+ # or templates whose name contains math tags
+ continue
+ # Parameters
+ paramString = m.group('params')
+ params = {}
+ numbered_param = 1
+ if paramString:
+ # Replace wikilinks with markers
+ links = {}
+ count2 = 0
+ for m2 in pywikibot.link_regex.finditer(paramString):
+ count2 += 1
+ text = m2.group(0)
+ paramString = paramString.replace(text,
+ '%s%d%s' % (marker2, count2, marker2))
+ links[count2] = text
+ # Parse string
+ markedParams = paramString.split('|')
+ # Replace markers
+ for param in markedParams:
+ if "=" in param:
+ param_name, param_val = param.split("=", 1)
+ else:
+ param_name = unicode(numbered_param)
+ param_val = param
+ numbered_param += 1
+ for m2 in Rmarker.finditer(param_val):
+ param_val = param_val.replace(m2.group(),
+ inside[int(m2.group(1))])
+ for m2 in Rmarker2.finditer(param_val):
+ param_val = param_val.replace(m2.group(),
+ links[int(m2.group(1))])
+ for m2 in Rmarker3.finditer(param_val):
+ param_val = param_val.replace(m2.group(),
+ maths[int(m2.group(1))])
+ params[param_name] = param_val
+
+ # Add it to the result
+ result.append((name, params))
+ return result
+
Property changes on: branches/rewrite/pywikibot/textlib.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
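
A minimal usage sketch for the helpers in the new textlib.py (illustrative only; it assumes a working pywikibot user-config.py so that pywikibot.getSite() returns a valid Site, and the sample wikitext, replacement values and variable names below are made up, not part of the framework):

    # -*- coding: utf-8 -*-
    import pywikibot
    from pywikibot import textlib

    sample = (u"Intro mentioning Foo.\n"
              u"<nowiki>Foo stays untouched in here.</nowiki>\n"
              u"{{Infobox|name=Foo|1876}}\n"
              u"[[Category:Example]]\n")

    site = pywikibot.getSite()

    # Replace 'Foo' everywhere except inside nowiki tags and HTML comments.
    new_text = textlib.replaceExcept(sample, r"Foo", u"Bar",
                                     ['nowiki', 'comment'], site=site)

    # Category links outside nowiki/pre/comment sections, as Category objects.
    cats = textlib.getCategoryLinks(sample, site)

    # Template calls as (title, {param: value}) tuples.
    for name, params in textlib.extract_templates_and_params(sample):
        print name, params
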
Modified: branches/rewrite/pywikibot/throttle.py
===================================================================
--- branches/rewrite/pywikibot/throttle.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/throttle.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,275 +1,275 @@
-# -*- coding: utf-8 -*-
-"""
-Mechanics to slow down wiki read and/or write rate.
-"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-import pywikibot
-from pywikibot import config2 as config
-
-import logging
-import math
-import threading
-import time
-
-logger = logging.getLogger("wiki.throttle")
-
-pid = False # global process identifier
- # when the first Throttle is instantiated, it will set this
- # variable to a positive integer, which will apply to all
- # throttle objects created by this process.
-
-
-class Throttle(object):
- """Control rate of access to wiki server
-
- Calling this object blocks the calling thread until at least 'delay'
- seconds have passed since the previous call.
-
- Each Site initiates one Throttle object (site.throttle) to control the
- rate of access.
-
- """
- def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None,
- multiplydelay=True, verbosedelay=False):
- self.lock = threading.RLock()
- self.mysite = str(site)
- self.logfn = config.datafilepath('throttle.log')
- self.mindelay = mindelay
- if self.mindelay is None:
- self.mindelay = config.minthrottle
- self.maxdelay = maxdelay
- if self.maxdelay is None:
- self.maxdelay = config.maxthrottle
- self.writedelay = writedelay
- self.last_read = 0
- self.last_write = 0
- self.next_multiplicity = 1.0
- self.checkdelay = 300 # Check logfile again after this many seconds
- self.dropdelay = 600 # Ignore processes that have not made
- # a check in this many seconds
- self.releasepid = 1200 # Free the process id after this many seconds
- self.lastwait = 0.0
- self.delay = 0
- self.verbosedelay = verbosedelay
- if multiplydelay:
- self.checkMultiplicity()
- self.setDelays()
-
- def checkMultiplicity(self):
- """Count running processes for site and set
process_multiplicity."""
- global pid
- self.lock.acquire()
- mysite = self.mysite
- logger.debug("Checking multiplicity: pid = %(pid)s" % globals())
- try:
- processes = []
- my_pid = pid or 1 # start at 1 if global pid not yet set
- count = 1
- # open throttle.log
- try:
- f = open(self.logfn, 'r')
- except IOError:
- if not pid:
- pass
- else:
- raise
- else:
- now = time.time()
- for line in f.readlines():
- # parse line; format is "pid timestamp site"
- try:
- line = line.split(' ')
- this_pid = int(line[0])
- ptime = int(line[1].split('.')[0])
- this_site = line[2].rstrip()
- except (IndexError, ValueError):
- continue # Sometimes the file gets corrupted
- # ignore that line
- if now - ptime > self.releasepid:
- continue # process has expired, drop from file
- if now - ptime <= self.dropdelay \
- and this_site == mysite \
- and this_pid != pid:
- count += 1
- if this_site != self.mysite or this_pid != pid:
- processes.append({'pid': this_pid,
- 'time': ptime,
- 'site': this_site})
- if not pid and this_pid >= my_pid:
- my_pid = this_pid+1 # next unused process id
-
- if not pid:
- pid = my_pid
- self.checktime = time.time()
- processes.append({'pid': pid,
- 'time': self.checktime,
- 'site': mysite})
- f = open(self.logfn, 'w')
- processes.sort(key=lambda p:(p['pid'], p['site']))
- for p in processes:
- f.write("%(pid)s %(time)s %(site)s\n" % p)
- f.close()
- self.process_multiplicity = count
- if self.verbosedelay:
- logger.info(
-u"Found %(count)s %(mysite)s processes running, including this one."
- % locals())
- finally:
- self.lock.release()
-
- def setDelays(self, delay=None, writedelay=None, absolute=False):
- """Set the nominal delays in seconds. Defaults to config
values."""
- self.lock.acquire()
- try:
- maxdelay = self.maxdelay
- if delay is None:
- delay = self.mindelay
- if writedelay is None:
- writedelay = config.put_throttle
- if absolute:
- self.maxdelay = delay
- self.mindelay = delay
- self.delay = delay
- self.writedelay = min(max(self.mindelay, writedelay),
- self.maxdelay)
- # Start the delay count now, not at the next check
- self.last_read = self.last_write = time.time()
- finally:
- self.lock.release()
-
- def getDelay(self, write=False):
- """Return the actual delay, accounting for multiple processes.
-
- This value is the maximum wait between reads/writes, not taking
- account of how much time has elapsed since the last access.
-
- """
- global pid
- if write:
- thisdelay = self.writedelay
- else:
- thisdelay = self.delay
- if pid: # If set, we're checking for multiple processes
- if time.time() > self.checktime + self.checkdelay:
- self.checkMultiplicity()
- if thisdelay < (self.mindelay * self.next_multiplicity):
- thisdelay = self.mindelay * self.next_multiplicity
- elif thisdelay > self.maxdelay:
- thisdelay = self.maxdelay
- thisdelay *= self.process_multiplicity
- return thisdelay
-
- def waittime(self, write=False):
- """Return waiting time in seconds if a query would be made right
now"""
- # Take the previous requestsize in account calculating the desired
- # delay this time
- thisdelay = self.getDelay(write=write)
- now = time.time()
- if write:
- ago = now - self.last_write
- else:
- ago = now - self.last_read
- if ago < thisdelay:
- delta = thisdelay - ago
- return delta
- else:
- return 0.0
-
- def drop(self):
- """Remove me from the list of running bot
processes."""
- # drop all throttles with this process's pid, regardless of site
- self.checktime = 0
- processes = []
- try:
- f = open(self.logfn, 'r')
- except IOError:
- return
- else:
- now = time.time()
- for line in f.readlines():
- try:
- line = line.split(' ')
- this_pid = int(line[0])
- ptime = int(line[1].split('.')[0])
- this_site = line[2].rstrip()
- except (IndexError,ValueError):
- continue # Sometimes the file gets corrupted
- # ignore that line
- if now - ptime <= self.releasepid \
- and this_pid != pid:
- processes.append({'pid': this_pid,
- 'time': ptime,
- 'site': this_site})
- f = open(self.logfn, 'w')
- processes.sort(key=lambda p:p['pid'])
- for p in processes:
- f.write("%(pid)s %(time)s %(site)s\n" % p)
- f.close()
-
- def __call__(self, requestsize=1, write=False):
- """
- Block the calling program if the throttle time has not expired.
-
- Parameter requestsize is the number of Pages to be read/written;
- multiply delay time by an appropriate factor.
-
- Because this seizes the throttle lock, it will prevent any other
- thread from writing to the same site until the wait expires.
-
- """
- self.lock.acquire()
- try:
- wait = self.waittime(write=write)
- # Calculate the multiplicity of the next delay based on how
- # big the request is that is being posted now.
- # We want to add "one delay" for each factor of two in the
- # size of the request. Getting 64 pages at once allows 6 times
- # the delay time for the server.
- self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
- # Announce the delay if it exceeds a preset limit
- if wait > config.noisysleep:
- logger.info(u"Sleeping for %(wait).1f seconds, %(now)s"
- % {'wait': wait,
- 'now': time.strftime("%Y-%m-%d
%H:%M:%S",
- time.localtime())
- } )
- time.sleep(wait)
- if write:
- self.last_write = time.time()
- else:
- self.last_read = time.time()
- finally:
- self.lock.release()
-
- def lag(self, lagtime):
- """
- Seize the throttle lock due to server lag.
-
- This will prevent any thread from accessing this site.
-
- """
- started = time.time()
- self.lock.acquire()
- try:
- # start at 1/2 the current server lag time
- # wait at least 5 seconds but not more than 120 seconds
- delay = min(max(5, lagtime//2), 120)
- # account for any time we waited while acquiring the lock
- wait = delay - (time.time() - started)
- if wait > 0:
- if wait > config.noisysleep:
- logger.info(u"Sleeping for %(wait).1f seconds, %(now)s"
- % {'wait': wait,
- 'now': time.strftime("%Y-%m-%d
%H:%M:%S",
- time.localtime())
- } )
- time.sleep(wait)
- finally:
- self.lock.release()
-
+# -*- coding: utf-8 -*-
+"""
+Mechanics to slow down wiki read and/or write rate.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot import config2 as config
+
+import logging
+import math
+import threading
+import time
+
+logger = logging.getLogger("wiki.throttle")
+
+pid = False # global process identifier
+ # when the first Throttle is instantiated, it will set this
+ # variable to a positive integer, which will apply to all
+ # throttle objects created by this process.
+
+
+class Throttle(object):
+ """Control rate of access to wiki server
+
+ Calling this object blocks the calling thread until at least 'delay'
+ seconds have passed since the previous call.
+
+ Each Site initiates one Throttle object (site.throttle) to control the
+ rate of access.
+
+ """
+ def __init__(self, site, mindelay=None, maxdelay=None, writedelay=None,
+ multiplydelay=True, verbosedelay=False):
+ self.lock = threading.RLock()
+ self.mysite = str(site)
+ self.logfn = config.datafilepath('throttle.log')
+ self.mindelay = mindelay
+ if self.mindelay is None:
+ self.mindelay = config.minthrottle
+ self.maxdelay = maxdelay
+ if self.maxdelay is None:
+ self.maxdelay = config.maxthrottle
+ self.writedelay = writedelay
+ self.last_read = 0
+ self.last_write = 0
+ self.next_multiplicity = 1.0
+ self.checkdelay = 300 # Check logfile again after this many seconds
+ self.dropdelay = 600 # Ignore processes that have not made
+ # a check in this many seconds
+ self.releasepid = 1200 # Free the process id after this many seconds
+ self.lastwait = 0.0
+ self.delay = 0
+ self.verbosedelay = verbosedelay
+ if multiplydelay:
+ self.checkMultiplicity()
+ self.setDelays()
+
+ def checkMultiplicity(self):
+ """Count running processes for site and set
process_multiplicity."""
+ global pid
+ self.lock.acquire()
+ mysite = self.mysite
+ logger.debug("Checking multiplicity: pid = %(pid)s" % globals())
+ try:
+ processes = []
+ my_pid = pid or 1 # start at 1 if global pid not yet set
+ count = 1
+ # open throttle.log
+ try:
+ f = open(self.logfn, 'r')
+ except IOError:
+ if not pid:
+ pass
+ else:
+ raise
+ else:
+ now = time.time()
+ for line in f.readlines():
+ # parse line; format is "pid timestamp site"
+ try:
+ line = line.split(' ')
+ this_pid = int(line[0])
+ ptime = int(line[1].split('.')[0])
+ this_site = line[2].rstrip()
+ except (IndexError, ValueError):
+ continue # Sometimes the file gets corrupted
+ # ignore that line
+ if now - ptime > self.releasepid:
+ continue # process has expired, drop from file
+ if now - ptime <= self.dropdelay \
+ and this_site == mysite \
+ and this_pid != pid:
+ count += 1
+ if this_site != self.mysite or this_pid != pid:
+ processes.append({'pid': this_pid,
+ 'time': ptime,
+ 'site': this_site})
+ if not pid and this_pid >= my_pid:
+ my_pid = this_pid+1 # next unused process id
+
+ if not pid:
+ pid = my_pid
+ self.checktime = time.time()
+ processes.append({'pid': pid,
+ 'time': self.checktime,
+ 'site': mysite})
+ f = open(self.logfn, 'w')
+ processes.sort(key=lambda p:(p['pid'], p['site']))
+ for p in processes:
+ f.write("%(pid)s %(time)s %(site)s\n" % p)
+ f.close()
+ self.process_multiplicity = count
+ if self.verbosedelay:
+ logger.info(
+u"Found %(count)s %(mysite)s processes running, including this one."
+ % locals())
+ finally:
+ self.lock.release()
+
+ def setDelays(self, delay=None, writedelay=None, absolute=False):
+ """Set the nominal delays in seconds. Defaults to config
values."""
+ self.lock.acquire()
+ try:
+ maxdelay = self.maxdelay
+ if delay is None:
+ delay = self.mindelay
+ if writedelay is None:
+ writedelay = config.put_throttle
+ if absolute:
+ self.maxdelay = delay
+ self.mindelay = delay
+ self.delay = delay
+ self.writedelay = min(max(self.mindelay, writedelay),
+ self.maxdelay)
+ # Start the delay count now, not at the next check
+ self.last_read = self.last_write = time.time()
+ finally:
+ self.lock.release()
+
+ def getDelay(self, write=False):
+ """Return the actual delay, accounting for multiple processes.
+
+ This value is the maximum wait between reads/writes, not taking
+ account of how much time has elapsed since the last access.
+
+ """
+ global pid
+ if write:
+ thisdelay = self.writedelay
+ else:
+ thisdelay = self.delay
+ if pid: # If set, we're checking for multiple processes
+ if time.time() > self.checktime + self.checkdelay:
+ self.checkMultiplicity()
+ if thisdelay < (self.mindelay * self.next_multiplicity):
+ thisdelay = self.mindelay * self.next_multiplicity
+ elif thisdelay > self.maxdelay:
+ thisdelay = self.maxdelay
+ thisdelay *= self.process_multiplicity
+ return thisdelay
+
+ def waittime(self, write=False):
+ """Return waiting time in seconds if a query would be made right
now"""
+ # Take the previous requestsize in account calculating the desired
+ # delay this time
+ thisdelay = self.getDelay(write=write)
+ now = time.time()
+ if write:
+ ago = now - self.last_write
+ else:
+ ago = now - self.last_read
+ if ago < thisdelay:
+ delta = thisdelay - ago
+ return delta
+ else:
+ return 0.0
+
+ def drop(self):
+ """Remove me from the list of running bot
processes."""
+ # drop all throttles with this process's pid, regardless of site
+ self.checktime = 0
+ processes = []
+ try:
+ f = open(self.logfn, 'r')
+ except IOError:
+ return
+ else:
+ now = time.time()
+ for line in f.readlines():
+ try:
+ line = line.split(' ')
+ this_pid = int(line[0])
+ ptime = int(line[1].split('.')[0])
+ this_site = line[2].rstrip()
+ except (IndexError,ValueError):
+ continue # Sometimes the file gets corrupted
+ # ignore that line
+ if now - ptime <= self.releasepid \
+ and this_pid != pid:
+ processes.append({'pid': this_pid,
+ 'time': ptime,
+ 'site': this_site})
+ f = open(self.logfn, 'w')
+ processes.sort(key=lambda p:p['pid'])
+ for p in processes:
+ f.write("%(pid)s %(time)s %(site)s\n" % p)
+ f.close()
+
+ def __call__(self, requestsize=1, write=False):
+ """
+ Block the calling program if the throttle time has not expired.
+
+ Parameter requestsize is the number of Pages to be read/written;
+ multiply delay time by an appropriate factor.
+
+ Because this seizes the throttle lock, it will prevent any other
+ thread from writing to the same site until the wait expires.
+
+ """
+ self.lock.acquire()
+ try:
+ wait = self.waittime(write=write)
+ # Calculate the multiplicity of the next delay based on how
+ # big the request is that is being posted now.
+ # We want to add "one delay" for each factor of two in the
+ # size of the request. Getting 64 pages at once allows 6 times
+ # the delay time for the server.
+ self.next_multiplicity = math.log(1+requestsize)/math.log(2.0)
+ # Announce the delay if it exceeds a preset limit
+ if wait > config.noisysleep:
+ logger.info(u"Sleeping for %(wait).1f seconds, %(now)s"
+ % {'wait': wait,
+ 'now': time.strftime("%Y-%m-%d
%H:%M:%S",
+ time.localtime())
+ } )
+ time.sleep(wait)
+ if write:
+ self.last_write = time.time()
+ else:
+ self.last_read = time.time()
+ finally:
+ self.lock.release()
+
+ def lag(self, lagtime):
+ """
+ Seize the throttle lock due to server lag.
+
+ This will prevent any thread from accessing this site.
+
+ """
+ started = time.time()
+ self.lock.acquire()
+ try:
+ # start at 1/2 the current server lag time
+ # wait at least 5 seconds but not more than 120 seconds
+ delay = min(max(5, lagtime//2), 120)
+ # account for any time we waited while acquiring the lock
+ wait = delay - (time.time() - started)
+ if wait > 0:
+ if wait > config.noisysleep:
+ logger.info(u"Sleeping for %(wait).1f seconds, %(now)s"
+ % {'wait': wait,
+ 'now': time.strftime("%Y-%m-%d
%H:%M:%S",
+ time.localtime())
+ } )
+ time.sleep(wait)
+ finally:
+ self.lock.release()
+
Property changes on: branches/rewrite/pywikibot/throttle.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
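
A short sketch of how the Throttle above is meant to be driven; in the framework each Site object owns its own instance (site.throttle), so the site name and delay values here are illustrative only, and a working pywikibot configuration is assumed so that config2 can be imported:

    # -*- coding: utf-8 -*-
    import time
    from pywikibot.throttle import Throttle

    # One throttle per site; multiplydelay=False keeps the example independent
    # of any other bot processes recorded in throttle.log.
    throttle = Throttle("wikipedia:test", mindelay=1, maxdelay=10,
                        multiplydelay=False)

    for i in range(3):
        # Blocks until at least the configured read delay has elapsed since
        # the previous call; write=True would use the (longer) write delay.
        throttle(requestsize=1, write=False)
        print "simulated read %d at %s" % (i, time.strftime("%H:%M:%S"))
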
Modified: branches/rewrite/pywikibot/tools.py
===================================================================
--- branches/rewrite/pywikibot/tools.py 2008-12-16 19:34:48 UTC (rev 6155)
+++ branches/rewrite/pywikibot/tools.py 2008-12-16 19:40:20 UTC (rev 6156)
@@ -1,174 +1,174 @@
-# -*- coding: utf-8 -*-
-"""Miscellaneous helper functions (not wiki-dependent)"""
-#
-# (C) Pywikipedia bot team, 2008
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id: $'
-
-import sys
-import threading
-import time
-import Queue
-
-
-class ThreadedGenerator(threading.Thread):
- """Look-ahead generator class.
-
- Runs a generator in a separate thread and queues the results; can
- be called like a regular generator.
-
- Subclasses should override self.generator, I{not} self.run
-
- Important: the generator thread will stop itself if the generator's
- internal queue is exhausted; but, if the calling program does not use
- all the generated values, it must call the generator's stop() method to
- stop the background thread. Example usage:
-
- >>> gen = ThreadedGenerator(target=xrange, args=(20,))
- >>> try:
- ... for data in gen:
- ... print data,
- ... finally:
- ... gen.stop()
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
-
- """
-
- def __init__(self, group=None, target=None, name="GeneratorThread",
- args=(), kwargs=None, qsize=65536):
- """Constructor. Takes same keyword arguments as
threading.Thread.
-
- target must be a generator function (or other callable that returns
- an iterable object).
-
- @param qsize: The size of the lookahead queue. The larger the qsize,
- the more values will be computed in advance of use (which can eat
- up memory and processor time).
- @type qsize: int
-
- """
- if kwargs is None:
- kwargs = {}
- if target:
- self.generator = target
- if not hasattr(self, "generator"):
- raise RuntimeError("No generator for ThreadedGenerator to run.")
- self.args, self.kwargs = args, kwargs
- threading.Thread.__init__(self, group=group, name=name)
- self.queue = Queue.Queue(qsize)
- self.finished = threading.Event()
-
- def __iter__(self):
- """Iterate results from the queue."""
- if not self.isAlive() and not self.finished.isSet():
- self.start()
- # if there is an item in the queue, yield it, otherwise wait
- while not self.finished.isSet():
- try:
- yield self.queue.get(True, 0.25)
- except Queue.Empty:
- pass
- except KeyboardInterrupt:
- self.stop()
-
- def stop(self):
- """Stop the background thread."""
- self.finished.set()
-
- def run(self):
- """Run the generator and store the results on the
queue."""
- self.__gen = self.generator(*self.args, **self.kwargs)
- for result in self.__gen:
- while True:
- if self.finished.isSet():
- return
- try:
- self.queue.put_nowait(result)
- except Queue.Full:
- time.sleep(0.25)
- continue
- break
- # wait for queue to be emptied, then kill the thread
- while not self.finished.isSet() and not self.queue.empty():
- time.sleep(0.25)
- self.stop()
-
-
-def itergroup(iterable, size):
- """Make an iterator that returns lists of (up to) size items from
iterable.
-
- Example:
-
- >>> i = itergroup(xrange(25), 10)
- >>> print i.next()
- [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
- >>> print i.next()
- [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
- >>> print i.next()
- [20, 21, 22, 23, 24]
- >>> print i.next()
- Traceback (most recent call last):
- ...
- StopIteration
-
- """
- group = []
- for item in iterable:
- group.append(item)
- if len(group) == size:
- yield group
- group = []
- if group:
- yield group
-
-
-class ThreadList(list):
- """A simple threadpool class to limit the number of simultaneous
threads.
-
- Any threading.Thread object can be added to the pool using the append()
- method. If the maximum number of simultaneous threads has not been reached,
- the Thread object will be started immediately; if not, the append() call
- will block until the thread is able to start.
-
- >>> pool = ThreadList(limit=10)
- >>> def work():
- ... time.sleep(1)
- ...
- >>> for x in xrange(20):
- ... pool.append(threading.Thread(target=work))
- ...
-
- """
- def __init__(self, limit=sys.maxint, *args):
- self.limit = limit
- list.__init__(self, *args)
- for item in list(self):
- if not isinstance(threading.Thread, item):
- raise TypeError("Cannot add '%s' to ThreadList" %
type(item))
-
- def active_count(self):
- """Return the number of alive threads, and delete all non-alive
ones."""
- count = 0
- for item in list(self):
- if item.isAlive():
- count += 1
- else:
- self.remove(item)
- return count
-
- def append(self, thd):
- if not isinstance(thd, threading.Thread):
- raise TypeError("Cannot append '%s' to ThreadList" %
type(thd))
- while self.active_count() >= self.limit:
- time.sleep(2)
- list.append(self, thd)
- thd.start()
-
-
-if __name__ == "__main__":
- def _test():
- import doctest
- doctest.testmod()
- _test()
+# -*- coding: utf-8 -*-
+"""Miscellaneous helper functions (not wiki-dependent)"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import sys
+import threading
+import time
+import Queue
+
+
+class ThreadedGenerator(threading.Thread):
+ """Look-ahead generator class.
+
+ Runs a generator in a separate thread and queues the results; can
+ be called like a regular generator.
+
+ Subclasses should override self.generator, I{not} self.run
+
+ Important: the generator thread will stop itself if the generator's
+ internal queue is exhausted; but, if the calling program does not use
+ all the generated values, it must call the generator's stop() method to
+ stop the background thread. Example usage:
+
+ >>> gen = ThreadedGenerator(target=xrange, args=(20,))
+ >>> try:
+ ... for data in gen:
+ ... print data,
+ ... finally:
+ ... gen.stop()
+ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
+ """
+
+ def __init__(self, group=None, target=None, name="GeneratorThread",
+ args=(), kwargs=None, qsize=65536):
+ """Constructor. Takes same keyword arguments as
threading.Thread.
+
+ target must be a generator function (or other callable that returns
+ an iterable object).
+
+ @param qsize: The size of the lookahead queue. The larger the qsize,
+ the more values will be computed in advance of use (which can eat
+ up memory and processor time).
+ @type qsize: int
+
+ """
+ if kwargs is None:
+ kwargs = {}
+ if target:
+ self.generator = target
+ if not hasattr(self, "generator"):
+ raise RuntimeError("No generator for ThreadedGenerator to run.")
+ self.args, self.kwargs = args, kwargs
+ threading.Thread.__init__(self, group=group, name=name)
+ self.queue = Queue.Queue(qsize)
+ self.finished = threading.Event()
+
+ def __iter__(self):
+ """Iterate results from the queue."""
+ if not self.isAlive() and not self.finished.isSet():
+ self.start()
+ # if there is an item in the queue, yield it, otherwise wait
+ while not self.finished.isSet():
+ try:
+ yield self.queue.get(True, 0.25)
+ except Queue.Empty:
+ pass
+ except KeyboardInterrupt:
+ self.stop()
+
+ def stop(self):
+ """Stop the background thread."""
+ self.finished.set()
+
+ def run(self):
+ """Run the generator and store the results on the
queue."""
+ self.__gen = self.generator(*self.args, **self.kwargs)
+ for result in self.__gen:
+ while True:
+ if self.finished.isSet():
+ return
+ try:
+ self.queue.put_nowait(result)
+ except Queue.Full:
+ time.sleep(0.25)
+ continue
+ break
+ # wait for queue to be emptied, then kill the thread
+ while not self.finished.isSet() and not self.queue.empty():
+ time.sleep(0.25)
+ self.stop()
+
+
+def itergroup(iterable, size):
+ """Make an iterator that returns lists of (up to) size items from
iterable.
+
+ Example:
+
+ >>> i = itergroup(xrange(25), 10)
+ >>> print i.next()
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+ >>> print i.next()
+ [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
+ >>> print i.next()
+ [20, 21, 22, 23, 24]
+ >>> print i.next()
+ Traceback (most recent call last):
+ ...
+ StopIteration
+
+ """
+ group = []
+ for item in iterable:
+ group.append(item)
+ if len(group) == size:
+ yield group
+ group = []
+ if group:
+ yield group
+
+
+class ThreadList(list):
+ """A simple threadpool class to limit the number of simultaneous
threads.
+
+ Any threading.Thread object can be added to the pool using the append()
+ method. If the maximum number of simultaneous threads has not been reached,
+ the Thread object will be started immediately; if not, the append() call
+ will block until the thread is able to start.
+
+ >>> pool = ThreadList(limit=10)
+ >>> def work():
+ ... time.sleep(1)
+ ...
+ >>> for x in xrange(20):
+ ... pool.append(threading.Thread(target=work))
+ ...
+
+ """
+ def __init__(self, limit=sys.maxint, *args):
+ self.limit = limit
+ list.__init__(self, *args)
+ for item in list(self):
+ if not isinstance(threading.Thread, item):
+ raise TypeError("Cannot add '%s' to ThreadList" %
type(item))
+
+ def active_count(self):
+ """Return the number of alive threads, and delete all non-alive
ones."""
+ count = 0
+ for item in list(self):
+ if item.isAlive():
+ count += 1
+ else:
+ self.remove(item)
+ return count
+
+ def append(self, thd):
+ if not isinstance(thd, threading.Thread):
+ raise TypeError("Cannot append '%s' to ThreadList" %
type(thd))
+ while self.active_count() >= self.limit:
+ time.sleep(2)
+ list.append(self, thd)
+ thd.start()
+
+
+if __name__ == "__main__":
+ def _test():
+ import doctest
+ doctest.testmod()
+ _test()
Property changes on: branches/rewrite/pywikibot/tools.py
___________________________________________________________________
Added: svn:keywords
+ Author Date Id Revision
Added: svn:eol-style
+ native
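
A combined sketch (Python 2, like the code above) of the three helpers in tools.py; the generator and worker functions are illustrative stand-ins, not part of the framework:

    # -*- coding: utf-8 -*-
    import threading
    import time
    from pywikibot.tools import ThreadedGenerator, itergroup, ThreadList

    def slow_numbers(n):
        # stand-in for an expensive generator, e.g. one that queries a wiki
        for i in xrange(n):
            time.sleep(0.01)
            yield i

    def work(batch):
        # stand-in for per-batch processing
        time.sleep(0.05)
        print "processed batch starting with", batch[0]

    gen = ThreadedGenerator(target=slow_numbers, args=(50,))
    pool = ThreadList(limit=4)   # at most 4 worker threads alive at a time
    try:
        for batch in itergroup(gen, 10):
            # itergroup yields lists of up to 10 values from the background thread
            pool.append(threading.Thread(target=work, args=(batch,)))
    finally:
        gen.stop()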