Revision: 5088
Author: russblau
Date: 2008-02-27 20:08:48 +0000 (Wed, 27 Feb 2008)
Log Message:
-----------
Committing page and site modules, related tests
Modified Paths:
--------------
branches/rewrite/pywikibot/__init__.py
branches/rewrite/pywikibot/config.py
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/family.py
branches/rewrite/pywikibot/login.py
branches/rewrite/pywikibot/tests/api_tests.py
Added Paths:
-----------
branches/rewrite/pywikibot/exceptions.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/__init__.py
===================================================================
--- branches/rewrite/pywikibot/__init__.py 2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/__init__.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+The initialization file for the Pywikibot framework.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+
+from exceptions import *
+
+from page import Page, ImagePage, Category
+
+import config
+
+_sites = {}
+default_family = config.family
+default_code = config.mylang
+
+def Site(code=None, fam=None, user=None, interface=None):
+ """Return the specified Site object.
+
+ Returns a cached object if possible, otherwise instantiates a new one.
+
+ @param code: language code
+ @type code: string
+ @param fam: family name or object
+ @type fam: string or Family
+ @param user: bot user name to use on this site
+ @type user: unicode
+
+ """
+ if code is None:
+ code = default_code
+ if fam is None:
+ fam = default_family
+ if user is None:
+ try:
+ user = config.usernames[fam][code]
+ except KeyError:
+ user = None
+ if interface is None:
+ interface = config.site_interface
+ try:
+ exec "from site import %s as __Site" % interface
+ except ImportError:
+ raise ValueError("Invalid interface name '%s'" % interface)
+ key = '%s:%s:%s' % (fam, code, user)
+ if not _sites.has_key(key):
+ _sites[key] = __Site(code=code, fam=fam, user=user)
+ return _sites[key]
+
+getSite = Site # alias for backwards-compatibility
+
+# DEBUG
+
+def output(text):
+ print text
+
+def input(prompt, password=False):
+ if password:
+ import getpass
+ return getpass.getpass(prompt)
+ return raw_input(prompt)
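
The Site() factory above caches one Site object per (family, code, user) triple. A minimal usage sketch (the "en"/"wikipedia" values are illustrative; omitted arguments fall back to the config defaults):

    import pywikibot

    site = pywikibot.Site("en", "wikipedia")
    # a second call with the same family, code and user returns the cached object
    assert site is pywikibot.Site("en", "wikipedia")
    # getSite() is the backwards-compatible alias
    assert site is pywikibot.getSite("en", "wikipedia")
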
Modified: branches/rewrite/pywikibot/config.py
===================================================================
--- branches/rewrite/pywikibot/config.py 2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/config.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -26,7 +26,9 @@
family = 'wikipedia'
# The language code of the site we're working on.
mylang = 'language'
-
+# The default interface for communicating with the site
+# currently the only defined interface is 'APISite', so don't change this!
+site_interface = 'APISite'
# The dictionary usernames should contain a username for each site where you
# have a bot account. Please set your usernames by adding such lines to your
# user-config.py:
@@ -71,8 +73,49 @@
# Get the names of all known families, and initialize
# with empty dictionaries
-import wikipediatools as _wt
-_base_dir = _wt.get_base_dir()
+def _get_base_dir():
+ """Return the directory in which user-specific information is stored.
+
+ This is determined in the following order -
+ 1. If the script was called with a -dir: argument, use the directory
+ provided in this argument
+ 2. If the user has a PYWIKIBOT_DIR environment variable, use the value
+ of it
+ 3. If the script was started from a directory that contains a
+ user-config.py file, use this directory as the base
+ 4. If all else fails, use the directory from which this module was
+ loaded.
+
+ """
+ for arg in __sys.argv[1:]:
+ if arg.startswith("-dir:"):
+ base_dir = arg[5:]
+ __sys.argv.remove(arg)
+ break
+ else:
+ if os.environ.has_key("PYWIKIBOT_DIR"):
+ base_dir = os.environ["PYWIKIBOT_DIR"]
+ else:
+ if os.path.exists('user-config.py'):
+ base_dir = '.'
+ else:
+ try:
+ base_dir = os.path.split(
+ __sys.modules['wikipediatools'].__file__)[0]
+ except KeyError:
+ print __sys.modules
+ base_dir = '.'
+ if not os.path.isabs(base_dir):
+ base_dir = os.path.normpath(os.path.join(os.getcwd(), base_dir))
+ # make sure this path is valid and that it contains user-config file
+ if not os.path.isdir(base_dir):
+ raise RuntimeError("Directory '%s' does not exist." %
base_dir)
+ if not os.path.exists(os.path.join(base_dir, "user-config.py")):
+ raise RuntimeError("No user-config.py found in directory '%s'." % base_dir)
+ return base_dir
+
+_base_dir = _get_base_dir()
_RfamilyFile = re.compile('(?P<name>.+)_family.py$')
for _filename in os.listdir(os.path.join(_base_dir, 'families')):
_m = _RfamilyFile.match(_filename)
@@ -477,12 +520,13 @@
"""Return an absolute path to a data file in a standard location.
Argument(s) are zero or more directory names, optionally followed by a
- data file name. The return path is offset to config.base_dir. Any
- directories in the path that do not already exist are created.
+ data file name. The return path is offset to the "data" subdirectory of
+ config.base_dir. Any directories in the path that do not already exist
+ are created.
"""
import os
- return makepath(os.path.join(base_dir, *filename))
+ return makepath(os.path.join(os.path.join(base_dir, "data"), *filename))
def shortpath(path):
"""Return a file path relative to config.base_dir."""
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/data/api.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -10,6 +10,7 @@
__version__ = '$Id: $'
from UserDict import DictMixin
+from datetime import datetime, timedelta
import http
import simplejson as json
import logging
@@ -17,10 +18,10 @@
import traceback
import time
import urllib
-# TODO - replace when Page object is written
-from pywikibot.tests.dummy import TestPage as Page
+from pywikibot import login
+
lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
@@ -127,7 +128,7 @@
if self.params['format'] != 'json':
raise TypeError("Query format '%s' cannot be parsed."
% self.params['format'])
- uri = self.site.script_path() + "api.php"
+ uri = self.site.scriptpath() + "/api.php"
params = urllib.urlencode(self.params)
while True:
# TODO wait on errors
@@ -143,6 +144,7 @@
rawdata = http.request(self.site, uri)
except Exception, e: #TODO: what exceptions can occur here?
logging.warning(traceback.format_exc())
+ print uri, params
self.wait()
continue
if rawdata.startswith(u"unknown_action"):
@@ -257,9 +259,44 @@
del self.data
+class LoginManager(login.LoginManager):
+ """Supplies getCookie() method to use API
interface."""
+ def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
+ """
+ Login to the site.
+
+ Parameters are all ignored.
+
+ Returns cookie data if successful, None otherwise.
+ """
+ if hasattr(self, '_waituntil'):
+ if datetime.now() < self._waituntil:
+ time.sleep((self._waituntil - datetime.now()).seconds)
+ login_request = Request(site=self.site,
+ action="login",
+ lgname=self.username,
+ lgpassword=self.password
+ )
+ login_result = login_request.submit()
+ if u"login" not in login_result:
+ raise RuntimeError("API login response does not have 'login'
key.")
+ if login_result['login']['result'] != u'Success':
+ self._waituntil = datetime.now() + timedelta(seconds=60)
+ return None
+
+ prefix = login_result['login']['cookieprefix']
+ cookies = []
+ for key in ('Token', 'UserID', 'UserName'):
+ cookies.append("%s%s=%s"
+ % (prefix, key,
+ login_result['login']['lg'+key.lower()]))
+ self.username = login_result['login']['lgusername']
+ return "\n".join(cookies)
+
+
if __name__ == "__main__":
- from pywikibot.tests.dummy import TestSite as Site, TestPage as Page
- mysite = Site("en.wikipedia.org")
+ from pywikibot import Site
+ mysite = Site("en", "wikipedia")
logging.getLogger().setLevel(logging.DEBUG)
def _test():
import doctest
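
The new LoginManager illustrates the general api.Request pattern: keyword arguments become API parameters, and submit() returns the response parsed from JSON into a dict. A sketch of a simple query built the same way (the siteinfo parameters are standard MediaWiki API values, not something defined in this commit):

    from pywikibot import Site
    from pywikibot.data.api import Request

    mysite = Site("en", "wikipedia")
    req = Request(site=mysite, action="query",
                  meta="siteinfo", siprop="general")
    data = req.submit()   # dict parsed from the JSON response
    print data["query"]["general"]["sitename"]
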
Added: branches/rewrite/pywikibot/exceptions.py
===================================================================
--- branches/rewrite/pywikibot/exceptions.py (rev 0)
+++ branches/rewrite/pywikibot/exceptions.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""
+Exception classes used throughout the framework.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+
+# TODO: These are copied from wikipedia.py; not certain that all of them
+# will be needed in the rewrite.
+
+class Error(Exception):
+ """Wikipedia error"""
+
+class NoUsername(Error):
+ """Username is not in user-config.py"""
+
+class NoPage(Error):
+ """Page does not exist"""
+
+class NoSuchSite(Error):
+ """Site does not exist"""
+
+class IsRedirectPage(Error):
+ """Page is a redirect page"""
+
+class IsNotRedirectPage(Error):
+ """Page is not a redirect page"""
+
+class LockedPage(Error):
+ """Page is locked"""
+
+class SectionError(Error):
+ """The section specified by # does not exist"""
+
+class PageNotSaved(Error):
+ """Saving the page has failed"""
+
+class EditConflict(PageNotSaved):
+ """There has been an edit conflict while uploading the
page"""
+
+class SpamfilterError(PageNotSaved):
+ """Saving the page has failed because the MediaWiki spam filter
detected a blacklisted URL."""
+ def __init__(self, arg):
+ self.url = arg
+ self.args = arg,
+
+class ServerError(Error):
+ """Got unexpected server response"""
+
+class BadTitle(Error):
+ """Server responded with BadTitle."""
+
+# UserBlocked exceptions should in general not be caught. If the bot has
+# been blocked, the bot operator should address the reason for the block
+# before continuing.
+class UserBlocked(Error):
+ """Your username or IP has been blocked"""
+
+class PageNotFound(Error):
+ """Page not found in list"""
+
+class CaptchaError(Error):
+ """Captcha is asked and config.solve_captcha ==
False."""
+
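
Because every class derives from Error, callers can catch specific conditions and still keep a single fallback. A sketch of the intended usage together with Page.get() (helper name is illustrative):

    from pywikibot.exceptions import Error, NoPage, IsRedirectPage

    def safe_get(page):
        try:
            return page.get()
        except NoPage:
            return None
        except IsRedirectPage:
            # follow the redirect instead of failing
            return page.getRedirectTarget().get()
        # anything else still propagates as a subclass of Error
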
Modified: branches/rewrite/pywikibot/family.py
===================================================================
--- branches/rewrite/pywikibot/family.py 2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/family.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -2963,6 +2963,16 @@
wiki"""
return self.code2encoding(code),
+ # aliases
+ def encoding(self, code):
+ """Return the encoding for a specific language
wiki"""
+ return self.code2encoding(code)
+
+ def encodings(self, code):
+ """Return a list of historical encodings for a specific language
+ wiki"""
+ return self.code2encodings(code)
+
def __cmp__(self, otherfamily):
try:
return cmp(self.name, otherfamily.name)
@@ -2972,6 +2982,9 @@
def __hash__(self):
return hash(self.name)
+ def __repr__(self):
+ return 'Family("%s")' % self.name
+
def RversionTab(self, code):
"""Change this to some regular expression that shows the page we
found is an existing page, in case the normal regexp does not
work."""
Modified: branches/rewrite/pywikibot/login.py
===================================================================
--- branches/rewrite/pywikibot/login.py 2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/login.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -46,7 +46,10 @@
import re
import urllib2
-import wikipedia, config
+import config
+import pywikibot
+from pywikibot import Page
+from pywikibot.exceptions import *
# On some wikis you are only allowed to run a bot if there is a link to
# the bot's user page in a specific list.
@@ -65,17 +68,17 @@
class LoginManager:
def __init__(self, password = None, sysop = False, site = None):
- self.site = site or wikipedia.getSite()
+ self.site = site or pywikibot.Site()
if sysop:
try:
- self.username = config.sysopnames[self.site.family.name][self.site.lang]
+ self.username = config.sysopnames[self.site.family().name][self.site.language()]
except:
- raise wikipedia.NoUsername(u'ERROR: Sysop username for %s:%s is undefined.\nIf you have a sysop account for that site, please add such a line to user-config.py:\n\nsysopnames[\'%s\'][\'%s\'] = \'myUsername\'' % (self.site.family.name, self.site.lang, self.site.family.name, self.site.lang))
+ raise NoUsername(u'ERROR: Sysop username for %s:%s is undefined.\nIf you have a sysop account for that site, please add such a line to user-config.py:\n\nsysopnames[\'%s\'][\'%s\'] = \'myUsername\'' % (self.site.family().name, self.site.language(), self.site.family().name, self.site.language()))
else:
try:
- self.username = config.usernames[self.site.family.name][self.site.lang]
+ self.username = config.usernames[self.site.family().name][self.site.language()]
except:
- raise wikipedia.NoUsername(u'ERROR: Username for %s:%s is undefined.\nIf you have an account for that site, please add such a line to user-config.py:\n\nusernames[\'%s\'][\'%s\'] = \'myUsername\'' % (self.site.family.name, self.site.lang, self.site.family.name, self.site.lang))
+ raise NoUsername(u'ERROR: Username for %s:%s is undefined.\nIf you have an account for that site, please add such a line to user-config.py:\n\nusernames[\'%s\'][\'%s\'] = \'myUsername\'' % (self.site.family().name, self.site.language(), self.site.family().name, self.site.language()))
self.password = password
if getattr(config, 'password_file', ''):
self.readPassword()
@@ -85,9 +88,10 @@
Checks whether the bot is listed on a specific page to comply with
the policy on the respective wiki.
"""
+ return True # DEBUG
if botList.has_key(self.site.family.name) and botList[self.site.family.name].has_key(self.site.language()):
botListPageTitle = botList[self.site.family.name][self.site.language()]
- botListPage = wikipedia.Page(self.site, botListPageTitle)
+ botListPage = Page(self.site, botListPageTitle)
for linkedPage in botListPage.linkedPages():
if linkedPage.titleWithoutNamespace() == self.username:
return True
@@ -171,10 +175,11 @@
The argument data is the raw data, as returned by getCookie().
- Returns nothing."""
- filename = wikipedia.config.datafilepath('login-data',
- '%s-%s-%s-login.data'
- % (self.site.family.name, self.site.lang, self.username))
+ """
+ filename = config.datafilepath('%s-%s-%s-login.data'
+ % (self.site.family().name,
+ self.site.language(),
+ self.username))
f = open(filename, 'w')
f.write(data)
f.close()
@@ -211,21 +216,21 @@
if not self.password:
# As we don't want the password to appear on the screen, we set
# password = True
- self.password = wikipedia.input(u'Password for user %s on %s:' % (self.username, self.site), password = True)
+ self.password = pywikibot.input(u'Password for user %s on %s:' % (self.username, self.site), password = True)
- self.password = self.password.encode(self.site.encoding())
+# self.password = self.password.encode(self.site.encoding())
- wikipedia.output(u"Logging in to %s as %s" % (self.site,
self.username))
+ pywikibot.output(u"Logging in to %s as %s" % (self.site,
self.username))
cookiedata = self.getCookie()
if cookiedata:
self.storecookiedata(cookiedata)
- wikipedia.output(u"Should be logged in now")
+ pywikibot.output(u"Should be logged in now")
# Show a warning according to the local bot policy
if not self.botAllowed():
- wikipedia.output(u'*** Your username is not listed on [[%s]].\n*** Please make sure you are allowed to use the robot before actually using it!' % botList[self.site.family.name][self.site.lang])
+ pywikibot.output(u'*** Your username is not listed on [[%s]].\n*** Please make sure you are allowed to use the robot before actually using it!' % botList[self.site.family.name][self.site.lang])
return True
else:
- wikipedia.output(u"Login failed. Wrong password or CAPTCHA
answer?")
+ pywikibot.output(u"Login failed. Wrong password or CAPTCHA
answer?")
if retry:
self.password = None
return self.login(retry = True)
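
With the module-level wikipedia dependency gone, a login now goes through pywikibot and the API-aware subclass in data/api.py. A sketch, assuming the username is configured in user-config.py (login() will prompt for the password):

    import pywikibot
    from pywikibot.data import api

    site = pywikibot.Site("en", "wikipedia")
    # api.LoginManager overrides only getCookie(); login() drives the
    # prompt/retry/cookie-storage sequence defined above
    manager = api.LoginManager(site=site)
    if manager.login():
        pywikibot.output(u"Logged in as %s" % manager.username)
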
Added: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py (rev 0)
+++ branches/rewrite/pywikibot/page.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -0,0 +1,1579 @@
+# -*- coding: utf-8 -*-
+"""
+Objects representing various types of MediaWiki pages.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+import pywikibot
+from pywikibot.exceptions import *
+
+import htmlentitydefs
+import logging
+import re
+import unicodedata
+import urllib
+
+reNamespace = re.compile("^(.+?) *: *(.*)$")
+
+
+class Page(object):
+ """Page: A MediaWiki page
+
+ This object only implements internally methods that do not require
+ reading from or writing to the wiki. All other methods are delegated
+ to the Site object.
+
+ Methods available:
+ - site: The wiki this page is in
+ - title: The name of the page, with various presentation options
+ - namespace: The namespace in which the page is found
+ - section: The section of the page (the part of the title after '#', if
+ any)
+ - isAutoTitle: Title can be translated using the autoFormat method
+ - autoFormat: Auto-format certain dates and other standard format page
+ titles
+ - isCategory: True if the page is a category
+ - isDisambig (*): True if the page is a disambiguation page
+ - isImage: True if the page is an image
+ - isRedirectPage (*): True if the page is a redirect, false otherwise
+ - getRedirectTarget (*): The page the page redirects to
+ - isTalkPage: True if the page is in any "talk" namespace
+ - toggleTalkPage: Return the talk page (if this is one, return the
+ non-talk page)
+ - get (*): The text of the page
+ - latestRevision (*): The page's current revision id
+ - userName: Last user to edit page
+ - isIpEdit: True if last editor was unregistered
+ - editTime: Timestamp of the last revision to the page
+ - previousRevision (*): The revision id of the previous version
+ - permalink (*): The url of the permalink of the current version
+ - getOldVersion(id) (*): The text of a previous version of the page
+ - getVersionHistory: Load the version history information from wiki
+ - getVersionHistoryTable: Create a wiki table from the history data
+ - fullVersionHistory: Return all past versions including wikitext
+ - contributingUsers: Return set of users who have edited page
+ - exists (*): True if the page actually exists, false otherwise
+ - isEmpty (*): True if the page has 4 characters or less content, not
+ counting interwiki and category links
+ - interwiki (*): The interwiki links from the page (list of Pages)
+ - categories (*): The categories the page is in (list of Pages)
+ - linkedPages (*): The normal pages linked from the page (list of
+ Pages)
+ - imagelinks (*): The pictures on the page (list of ImagePages)
+ - templates (*): All templates referenced on the page (list of Pages)
+ - templatesWithParams(*): All templates on the page, with list of
+ parameters
+ - isDisambig (*): True if the page is a disambiguation page
+ - getReferences: List of pages linking to the page
+ - canBeEdited (*): True if page is unprotected or user has edit
+ privileges
+ - botMayEdit (*): True if bot is allowed to edit page
+ - put(newtext): Saves the page
+ - put_async(newtext): Queues the page to be saved asynchronously
+ - move: Move the page to another title
+ - delete: Deletes the page (requires being logged in)
+ - protect: Protect or unprotect a page (requires sysop status)
+ - removeImage: Remove all instances of an image from this page
+ - replaceImage: Replace all instances of an image with another
+ - loadDeletedRevisions: Load all deleted versions of this page
+ - getDeletedRevision: Return a particular deleted revision
+ - markDeletedRevision: Mark a version to be undeleted, or not
+ - undelete: Undelete past version(s) of the page
+
+ Deprecated methods (preserved for backwards-compatibility):
+ - urlname: Title, in a form suitable for a URL
+ - titleWithoutNamespace: Title, with the namespace part removed
+ - sectionFreeTitle: Title, without the section part
+ - aslink: Title in the form [[Title]] or [[lang:Title]]
+ - encoding: The encoding of the page
+
+ (*) This loads the page if it has not been loaded before; permalink might
+ even reload it if it has been loaded before
+
+ """
+ def __init__(self, site, title, insite=None,
+ defaultNamespace=0):
+ """Parameters:
+
+ @param site: the wikimedia Site on which the page resides
+ @param title: title of the page
+ @type title: unicode
+ @param insite: (optional) a wikimedia Site where this link was found
+ (to help decode interwiki links)
+ @param defaultNamespace: (optional) A namespace to use if the link
+ does not contain one
+ @type defaultNamespace: int
+
+ """
+ if site is None:
+ self._site = pywikibot.Site()
+ elif isinstance(site, basestring):
+ self._site = pywikibot.Site(site)
+ else:
+ self._site = site
+
+ if not insite: insite = self._site
+
+ # parse the title
+ # this can throw various exceptions if the title is invalid
+ link = Link(title, insite, defaultNamespace)
+ self._site = link.site
+ self._section = link.section
+ self._ns = link.namespace
+ self._title = link.title
+ # reassemble the canonical title from components
+ if self._section is not None:
+ self._title = self._title + "#" + self._section
+ if self._ns:
+ self._title = self.site().namespace(self._ns) + ":" + self._title
+ self._revisions = {}
+
+ def site(self):
+ """Return the Site object for the wiki on which this Page
resides."""
+ return self._site
+
+ def namespace(self):
+ """Return the number of the namespace of the page.
+
+ Only recognizes those namespaces defined in family.py.
+ If not defined, it will return 0 (the main namespace).
+
+ @return: int
+
+ """
+ return self._ns
+
+ def title(self, underscore=False, savetitle=False, withNamespace=True,
+ withSection=True, asUrl=False, asLink=False,
+ allowInterwiki=True, forceInterwiki=False, textlink=False):
+ """Return the title of this Page, as a Unicode string.
+
+ @param underscore: if true, replace all ' ' characters with '_'
+ @param savetitle: if true, try to quote all non-ASCII characters.
+ (DEPRECATED: use asURL instead)
+ @param withNamespace: if false, omit the namespace prefix
+ @param withSection: if false, omit the section
+ @param asUrl: if true, quote title as if in an URL
+ @param asLink: if true, return the title in the form of a wikilink
+ @param allowInterwiki: (only used if asLink is true) if true, format
+ the link as an interwiki link if necessary
+ @param forceInterwiki: (only used if asLink is true) if true, always
+ format the link as an interwiki link
+ @param textlink: (only used if asLink is true) if true, place a ':'
+ before Category: and Image: links
+
+ """
+ title = self._title
+ if not withNamespace and self._ns != 0:
+ title = title.split(u':', 1)[1]
+ if not withSection and self._section:
+ title = title.split(u'#', 1)[0]
+ if underscore or asUrl:
+ title = title.replace(u' ', u'_')
+ if savetitle:
+ logging.debug(
+ u"Page.title(savetitle=...) is deprecated.")
+ if savetitle or asUrl:
+ encodedTitle = title.encode(self.site().encoding())
+ title = urllib.quote(encodedTitle)
+ if asLink:
+ if forceInterwiki or (
+ allowInterwiki and self.site() != pywikibot.Site()):
+ if self.site().family() != pywikibot.Site().family() \
+ and self.site().family().name != self.site().language():
+# FIXME: Interwiki links shouldn't be fully urlencoded
+ return u'[[%s:%s:%s]]' % (self.site().family().name,
+ self.site().language(),
+ self.title(asUrl=True))
+ else:
+ return u'[[%s:%s]]' % (self.site().language(),
+ self.title(asUrl=True))
+ elif textlink and (self.isImage() or self.isCategory()):
+ return u'[[:%s]]' % title
+ else:
+ return u'[[%s]]' % title
+ return title
+
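A few of the title() presentation options in action, for a hypothetical page Talk:Foo bar#Baz on the site's own wiki:

    page.title()                     # u'Talk:Foo bar#Baz'
    page.title(withNamespace=False)  # u'Foo bar#Baz'
    page.title(withSection=False)    # u'Talk:Foo bar'
    page.title(asUrl=True)           # u'Talk%3AFoo_bar%23Baz'
    page.title(asLink=True)          # u'[[Talk:Foo bar#Baz]]'
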
+ def section(self, underscore = False):
+ """Return the name of the section this Page refers to.
+
+ The section is the part of the title following a '#' character, if
+ any. If no section is present, return None.
+
+ @param underscore: unused, but maintained for backwards compatibility
+
+ """
+ if underscore:
+ logging.debug(
+ u"Page.section(underscore=...) is deprecated.")
+ if self._section:
+ return self._section
+ else:
+ return None
+
+ def __str__(self):
+ """Return a console representation of the
pagelink."""
+ return self.title(asLink=True, forceInterwiki=True)
+
+ def __repr__(self):
+ """Return a more complete string
representation."""
+ return u"%s(%s)" % (self.__class__.__name__, self.title())
+
+ def __cmp__(self, other):
+ """Test for equality and inequality of Page
objects"""
+ if not isinstance(other, Page):
+ # especially, return -1 if other is None
+ return -1
+ if not self.site() == other.site():
+ return cmp(self.site(), other.site())
+ owntitle = self.title()
+ othertitle = other.title()
+ return cmp(owntitle, othertitle)
+
+ def __hash__(self):
+ # Pseudo method that makes it possible to store Page objects as keys
+ # in hash-tables. This relies on the fact that the string
+ # representation of an instance can not change after the construction.
+ return hash(str(self))
+
+ def autoFormat(self):
+ """Return L{date.autoFormat} dictName and value, if any.
+
+ Value can be a year, date, etc., and dictName is 'YearBC',
+ 'Year_December', or another dictionary name. Please note that two
+ entries may have exactly the same autoFormat, but be in two
+ different namespaces, as some sites have categories with the
+ same names. Regular titles return (None, None).
+
+ """
+ if not hasattr(self, '_autoFormat'):
+ from pywikibot import date
+ self._autoFormat = date.getAutoFormat(
+ self.site().language(),
+ self.title(withNamespace=False)
+ )
+ return self._autoFormat
+
+ def isAutoTitle(self):
+ """Return True if title of this Page is in the autoFormat
dictionary."""
+ return self.autoFormat()[0] is not None
+
+ def get(self, force=False, get_redirect=False, throttle=None,
+ sysop=False, nofollow_redirects=None, change_edit_time=None):
+ """Return the wiki-text of the page.
+
+ This will retrieve the page from the server if it has not been
+ retrieved yet, or if force is True. This can raise the following
+ exceptions that should be caught by the calling code:
+
+ - NoPage: The page does not exist
+ - IsRedirectPage: The page is a redirect. The argument of the
+ exception is the title of the page it redirects to.
+ - SectionError: The section does not exist on a page with a #
+ link
+
+ @param force: reload all page attributes, including errors.
+ @param get_redirect: return the redirect text, do not follow the
+ redirect, do not raise an exception.
+ @param sysop: if the user has a sysop account, use it to retrieve
+ this page
+ @param throttle: DEPRECATED and unused
+ @param nofollow_redirects: DEPRECATED and unused
+ @param change_edit_time: DEPRECATED and unused
+
+ """
+ if throttle is not None:
+ logging.debug("Page.get(throttle) option is deprecated.")
+ if nofollow_redirects is not None:
+ logging.debug("Page.get(nofollow_redirects) option is
deprecated.")
+ if change_edit_time is not None:
+ logging.debug("Page.get(change_edit_time) option is deprecated.")
+ if force:
+ # When forcing, we retry the page no matter what. Old exceptions
+ # do not apply any more.
+ for attr in ['_redirarg', '_getexception']:
+ if hasattr(self, attr):
+ delattr(self,attr)
+ else:
+ # Make sure we re-raise an exception we got on an earlier attempt
+ if hasattr(self, '_redirarg') and not get_redirect:
+ raise IsRedirectPage, self._redirarg
+ elif hasattr(self, '_getexception'):
+ raise self._getexception
+ if force or not hasattr(self, "_revid") \
+ or not self._revid in self._revisions:
+ self.site().getrevisions(self, getText=True, ids=None, sysop=sysop)
+ # TODO: Exception handling for no-page, redirects, etc.
+
+ return self._revisions[self._revid].text
+
+ def getOldVersion(self, oldid, force=False, get_redirect=False,
+ throttle=None, sysop=False, nofollow_redirects=None,
+ change_edit_time=None):
+ """Return text of an old revision of this page; same options as
get().
+
+ @param oldid: The revid of the revision desired.
+
+ """
+ if throttle is not None:
+ logging.debug(
+ "Page.getOldVersion(throttle) option is deprecated.")
+ if nofollow_redirects is not None:
+ logging.debug(
+ "Page.getOldVersion(nofollow_redirects) option is
deprecated.")
+ if change_edit_time is not None:
+ logging.debug(
+ "Page.getOldVersion(change_edit_time) option is deprecated.")
+ if force or not oldid in self._revisions:
+ self.site().getrevisions(self, getText=True, ids=oldid,
+ redirs=get_redirect, sysop=sysop)
+ return self._revisions[oldid].text
+
+ def permalink(self):
+ """Return the permalink URL for current revision of this
page."""
+ return "%s://%s/%sindex.php?title=%s&oldid=%s" \
+ % (self.site().protocol(),
+ self.site().hostname(),
+ self.site().script_path(),
+ self.title(asUrl=True),
+ self.latestRevision())
+
+ def latestRevision(self):
+ """Return the current revision id for this
page."""
+ if not hasattr(self, '_revid'):
+ self.site().getrevisions(self)
+ return self._revid
+
+ def userName(self):
+ """Return name or IP address of last user to edit
page."""
+ return self._revisions[self.latestRevision()].user
+
+ def isIpEdit(self):
+ """Return True if last editor was unregistered."""
+ return self._revisions[self.latestRevision()].anon
+
+ def editTime(self):
+ """Return timestamp (in MediaWiki format) of last revision to
page."""
+ return self._revisions[self.latestRevision()].timestamp
+
+ def previousRevision(self):
+ """Return the revision id for the previous revision of this
Page."""
+ vh = self.getVersionHistory(revCount=2)
+ return vh[1][0]
+
+ def exists(self):
+ """Return True if page exists on the wiki, even if it's a
redirect.
+
+ If the title includes a section, return False if this section isn't
+ found.
+
+ """
+ return self.site().page_exists(self)
+
+ def isRedirectPage(self):
+ """Return True if this is a redirect, False if not or not
existing."""
+ return self.site().page_isredirect(self)
+
+ def isEmpty(self):
+ """Return True if the page text has less than 4 characters.
+
+ Character count ignores language links and category links.
+ Can raise the same exceptions as get().
+
+ """
+ txt = self.get()
+ txt = pywikibot.removeLanguageLinks(txt, site = self.site())
+ txt = pywikibot.removeCategoryLinks(txt, site = self.site())
+ if len(txt) < 4:
+ return True
+ else:
+ return False
+
+ def isTalkPage(self):
+ """Return True if this page is in any talk
namespace."""
+ ns = self.namespace()
+ return ns >= 0 and ns % 2 == 1
+
+ def toggleTalkPage(self):
+ """Return other member of the article-talk page pair for this
Page.
+
+ If self is a talk page, returns the associated content page;
+ otherwise, returns the associated talk page.
+ Returns None if self is a special page.
+
+ """
+ ns = self.namespace()
+ if ns < 0: # Special page
+ return None
+ if self.isTalkPage():
+ if self.namespace() == 1:
+ return Page(self.site(), self.title(withNamespace=False))
+ else:
+ return Page(self.site(),
+ self.site().namespace(ns - 1) + ':'
+ + self.title(withNamespace=False))
+ else:
+ return Page(self.site(),
+ self.site().namespace(ns + 1) + ':'
+ + self.title(withNamespace=False))
+
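Since each talk namespace is the odd partner of an even subject namespace, toggling is plain namespace arithmetic; for hypothetical titles:

    Page(site, u"Foo").toggleTalkPage()           # -> Talk:Foo
    Page(site, u"Talk:Foo").toggleTalkPage()      # -> Foo
    Page(site, u"Category:Bar").toggleTalkPage()  # -> Category talk:Bar
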
+ def isCategory(self):
+ """Return True if the page is a Category, False
otherwise."""
+ return self.namespace() == 14
+
+ def isImage(self):
+ """Return True if this is an image description page, False
otherwise."""
+ return self.namespace() == 6
+
+ def isDisambig(self):
+ """Return True if this is a disambiguation page, False otherwise.
+
+ Relies on the presence of specific templates, identified in the Family
+ file, to identify disambiguation pages.
+
+ """
+ if not hasattr(self, '_isDisambig'):
+ locdis = self.site().family().disambig(self.site().language())
+ for template in self.templates():
+ tn = template.title(withNamespace=False)
+ if tn in locdis:
+ self._isDisambig = True
+ break
+ else:
+ self._isDisambig = False
+ return self._isDisambig
+
+ def getReferences(self, follow_redirects=True, withTemplateInclusion=True,
+ onlyTemplateInclusion=False, redirectsOnly=False):
+ """Yield all pages that link to the page.
+
+ If you need a full list of referring pages, use
+ C{pages = list(s.getReferences())}
+
+ @param follow_redirects: if True, also return pages that link to a
+ redirect pointing to the page.
+ @param withTemplateInclusion: if True, also return pages where self
+ is used as a template.
+ @param onlyTemplateInclusion: if True, only return pages where self
+ is used as a template.
+ @param redirectsOnly: if True, only return redirects to self.
+
+ """
+ # N.B.: this method intentionally overlaps with backlinks() and
+ # embeddedin(). Depending on the interface, it may be more efficient
+ # to implement those methods in the site interface and then combine
+ # the results for this method, or to implement this method and then
+ # split up the results for the others.
+ return self.site().getreferences(self, follow_redirects,
+ withTemplateInclusion,
+ onlyTemplateInclusion,
+ redirectsOnly)
+
+ def backlinks(self, followRedirects=True, filterRedirects=None):
+ """Yield all pages that contain ordinary wikilinks to this page.
+
+ @param followRedirects: if True, also return pages that link to a
+ redirect pointing to the page.
+ @param filterRedirects: if True, only return redirects; if False,
+ omit redirects; if None, do not filter
+
+ """
+ return self.site().getbacklinks(self, followRedirects, filterRedirects)
+
+ def embeddedin(self):
+ """Yield all pages that embed this page as a
template."""
+ return self.site().getembeddedin(self)
+
+ def canBeEdited(self):
+ """Return bool indicating whether this page can be edited.
+
+ This returns True if and only if:
+ - page is unprotected, and bot has an account for this site, or
+ - page is protected, and bot has a sysop account for this site.
+
+ """
+ return self.site().page_can_be_edited(self)
+
+ def botMayEdit(self):
+ """Return True if this page allows bots to edit it.
+
+ This will be True if the page doesn't contain {{bots}} or
+ {{nobots}}, or it contains them and the active bot is allowed to
+ edit this page. (This method is only useful on those sites that
+ recognize the bot-exclusion protocol; on other sites, it will always
+ return True.)
+
+ The framework enforces this restriction by default. It is possible
+ to override this by setting ignore_bot_templates=True in
+ user_config.py, or using page.put(force=True).
+
+ """ # TODO: move this to Site object?
+ if pywikibot.config.ignore_bot_templates: #Check the "master ignore switch"
+ return True
+ try:
+ templates = self.templatesWithParams()
+ except (NoPage, IsRedirectPage, SectionError):
+ return True
+ username = self.site().user() # assumption: Site exposes the logged-in username via user()
+ for template in templates:
+ title = template[0].title(withNamespace=False)
+ if title == 'Nobots':
+ return False
+ elif title == 'Bots':
+ if len(template[1]) == 0:
+ return True
+ else:
+ (ttype, bots) = template[1][0].split('=', 1)
+ bots = bots.split(',')
+ if ttype == 'allow':
+ if 'all' in bots or username in bots:
+ return True
+ else:
+ return False
+ if ttype == 'deny':
+ if 'all' in bots or username in bots:
+ return False
+ else:
+ return True
+ # no restricting template found
+ return True
+
+
+ def put(self, newtext, comment=None, watchArticle=None, minorEdit=True,
+ force=False):
+ """Save the page with the contents of the first argument as the
text.
+
+ @param newtext: The complete text of the revised page.
+ @type newtext: unicode
+ @param comment: The edit summary for the modification (optional,
+ but most wikis strongly encourage its use)
+ @type comment: unicode
+ @param watchArticle: if True, add or if False, remove this Page
+ to/from bot user's watchlist; if None, leave watchlist status
+ unchanged
+ @type watchArticle: bool or None
+ @param minorEdit: if True, mark this edit as minor
+ @type minorEdit: bool
+ @param force: if True, ignore botMayEdit() setting
+ @type force: bool
+
+ """
+ return self.site().put(self, newtext, comment, watchArticle,
+ minorEdit, force)
+
+ def put_async(self, newtext,
+ comment=None, watchArticle=None, minorEdit=True, force=False,
+ callback=None):
+ """Put page on queue to be saved to wiki asynchronously.
+
+ Asynchronous version of put (takes the same arguments), which places
+ pages on a queue to be saved by a daemon thread. All arguments are
+ the same as for .put(), except:
+
+ @param callback: a callable object that will be called after the
+ page put operation. This object must take two arguments: (1) a
+ Page object, and (2) an exception instance, which will be None
+ if the page was saved successfully. The callback is intended for
+ use by bots that need to keep track of which saves were
+ successful.
+
+ """
+ return self.site().put(self, newtext, comment, watchArticle,
+ minorEdit, force, callback, async=True)
+
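A sketch of the callback contract described above: the callable receives the Page and either None or the exception that aborted the save (names here are illustrative):

    def report_save(page, err):
        if err is None:
            pywikibot.output(u"saved %s" % page.title(asLink=True))
        else:
            pywikibot.output(u"failed to save %s: %s" % (page.title(), err))

    page.put_async(newtext, comment=u"bot edit", callback=report_save)
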
+ def linkedPages(self):
+ """Iterate Pages that this Page links to.
+
+ Only returns pages from "normal" internal links. Image and category
+ links are omitted unless prefixed with ":"; embedded templates are
+ omitted (but links within them are returned); all interwiki and
+ external links are omitted.
+
+ @return: a generator that yields Page objects.
+
+ """
+ return self.site().getlinks(self)
+
+ def interwiki(self):
+ """Iterate interwiki links in the page text.
+
+ @return: a generator that yields Link objects.
+
+ """
+ return self.site().getinterwiki(self)
+
+ def langlinks(self):
+ """Iterate all interlanguage links on this page.
+
+ Note that the links yielded by this method will be a subset of
+ the results of self.interwiki().
+
+ @return: a generator that yields Link objects.
+
+ """
+ return self.site().getlanglinks(self)
+
+ def imagelinks(self, followRedirects=False, loose=None):
+ """Iterate ImagePage objects for images displayed on this Page.
+
+ @param followRedirects: if an image link redirects to another page,
+ yield the redirect target instead of the original link
+ @param loose: DEPRECATED and ignored
+ @return: a generator that yields ImagePage objects.
+
+ """
+ if loose is not None:
+ logging.debug(
+ u"Page.imagelinks(loose) option is deprecated.")
+ return self.site().getimages(self, followRedirects)
+
+ def templates(self):
+ """Iterate Page objects for templates used on this Page.
+
+ Template parameters are ignored. This method only returns embedded
+ templates, not template pages that happen to be referenced through
+ a normal link.
+
+ """
+ return self.site().gettemplates(self)
+
+ def templatesWithParams(self):
+ """Iterate templates used on this Page.
+
+ @return: a generator that yields a tuple for each use of a template
+ in the page, with the template Page as the first entry and a list of
+ parameters as the second entry.
+
+ """
+ return self.site().templates_with_params(self)
+
+ def categories(self, nofollow_redirects=None, withSortKey=False):
+ """Iterate categories that the article is in.
+
+ @param nofollow_redirects: DEPRECATED and ignored
+ @param withSortKey: if True, include the sort key in each Category.
+ @return: a generator that yields Category objects.
+
+ """
+ # follow_redirects makes no sense here because category membership
+ # doesn't follow redirects
+ if nofollow_redirects is not None:
+ logging.debug(
+ u"Page.categories(nofollow_redirects) option is deprecated.")
+ return self.site().categories(self, withSortKey=withSortKey)
+
+ def extlinks(self):
+ """Iterate all external URLs (not interwiki links) from this
page.
+
+ @return: a generator that yields unicode objects containing URLs.
+
+ """
+ return self.site().getextlinks(self)
+
+ def getRedirectTarget(self):
+ """Return a Page object for the target this Page redirects to.
+
+ If this page is not a redirect page, will raise an IsNotRedirectPage
+ exception. This method also can raise a NoPage exception.
+
+ """
+ return self.site().follow_redirect(self)
+
+ def getVersionHistory(self, forceReload=False, reverseOrder=False,
+ getAll=False, revCount=500):
+ """Load the version history page and return history information.
+
+ Return value is a list of tuples, where each tuple represents one
+ edit and is built of revision id, edit date/time, user name, and
+ edit summary. Starts with the most current revision, unless
+ reverseOrder is True. Defaults to getting the first revCount edits,
+ unless getAll is True.
+
+ """
+ if getAll:
+ limit = None
+ else:
+ limit = revCount
+ return self.site().getrevisions(self, withText=False,
+ older=reverseOrder, limit=limit)
+
+ def getVersionHistoryTable(self, forceReload=False, reverseOrder=False,
+ getAll=False, revCount=500):
+ """Return the version history as a wiki table."""
+ result = '{| border="1"\n'
+ result += '! oldid || date/time || username || edit summary\n'
+ for oldid, time, username, summary \
+ in self.getVersionHistory(forceReload=forceReload,
+ reverseOrder=reverseOrder,
+ getAll=getAll, revCount=revCount):
+ result += '|----\n'
+ result += '| %s || %s || %s || <nowiki>%s</nowiki>\n'\
+ % (oldid, time, username, summary)
+ result += '|}\n'
+ return result
+
+ def fullVersionHistory(self):
+ """Iterate all previous versions including wikitext.
+
+ @return: A generator that yields tuples consisting of revision ID,
+ edit date/time, user name and content
+ """
+ return self.site().getrevisions(self, withText=True,
+ older=False, limit=None)
+
+ def contributingUsers(self):
+ """Return a set of usernames (or IPs) of users who edited this
page."""
+ edits = self.getVersionHistory()
+ users = set([edit[2] for edit in edits])
+ return users
+
+ def move(self, newtitle, reason=None, movetalkpage=True, sysop=False,
+ throttle=None, deleteAndMove=False, safe=True):
+ """Move this page to a new title.
+
+ @param newtitle: The new page title.
+ @param reason: The edit summary for the move.
+ @param movetalkpage: If true, move this page's talk page (if it exists)
+ @param sysop: Try to move using sysop account, if available
+ @param throttle: DEPRECATED
+ @param deleteAndMove: if move succeeds, delete the old page
+ (requires sysop privileges)
+ @param safe: If false, attempt to delete existing page at newtitle
+ (if there is one) and then move this page to that title
+
+ """
+ if throttle is not None:
+ logging.debug(
+ u"Page.move: throttle option is deprecated.")
+ if reason is None:
+ pywikibot.output(u'Moving %s to [[%s]].'
+ % (self.title(asLink=True), newtitle))
+ reason = pywikibot.input(u'Please enter a reason for the move:')
+ return self.site().move(self, newtitle, reason,
+ movetalkpage=movetalkpage, sysop=sysop,
+ deleteAndMove=deleteAndMove, safe=safe)
+
+ def delete(self, reason=None, prompt=True, throttle=None, mark=False):
+ """Deletes the page from the wiki. Requires administrator status.
+
+ @param reason: The edit summary for the deletion.
+ @param prompt: If true, prompt user for confirmation before deleting.
+ @param mark: if true, and user does not have sysop rights, place a
+ speedy-deletion request on the page instead.
+
+ """
+ if throttle is not None:
+ logging.debug(
+ u"Page.delete: throttle option is deprecated.")
+ if reason is None:
+ pywikibot.output(u'Deleting %s.' % (self.title(asLink=True)))
+ reason = pywikibot.input(u'Please enter a reason for the deletion:')
+ answer = u'y'
+ if prompt and not hasattr(self.site(), '_noDeletePrompt'):
+ answer = pywikibot.inputChoice(u'Do you want to delete %s?'
+ % self.title(asLink = True, forceInterwiki = True),
+ ['Yes', 'No', 'All'],
+ ['Y', 'N', 'A'],
+ 'N')
+ if answer in ['a', 'A']:
+ answer = 'y'
+ self.site()._noDeletePrompt = True
+ if answer in ['y', 'Y']:
+ return self.site().delete(self, reason, mark=mark)
+
+ def loadDeletedRevisions(self):
+ """Retrieve all deleted revisions for this Page from
Special/Undelete.
+
+ Stores all revisions' timestamps, dates, editors and comments in
+ self._deletedRevs attribute.
+
+ @return: list of timestamps (which can be used to retrieve revisions
+ later on).
+
+ """
+ return self.site().loadDeletedRevisions(self)
+
+ def getDeletedRevision(self, timestamp, retrieveText=False):
+ """Return a particular deleted revision by timestamp.
+
+ @return: a list of [date, editor, comment, text, restoration
+ marker]. text will be None, unless retrieveText is True (or has
+ been retrieved earlier). If timestamp is not found, returns
+ None.
+
+ """
+ return self.site().getDeletedRevision(self, timestamp,
+ getText=retrieveText)
+
+ def markDeletedRevision(self, timestamp, undelete=True):
+ """Mark the revision identified by timestamp for undeletion.
+
+ @param undelete: if False, mark the revision to remain deleted.
+
+ """
+ if getattr(self, '_deletedRevs', None) is None:
+ self.loadDeletedRevisions()
+ if not self._deletedRevs.has_key(timestamp):
+ #TODO: Throw an exception?
+ return None
+ self._deletedRevs[timestamp][4] = undelete
+ self._deletedRevsModified = True
+
+ def undelete(self, comment=None, throttle=None):
+ """Undelete revisions based on the markers set by previous calls.
+
+ If no calls have been made since loadDeletedRevisions(), everything
+ will be restored.
+
+ Simplest case::
+ Page(...).undelete('This will restore all revisions')
+
+ More complex::
+ pg = Page(...)
+ revs = pg.loadDeletedRevisions()
+ for rev in revs:
+ if ... #decide whether to undelete a revision
+ pg.markDeletedRevision(rev) #mark for undeletion
+ pg.undelete('This will restore only selected revisions.')
+
+ @param comment: The undeletion edit summary.
+ @param throttle: DEPRECATED
+
+ """
+ if throttle is not None:
+ logging.debug(
+ u"Page.undelete: throttle option is deprecated.")
+ if comment is None:
+ pywikibot.output(u'Preparing to undelete %s.'
+ % (self.title(asLink=True)))
+ comment = pywikibot.input(
+ u'Please enter a reason for the undeletion:')
+ return self.site().undelete(self, comment)
+
+ def protect(self, edit='sysop', move='sysop', create='sysop',
+ unprotect=False, reason=None, prompt=True, throttle=None):
+ """(Un)protect a wiki page. Requires administrator status.
+
+ Valid protection levels (in MediaWiki 1.12) are '' (equivalent to
+ 'none'), 'autoconfirmed', and 'sysop'.
+
+ @param edit: Level of edit protection
+ @param move: Level of move protection
+ @param create: Level of create protection
+ @param unprotect: If true, unprotect the page (equivalent to setting
+ all protection levels to '')
+ @param reason: Edit summary.
+ @param prompt: If true, ask user for confirmation.
+ @param throttle: DEPRECATED
+
+ """
+ if throttle is not None:
+ logging.debug(
+ u"Page.protect: throttle option is deprecated.")
+ if reason is None:
+ if unprotect:
+ un = u'un'
+ else:
+ un = u''
+ pywikibot.output(u'Preparing to %sprotect %s.'
+ % (un, self.title(asLink=True)))
+ reason = pywikibot.input(u'Please enter a reason for the action:')
+ if unprotect:
+ edit = move = create = ""
+ answer = 'y'
+ if prompt and not hasattr(self.site(), '_noProtectPrompt'):
+ answer = pywikibot.inputChoice(
+ u'Do you want to change the protection level of %s?'
+ % self.title(asLink=True, forceInterwiki = True),
+ ['Yes', 'No', 'All'], ['Y', 'N', 'A'], 'N')
+ if answer in ['a', 'A']:
+ answer = 'y'
+ self.site()._noProtectPrompt = True
+ if answer in ['y', 'Y']:
+ return self.site().protect(self, edit, move, create, reason)
+
+######## DEPRECATED METHODS ########
+
+ def encoding(self):
+ """Return the character encoding used on this Page's wiki
Site.
+
+ DEPRECATED: use Site.encoding() instead
+
+ """
+ logging.debug(u"Page.encoding() is deprecated; use Site.encoding().")
+ return self.site().encoding()
+
+ def titleWithoutNamespace(self, underscore=False):
+ """Return title of Page without namespace and without section.
+
+ DEPRECATED: use self.title(withNamespace=False) instead.
+
+ """
+ logging.debug(
+ u"Page.titleWithoutNamespace() method is deprecated.")
+ return self.title(underscore=underscore, withNamespace=False,
+ withSection=False)
+
+ def sectionFreeTitle(self, underscore=False):
+ """Return the title of this Page, without the section (if any).
+
+ DEPRECATED: use self.title(withSection=False) instead.
+
+ """
+ logging.debug(
+ u"Page.sectionFreeTitle() method is deprecated.")
+ return self.title(underscore=underscore, withSection=False)
+
+ def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False):
+ """Return a string representation in the form of a wikilink.
+
+ DEPRECATED: use self.title(asLink=True) instead.
+
+ """
+ logging.debug(u"Page.aslink() method is deprecated.")
+ return self.title(asLink=True, forceInterwiki=forceInterwiki,
+ allowInterwiki=not noInterwiki, textlink=textlink)
+
+ def urlname(self):
+ """Return the Page title encoded for use in an URL.
+
+ DEPRECATED: use self.title(asUrl=True) instead.
+
+ """
+ logging.debug(u"Page.urlname() method is deprecated.")
+ return self.title(asUrl=True)
+
+####### DISABLED METHODS (warnings provided) ######
+ # these methods are easily replaced by editing the page's text using
+ # textlib methods and then using put() on the result.
+
+ def removeImage(self, image, put=False, summary=None, safe=True):
+ """Old method to remove all instances of an image from
page."""
+ logging.warning(u"Page.removeImage() is no longer supported.")
+
+ def replaceImage(self, image, replacement=None, put=False, summary=None,
+ safe=True):
+ """Old method to replace all instances of an image with
another."""
+ logging.warning(u"Page.replaceImage() is no longer supported.")
+
+
+class ImagePage(Page):
+ """A subclass of Page representing an image descriptor wiki page.
+
+ Supports the same interface as Page, with the following added methods:
+
+ getImagePageHtml : Download image page and return raw HTML text.
+ fileURL : Return the URL for the image described on this
+ page.
+ fileIsOnCommons : Return True if image stored on Wikimedia
+ Commons.
+ fileIsShared : Return True if image stored on Wikitravel
+ shared repository.
+ getFileMd5Sum : Return image file's MD5 checksum.
+ getFileVersionHistory : Return the image file's version history.
+ getFileVersionHistoryTable: Return the version history in the form of a
+ wiki table.
+ usingPages : Iterate Pages on which the image is displayed.
+
+ """
+ def __init__(self, site, title, insite = None):
+ Page.__init__(self, site, title, insite, defaultNamespace=6)
+ if self.namespace() != 6:
+ raise ValueError(u"'%s' is not in the image namespace!" %
title)
+
+ def getImagePageHtml(self):
+ """
+ Download the image page, and return the HTML, as a unicode string.
+
+ Caches the HTML code, so that if you run this method twice on the
+ same ImagePage object, the page will only be downloaded once.
+ """
+ if not hasattr(self, '_imagePageHtml'):
+ from pywikibot.data import http
+ path = "%s/index.php?title=%s" \
+ % (self.site().scriptpath(), self.title(asUrl=True))
+ self._imagePageHtml = http.request(self.site(), path)
+ return self._imagePageHtml
+
+ def fileUrl(self):
+ """Return the URL for the image described on this
page."""
+ # TODO add scaling option?
+ if not hasattr(self, '_imageinfo'):
+ self._imageinfo = self.site().getimageinfo(self)
+ return self._imageinfo['url']
+
+ def fileIsOnCommons(self):
+ """Return True if the image is stored on Wikimedia
Commons"""
+ return self.fileUrl().startswith(
+ 'http://upload.wikimedia.org/wikipedia/commons/')
+
+ def fileIsShared(self):
+ """Return True if image is stored on any known shared
repository."""
+ # as of now, the only known repositories are commons and wikitravel
+ if 'wikitravel_shared' in self.site().shared_image_repository():
+ return self.fileUrl().startswith(
+ u'http://wikitravel.org/upload/shared/')
+ return self.fileIsOnCommons()
+
+ def getFileMd5Sum(self):
+ """Return image file's MD5 checksum."""
+ logging.debug(
+ "ImagePage.getFileMd5Sum() is deprecated; use getFileSHA1Sum().")
+# FIXME: MD5 might be performed on incomplete file due to server disconnection
+# (see bug #1795683).
+ import md5, urllib
+ f = urllib.urlopen(self.fileUrl())
+ # TODO: check whether this needs a User-Agent header added
+ md5Checksum = md5.new(f.read()).hexdigest()
+ f.close()
+ return md5Checksum
+
+ def getFileSHA1Sum(self):
+ """Return image file's SHA1 checksum."""
+ if not hasattr(self, '_imageinfo'):
+ self._imageinfo = self.site().getimageinfo(self)
+ return self._imageinfo['sha1']
+
+ def getFileVersionHistory(self):
+ """Return the image file's version history.
+
+ @return: An iterator yielding tuples containing (timestamp,
+ username, resolution, filesize, comment).
+
+ """
+ #TODO; return value may need to change
+ return self.site().getimageinfo(self, history=True)
+
+ def getFileVersionHistoryTable(self):
+ """Return the version history in the form of a wiki
table."""
+ lines = []
+ #TODO: if getFileVersionHistory changes, make sure this follows it
+ for (datetime, username, resolution, size, comment) \
+ in self.getFileVersionHistory():
+ lines.append('| %s || %s || %s || %s || <nowiki>%s</nowiki>' \
+ % (datetime, username, resolution, size, comment))
+ return u'{| border="1"\n! date/time || username || resolution ||
size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}'
+
+ def usingPages(self):
+ """Yield Pages on which the image is displayed."""
+ return self.site().getimageusage(self)
+
+class Category(Page):
+ """A page in the Category: namespace"""
+
+ def __init__(self, site, title, insite=None, sortKey=None):
+ """All parameters are the same as for Page() constructor, except:
+
+ @param sortKey: DEPRECATED (use .aslink() method instead)
+
+ """
+ Page.__init__(self, site=site, title=title, insite=insite,
+ defaultNamespace=14)
+ if sortKey is not None:
+ logging.debug(
+ "The 'sortKey' option in Category constructor is
deprecated.")
+ if self.namespace() != 14:
+ raise ValueError(u"'%s' is not in the category namespace!"
+ % title)
+
+ def aslink(self, sortKey=u'', forceInterwiki=None, textlink=None,
+ noInterwiki=None):
+ """Return a link to place a page in this Category.
+
+ Use this only to generate a "true" category link, not for interwikis
+ or text links to category pages.
+
+ Parameters are deprecated and preserved for backwards-compatibility,
+ except:
+
+ @param sortKey: The sort key for the article to be placed in this
+ Category; if omitted, default sort key is used.
+ @type sortKey: (optional) unicode
+
+ """
+ if forceInterwiki is not None \
+ or textlink is not None or noInterwiki is not None:
+ logging.debug("All arguments to Category.aslink() are
deprecated.")
+ if sortKey:
+ titleWithSortKey = '%s|%s' % (self.title(withSection=False),
+ sortKey)
+ else:
+ titleWithSortKey = self.title(withSection=False)
+ return '[[%s]]' % titleWithSortKey
+
+ def subcategories(self, recurse=False):
+ """Iterate all subcategories of the current category.
+
+ @param recurse: if not False or 0, also iterate subcategories of
+ subcategories. If an int, limit recursion to this number of
+ levels. (Example: recurse=1 will iterate direct subcats and
+ first-level sub-sub-cats, but no deeper.)
+ @type recurse: int or bool
+
+ """
+ if not isinstance(recurse, bool) and recurse:
+ recurse = recurse - 1
+ if not hasattr(self, "_subcats"):
+ self._subcats = []
+ for member in self.site().categorymembers(self, namespaces=[14]):
+ subcat = Category(self.site(), member.title())
+ self._subcats.append(subcat)
+ yield subcat
+ if recurse:
+ for item in subcat.subcategories(recurse):
+ yield item
+ else:
+ for subcat in self._subcats:
+ yield subcat
+ if recurse:
+ for item in subcat.subcategories(recurse):
+ yield item
+
+ def articles(self, recurse=False, startFrom=None):
+ """
+ Yields all articles in the current category.
+
+ @param recurse: if not False or 0, also iterate articles in
+ subcategories. If an int, limit recursion to this number of
+ levels. (Example: recurse=1 will iterate articles in first-level
+ subcats, but no deeper.)
+ @type recurse: int or bool
+
+ """
+ namespaces = self.site().namespaces()
+ namespaces.remove(14)
+ for member in self.site().categorymembers(self, namespaces=namespaces):
+ yield member
+ if recurse:
+ if not isinstance(recurse, bool) and recurse:
+ recurse = recurse - 1
+ for subcat in self.subcategories():
+ for article in subcat.articles(recurse):
+ yield article
+
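How the recurse argument plays out, for a hypothetical category tree:

    cat = pywikibot.Category(site, u"Category:Example")
    list(cat.articles())            # direct member articles only
    list(cat.articles(recurse=1))   # also articles in first-level subcats
    for subcat in cat.subcategories(recurse=True):
        pass                        # walks the entire subcategory tree
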
+ def isEmptyCategory(self):
+ """Return True if category has no members (including
subcategories)."""
+ for member in self.site().categorymembers(self, limit=1):
+ return False
+ return True
+
+ def copyTo(self, catname):
+ """
+ Copy text of category page to a new page. Does not move contents.
+
+ @param catname: New category title (without namespace)
+ @return: True if copying was successful, False if target page
+ already existed.
+
+ """
+ # This seems far too specialized to be in the top-level framework
+ catname = self.site().category_namespace() + ':' + catname
+ targetCat = Category(self.site(), catname)
+ if targetCat.exists():
+ pywikibot.output('Target page %s already exists!'
+ % targetCat.title())
+ return False
+ else:
+ pywikibot.output('Moving text from %s to %s.'
+ % (self.title(), targetCat.title()))
+ authors = ', '.join(self.contributingUsers())
+ creationSummary = pywikibot.translate(
+ self.site(), msg_created_for_renaming
+ ) % (self.title(), authors)
+ targetCat.put(self.get(), creationSummary)
+ return True
+
+ def copyAndKeep(self, catname, cfdTemplates):
+ """Copy partial category page text (not contents) to a new title.
+
+ Like copyTo above, except this removes a list of templates (like
+ deletion templates) that appear in the old category text. It also
+ removes all text between the two HTML comments BEGIN CFD TEMPLATE
+ and END CFD TEMPLATE. (This is to deal with CFD templates that are
+ substituted.)
+
+ Returns true if copying was successful, false if target page already
+ existed.
+
+ @param catname: New category title (without namespace)
+ @param cfdTemplates: A list (or iterator) of templates to be removed
+ from the page text
+ @return: True if copying was successful, False if target page
+ already existed.
+
+ """
+ # I don't see why we need this as part of the framework either
+ catname = self.site().category_namespace() + ':' + catname
+ targetCat = Category(self.site(), catname)
+ if targetCat.exists():
+ pywikibot.output('Target page %s already exists!'
+ % targetCat.title())
+ return False
+ else:
+ pywikibot.output('Moving text from %s to %s.'
+ % (self.title(), targetCat.title()))
+ authors = ', '.join(self.contributingUsers())
+ creationSummary = pywikibot.translate(
+ self.site(), msg_created_for_renaming
+ ) % (self.title(), authors)
+ newtext = self.get()
+ for regexName in cfdTemplates:
+ matchcfd = re.compile(r"{{%s.*?}}" % regexName, re.IGNORECASE)
+                newtext = matchcfd.sub('', newtext)
+ matchcomment = re.compile(
+                        r"<!--BEGIN CFD TEMPLATE-->.*?<!--END CFD TEMPLATE-->",
+ re.IGNORECASE | re.MULTILINE | re.DOTALL)
+ newtext = matchcomment.sub('', newtext)
+ pos = 0
+ while (newtext[pos:pos+1] == "\n"):
+ pos = pos + 1
+ newtext = newtext[pos:]
+ targetCat.put(newtext, creationSummary)
+ return True
+
+#### DEPRECATED METHODS ####
+ def subcategoriesList(self, recurse=False):
+        """DEPRECATED: Equivalent to list(self.subcategories(...))"""
+ logging.debug("Category.subcategoriesList() method is deprecated.")
+ return sorted(list(set(self.subcategories(recurse))))
+
+ def articlesList(self, recurse=False):
+        """DEPRECATED: equivalent to list(self.articles(...))"""
+ logging.debug("Category.articlesList() method is deprecated.")
+ return sorted(list(set(self.articles(recurse))))
+
+ def supercategories(self):
+ """DEPRECATED: equivalent to self.categories()"""
+ logging.debug("Category.supercategories() method is deprecated.")
+ return self.categories()
+
+ def supercategoriesList(self):
+        """DEPRECATED: equivalent to list(self.categories(...))"""
+        logging.debug("Category.supercategoriesList() method is deprecated.")
+ return sorted(list(set(self.categories())))
+
+
+class Revision(object):
+    """A structure holding information about a single revision of a Page."""
+ def __init__(self, revid, timestamp, user, anon=False, comment=u"",
+ text=None, minor=False):
+ """All parameters correspond to object attributes (e.g., revid
+ parameter is stored as self.revid)
+
+ @param revid: Revision id number
+ @type revid: int
+ @param text: Revision wikitext.
+ @type text: unicode, or None if text not yet retrieved
+ @param timestamp: Revision time stamp (in MediaWiki text format)
+ @type timestamp: unicode
+ @param user: user who edited this revision
+ @type user: unicode
+ @param anon: user is unregistered
+ @type anon: bool
+ @param comment: edit comment text
+ @type comment: unicode
+ @param minor: edit flagged as minor
+ @type minor: bool
+
+ """
+ self.revid = revid
+ self.text = text
+ self.timestamp = timestamp
+ self.user = user
+ self.anon = anon
+ self.comment = comment
+ self.minor = minor
+
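+# Construction sketch (all values invented; in practice Revision objects
+# would be filled in from API revision data):
+#
+#     rev = Revision(revid=12345, timestamp=u"2008-02-27T20:08:48Z",
+#                    user=u"ExampleBot", comment=u"initial import",
+#                    minor=True)
+#     rev.text is None    # True until the wikitext is actually fetched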
+
+class Link(object):
+ """A Mediawiki link (local or interwiki)
+
+ Has the following attributes:
+
+ - site: The Site object for the wiki linked to
+ - namespace: The namespace of the page linked to (int)
+ - title: The title of the page linked to (unicode); does not include
+ namespace or section
+ - section: The section of the page linked to (unicode or None); this
+ contains any text following a '#' character in the title
+ - anchor: The anchor text (unicode or None); this contains any text
+ following a '|' character inside the link
+
+ """
+ illegal_titles_pattern = re.compile(
+ # Matching titles will be held as illegal.
+        u'''[^ %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+]'''
+ # URL percent encoding sequences interfere with the ability
+ # to round-trip titles -- you can't link to them consistently.
+ u'|%[0-9A-Fa-f]{2}'
+ # XML/HTML character references produce similar issues.
+ u'|&[A-Za-z0-9\x80-\xff]+;'
+ u'|&#[0-9]+;'
+ u'|&#x[0-9A-Fa-f]+;'
+ )
+ namespace_pattern = re.compile("^(.+?)_*:_*(.*)$")
+
+ def __init__(self, text, source=None, defaultNamespace=0):
+ """Parse text into a Link object.
+
+ @param text: the link text (everything appearing between [[ and ]]
+ on a wiki page)
+ @type text: unicode
+ @param source: the Site on which the link was found (not necessarily
+ the site to which the link refers)
+ @type source: Site
+ @param defaultNamespace: a namespace to use if the link does not
+ contain one (defaults to 0)
+ @type defaultNamespace: int
+
+ """
+ # First remove the anchor, which is stored unchanged, if there is one
+ if u"|" in text:
+ text, self.anchor = text.split(u"|", 1)
+ else:
+ self.anchor = None
+
+ if source is None:
+ source = pywikibot.Site()
+ self.source = self.site = source
+
+ # Clean up the name, it can come from anywhere.
+ # Convert HTML entities to unicode
+ t = html2unicode(text)
+
+ # Convert URL-encoded characters to unicode
+ t = url2unicode(t, site=self.site)
+
+ # Normalize unicode string to a NFC (composed) format to allow proper
+ # string comparisons. According to
+        # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/n…
+ # the mediawiki code normalizes everything to NFC, not NFKC (which
+ # might result in information loss).
+ t = unicodedata.normalize('NFC', t)
+
+ # This code was adapted from Title.php : secureAndSplit()
+ #
+ if u'\ufffd' in t:
+ raise Error("Title contains illegal char (\\uFFFD)")
+ self.namespace = defaultNamespace
+
+ # Replace underscores by spaces
+ t = t.replace(u'_', u' ')
+ # replace multiple spaces and underscores with a single space
+        while u"  " in t:
+            t = t.replace(u"  ", u" ")
+ # Strip spaces at both ends
+ t = t.strip()
+ # Remove left-to-right and right-to-left markers.
+        t = t.replace(u'\u200e', u'').replace(u'\u200f', u'')
+
+ # Initial colon indicates main namespace rather than specified default
+ if t.startswith(u':'):
+ self.namespace = 0
+ # remove the colon but continue processing
+ # remove any subsequent whitespace
+ t = t[1:].strip()
+
+ # Namespace or interwiki prefix
+ firstPass = True
+ while True:
+ fam = self.site.family
+
+ m = Link.namespace_pattern.match(t)
+ if m:
+ pre = m.group(1).lower()
+ ns = self.site.getNamespaceIndex(pre)
+ if ns:
+ # Ordinary namespace
+ t = m.group(2)
+ self.namespace = ns
+ elif pre in fam.langs.keys()\
+ or pre in fam.get_known_families(site=self.site):
+
+ if not firstPass:
+ # Can't make a local interwiki link to an interwiki link.
+ # That's just crazy!
+                        raise Error(
+                            "Improperly formatted interwiki link '%s'" % text)
+
+ # Interwiki link
+ t = m.group(2)
+ if pre in fam.langs.keys():
+ newsite = pywikibot.Site(pre, fam)
+ else:
+ otherlang = self.site.lang
+ familyName = fam.get_known_families(site=self.site)[pre]
+ if familyName in ['commons', 'meta']:
+ otherlang = familyName
+ try:
+ newsite = pywikibot.Site(otherlang, familyName)
+ except ValueError:
+ raise Error("""\
+%s is not a local page on %s, and the %s family is
+not supported by PyWikiBot!"""
+                                        % (t, self.site, familyName))
+
+ # Redundant interwiki prefix to the local wiki
+ if newsite == self.site:
+ if not t:
+ # Can't have an empty self-link
+                        raise Error("Invalid link title: '%s'" % text)
+ firstPass = False
+ continue
+ self.site = newsite
+ # If there's an initial colon after the interwiki, that also
+ # resets the default namespace
+ if t.startswith(":"):
+ self.namespace = 0
+ t = t[1:]
+ break
+
+ if u"#" in t:
+ t, sec = t.split(u'#', 1)
+ t, self.section = t.rstrip(), sec.lstrip()
+ else:
+ self.section = None
+
+ # Reject illegal characters.
+ if Link.illegal_titles_pattern.search(t):
+            raise Error("Invalid title (contains illegal char(s)): '%s'"
+                        % text)
+
+ # Pages with "/./" or "/../" appearing in the URLs will
+ # often be unreachable due to the way web browsers deal
+        # with 'relative' URLs. Forbid them explicitly.
+
+ if u'.' in t and (
+ t == u'.' or t == u'..'
+ or t.startswith(u"./")
+ or t.startswith(u"../")
+ or u"/./" in t
+ or u"/../" in t
+ or t.endswith(u"/.")
+ or t.endswith(u"/..")
+ ):
+            raise Error("Invalid title (contains . / combinations): '%s'"
+                        % text)
+
+ # Magic tilde sequences? Nu-uh!
+ if u"~~~" in t:
+ raise Error("Invalid title (contains ~~~): '%s'" % text)
+
+ if self.namespace != -1 and len(t) > 255:
+ raise Error("Invalid title (over 255 bytes): '%s'" % t)
+
+ if self.site.case() == 'first-letter':
+ t = t[:1].upper() + t[1:]
+
+ # Can't make a link to a namespace alone...
+ # "empty" local links can only be self-links
+ # with a fragment identifier.
+ if not t and self.site == self.source and self.namespace != 0:
+            raise ValueError("Invalid link (no page title): '%s'" % text)
+
+ self.title = t
+
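+    # Parsing sketches (assuming a default English-language site whose
+    # family resolves the 'Help' prefix and knows the 'de' language code):
+    #
+    #     l = Link(u"Help:Contents#Searching|the manual")
+    #     # l.namespace == 12, l.title == u"Contents",
+    #     # l.section == u"Searching", l.anchor == u"the manual"
+    #
+    #     l = Link(u"de:Hauptseite")
+    #     # interwiki prefix: l.site is the 'de' Site of the same family,
+    #     # l.namespace == 0, l.title == u"Hauptseite"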
+
+# Utility functions for parsing page titles
+
+def html2unicode(text, ignore = []):
+    """Return text, replacing HTML entities by equivalent unicode
+    characters."""
+ # This regular expression will match any decimal and hexadecimal entity and
+ # also entities that might be named entities.
+ entityR = re.compile(
+        r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
+ # These characters are Html-illegal, but sadly you *can* find some of
+ # these and converting them to unichr(decimal) is unsuitable
+ convertIllegalHtmlEntities = {
+ 128 : 8364, # €
+ 130 : 8218, # ‚
+ 131 : 402, # ƒ
+ 132 : 8222, # „
+ 133 : 8230, # …
+ 134 : 8224, # †
+ 135 : 8225, # ‡
+ 136 : 710, # ˆ
+ 137 : 8240, # ‰
+ 138 : 352, # Š
+ 139 : 8249, # ‹
+ 140 : 338, # Œ
+ 142 : 381, # Ž
+ 145 : 8216, # ‘
+ 146 : 8217, # ’
+ 147 : 8220, # “
+ 148 : 8221, # ”
+ 149 : 8226, # •
+ 150 : 8211, # –
+ 151 : 8212, # —
+ 152 : 732, # ˜
+ 153 : 8482, # ™
+ 154 : 353, # š
+ 155 : 8250, # ›
+ 156 : 339, # œ
+ 158 : 382, # ž
+ 159 : 376 # Ÿ
+ }
+    # ensuring that illegal &#129;, &#141; and &#157;, which have no known
+    # values, don't get converted to unichr(129), unichr(141) or unichr(157)
+ ignore = set(ignore) | set([129, 141, 157])
+ result = u''
+ i = 0
+ found = True
+ while found:
+ text = text[i:]
+ match = entityR.search(text)
+ if match:
+ unicodeCodepoint = None
+ if match.group('decimal'):
+ unicodeCodepoint = int(match.group('decimal'))
+ elif match.group('hex'):
+ unicodeCodepoint = int(match.group('hex'), 16)
+ elif match.group('name'):
+ name = match.group('name')
+ if htmlentitydefs.name2codepoint.has_key(name):
+ # We found a known HTML entity.
+ unicodeCodepoint = htmlentitydefs.name2codepoint[name]
+ result += text[:match.start()]
+ try:
+ unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
+ except KeyError:
+ pass
+        if unicodeCodepoint and unicodeCodepoint not in ignore \
+                and (WIDEBUILD or unicodeCodepoint < 65534):
+ result += unichr(unicodeCodepoint)
+ else:
+ # Leave the entity unchanged
+ result += text[match.start():match.end()]
+ i = match.end()
+ else:
+ result += text
+ found = False
+ return result
+
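+# Behavior sketch: decimal, hexadecimal, and named entities normalize to
+# the same character, and cp1252 codepoints hiding in numeric entities are
+# remapped through convertIllegalHtmlEntities:
+#
+#     html2unicode(u"caf&eacute;")  # u'caf\xe9'
+#     html2unicode(u"&#233;")       # u'\xe9'
+#     html2unicode(u"&#xE9;")       # u'\xe9'
+#     html2unicode(u"&#153;")       # u'\u2122'
+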
+def url2unicode(title, site, site2 = None):
+ """Convert url-encoded text to unicode using site's encoding.
+
+ If site2 is provided, try its encodings as well. Uses the first encoding
+ that doesn't cause an error.
+
+ """
+ # create a list of all possible encodings for both hint sites
+ encList = [site.encoding()] + list(site.encodings())
+    if site2 and site2 != site:
+ encList.append(site2.encoding())
+ encList += list(site2.encodings())
+ firstException = None
+ # try to handle all encodings (will probably retry utf-8)
+ for enc in encList:
+ try:
+ t = title.encode(enc)
+ t = urllib.unquote(t)
+ return unicode(t, enc)
+ except UnicodeError, ex:
+ if not firstException:
+ firstException = ex
+ pass
+ # Couldn't convert, raise the original exception
+ raise firstException
+
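+# Sketch (assumes a site whose primary encoding is utf-8, as on Wikimedia
+# wikis): percent-escaped bytes decode back to the unicode title:
+#
+#     url2unicode(u"Caf%C3%A9", site=pywikibot.Site())  # u'Caf\xe9'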
Added: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py (rev 0)
+++ branches/rewrite/pywikibot/site.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -0,0 +1,1947 @@
+# -*- coding: utf-8 -*-
+"""
+Objects representing MediaWiki sites (wikis) and families (groups of wikis
+on the same topic in different languages).
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+import pywikibot
+from pywikibot.exceptions import *
+from pywikibot.data import api
+
+import logging
+import os
+import sys
+
+def Family(fam=None, fatal=True):
+ """Import the named family.
+
+ @param fam: family name (if omitted, uses the configured default)
+ @type fam: str
+ @param fatal: if True, the bot will stop running if the given family is
+ unknown. If False, it will only raise a ValueError exception.
+ @param fatal: bool
+ @return: a Family instance configured for the named family.
+
+ """
+ if fam == None:
+ fam = pywikibot.default_family
+ try:
+ # first try the built-in families
+ exec "import pywikibot.families.%s_family as myfamily" % fam
+ except ImportError:
+ # next see if user has defined a local family module
+ try:
+ sys.path.append(pywikibot.config.datafilepath('families'))
+ exec "import %s_family as myfamily" % fam
+ except ImportError:
+ if fatal:
+                pywikibot.output(u"""\
+Error importing the %s family. This probably means the family
+does not exist. Also check your configuration file."""
+                                 % fam)
+ import traceback
+ traceback.print_stack()
+ sys.exit(1)
+ else:
+ raise ValueError("Family %s does not exist" % repr(fam))
+ return myfamily.Family()
+
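+# Usage sketch: a family can be named explicitly or left to the configured
+# default; an unknown name raises (or exits, when fatal is True):
+#
+#     fam = Family("wikipedia")         # built-in family module
+#     fam = Family()                    # pywikibot.default_family
+#     Family("nosuchfam", fatal=False)  # raises ValueError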
+
+class BaseSite(object):
+    """Site methods that are independent of the communication interface."""
+ # to implement a specific interface, define a Site class that inherits
+ # from this
+ def __init__(self, code, fam=None, user=None):
+ """
+ @param code: the site's language code
+ @type code: str
+ @param fam: wiki family name (optional)
+ @type fam: str or Family
+ @param user: bot user name (optional)
+ @type user: str
+
+ """
+ self._lang = code.lower()
+ if isinstance(fam, basestring) or fam is None:
+ self._family = Family(fam, fatal=False)
+ else:
+ self._family = fam
+
+## # if we got an outdated language code, use the new one instead.
+## if self._family.obsolete.has_key(self._lang):
+## if self._family.obsolete[self._lang] is not None:
+## self._lang = self._family.obsolete[self._lang]
+## else:
+## # no such language anymore
+## raise NoSuchSite("Language %s in family %s is obsolete"
+## % (self._lang, self._family.name))
+##
+## if self._lang not in self.languages():
+##            if self._lang == 'zh-classic' \
+##                    and 'zh-classical' in self.languages():
+## self._lang = 'zh-classical'
+## # database hack (database is varchar[10] -> zh-classical
+## # is cut to zh-classic.
+## else:
+## raise NoSuchSite("Language %s does not exist in family %s"
+## % (self._lang, self._family.name))
+ self._username = user
+
+ def family(self):
+ """Return the associated Family object."""
+ return self._family
+
+ def language(self):
+ """Return the site's language code."""
+ # N.B. this code does not always identify a language as such, but
+ # may identify a wiki that is part of any family grouping
+ return self._lang
+
+ def user(self):
+        """Return the currently-logged in bot user, or None."""
+ if self.logged_in():
+ return self._username
+ return None
+
+ def __getattr__(self, attr):
+        """Calls to methods not defined in this object are passed to Family."""
+ try:
+ method = getattr(self.family(), attr)
+ return lambda self=self: method(self.language())
+ except AttributeError:
+ raise AttributeError("%s instance has no attribute '%s'"
+ % (self.__class__.__name__, attr)
+ )
+
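+    # Delegation sketch: attribute lookups that fail on the Site fall
+    # through to the Family and are called with this site's language code,
+    # so (method availability depends on the family module):
+    #
+    #     site.hostname()  # == site.family().hostname(site.language())
+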
+ def sitename(self):
+        """Return string representing this Site's name and language."""
+        return self.family().name + ':' + self.language()
+
+ __str__ = sitename
+
+ def __repr__(self):
+        return 'Site("%s", "%s")' % (self.language(), self.family().name)
+
+ def linktrail(self):
+        """Return regex for trailing chars displayed as part of a link."""
+ return self.family().linktrail(self.language())
+
+ def languages(self):
+        """Return list of all valid language codes for this site's Family."""
+ return self.family().langs.keys()
+
+ def getNamespaceIndex(self, namespace):
+        """Given a namespace name, return its int index, or None if invalid."""
+ return self.family().getNamespaceIndex(self.language(), namespace)
+
+
+class APISite(BaseSite):
+ """API interface to MediaWiki site.
+
+ Do not use directly; use pywikibot.Site function.
+
+ """
+## Site methods from version 1.0 (as these are implemented in this file,
+## or declared deprecated/obsolete, they will be removed from this list)
+##########
+## validLanguageLinks: A list of language codes that can be used in interwiki
+## links.
+##
+## messages: return True if there are new messages on the site
+## cookies: return user's cookies as a string
+##
+## getUrl: retrieve an URL from the site
+## urlEncode: Encode a query to be sent using an http POST request.
+## postForm: Post form data to an address at this site.
+## postData: Post encoded form data to an http address at this site.
+##
+## namespace(num): Return local name of namespace 'num'.
+## normalizeNamespace(value): Return preferred name for namespace 'value' in
+## this Site's language.
+## namespaces: Return list of canonical namespace names for this Site.
+## getNamespaceIndex(name): Return the int index of namespace 'name', or None
+## if invalid.
+##
+## redirect: Return the localized redirect tag for the site.
+## redirectRegex: Return compiled regular expression matching on redirect
+## pages.
+## mediawiki_message: Retrieve the text of a specified MediaWiki message
+## has_mediawiki_message: True if this site defines specified MediaWiki
+## message
+##
+## shared_image_repository: Return tuple of image repositories used by this
+## site.
+## category_on_one_line: Return True if this site wants all category links
+## on one line.
+## interwiki_putfirst: Return list of language codes for ordering of
+## interwiki links.
+## linkto(title): Return string in the form of a wikilink to 'title'
+## isInterwikiLink(s): Return True if 's' is in the form of an interwiki
+## link.
+## getSite(lang): Return Site object for wiki in same family, language
+## 'lang'.
+## version: Return MediaWiki version string from Family file.
+## versionnumber: Return int identifying the MediaWiki version.
+## live_version: Return version number read from Special:Version.
+## checkCharset(charset): Warn if charset doesn't match family file.
+##
+## linktrail: Return regex for trailing chars displayed as part of a link.
+## disambcategory: Category in which disambiguation pages are listed.
+##
+## Methods that yield Page objects derived from a wiki's Special: pages
+## (note, some methods yield other information in a tuple along with the
+## Pages; see method docs for details) --
+##
+## search(query): query results from Special:Search
+## allpages(): Special:Allpages
+## prefixindex(): Special:Prefixindex
+## newpages(): Special:Newpages
+## newimages(): Special:Log&type=upload
+## longpages(): Special:Longpages
+## shortpages(): Special:Shortpages
+## categories(): Special:Categories (yields Category objects)
+## deadendpages(): Special:Deadendpages
+## ancientpages(): Special:Ancientpages
+## lonelypages(): Special:Lonelypages
+## unwatchedpages(): Special:Unwatchedpages (sysop accounts only)
+## uncategorizedcategories(): Special:Uncategorizedcategories (yields
+## Category objects)
+## uncategorizedpages(): Special:Uncategorizedpages
+## uncategorizedimages(): Special:Uncategorizedimages (yields
+## ImagePage objects)
+##   unusedcategories(): Special:Unusedcategories (yields Category)
+## unusedfiles(): Special:Unusedimages (yields ImagePage)
+## withoutinterwiki: Special:Withoutinterwiki
+## linksearch: Special:Linksearch
+##
+## Convenience methods that provide access to properties of the wiki Family
+## object; all of these are read-only and return a unicode string unless
+## noted --
+##
+## encoding: The current encoding for this site.
+## encodings: List of all historical encodings for this site.
+## category_namespace: Canonical name of the Category namespace on this
+## site.
+## category_namespaces: List of all valid names for the Category
+## namespace.
+## image_namespace: Canonical name of the Image namespace on this site.
+## template_namespace: Canonical name of the Template namespace on this
+## site.
+##   protocol: Protocol ('http' or 'https') for access to this site.
+## hostname: Host portion of site URL.
+## path: URL path for index.php on this Site.
+## dbName: MySQL database name.
+##
+## Methods that return addresses to pages on this site (usually in
+## Special: namespace); these methods only return URL paths, they do not
+## interact with the wiki --
+##
+## export_address: Special:Export.
+## query_address: URL path + '?' for query.php
+## api_address: URL path + '?' for api.php
+## apipath: URL path for api.php
+## move_address: Special:Movepage.
+## delete_address(s): Delete title 's'.
+## undelete_view_address(s): Special:Undelete for title 's'
+## undelete_address: Special:Undelete.
+## protect_address(s): Protect title 's'.
+## unprotect_address(s): Unprotect title 's'.
+## put_address(s): Submit revision to page titled 's'.
+## get_address(s): Retrieve page titled 's'.
+## nice_get_address(s): Short URL path to retrieve page titled 's'.
+## edit_address(s): Edit form for page titled 's'.
+## purge_address(s): Purge cache and retrieve page 's'.
+## block_address: Block an IP address.
+## unblock_address: Unblock an IP address.
+## blocksearch_address(s): Search for blocks on IP address 's'.
+## linksearch_address(s): Special:Linksearch for target 's'.
+## search_address(q): Special:Search for query 'q'.
+## allpages_address(s): Special:Allpages.
+## newpages_address: Special:Newpages.
+## longpages_address: Special:Longpages.
+## shortpages_address: Special:Shortpages.
+## unusedfiles_address: Special:Unusedimages.
+## categories_address: Special:Categories.
+## deadendpages_address: Special:Deadendpages.
+## ancientpages_address: Special:Ancientpages.
+## lonelypages_address: Special:Lonelypages.
+## unwatchedpages_address: Special:Unwatchedpages.
+## uncategorizedcategories_address: Special:Uncategorizedcategories.
+## uncategorizedimages_address: Special:Uncategorizedimages.
+## uncategorizedpages_address: Special:Uncategorizedpages.
+## unusedcategories_address: Special:Unusedcategories.
+## withoutinterwiki_address: Special:Withoutinterwiki.
+##   references_address(s): Special:Whatlinkshere for page 's'.
+## allmessages_address: Special:Allmessages.
+## upload_address: Special:Upload.
+## double_redirects_address: Special:Doubleredirects.
+## broken_redirects_address: Special:Brokenredirects.
+## login_address: Special:Userlogin.
+## captcha_image_address(id): Special:Captcha for image 'id'.
+## watchlist_address: Special:Watchlist editor.
+## contribs_address(target): Special:Contributions for user 'target'.
+
+ def __init__(self, code, fam=None, user=None):
+ BaseSite.__init__(self, code, fam, user)
+ self._namespaces = {
+ # these are the MediaWiki built-in names, which always work
+ # localized names are loaded later upon accessing the wiki
+ -2: [u"Media"],
+ -1: [u"Special"],
+ 0: [u""],
+ 1: [u"Talk"],
+ 2: [u"User"],
+ 3: [u"User talk"],
+ 4: [u"Project"],
+ 5: [u"Project talk"],
+ 6: [u"Image"],
+ 7: [u"Image talk"],
+ 8: [u"MediaWiki"],
+ 9: [u"MediaWiki talk"],
+ 10: [u"Template"],
+ 11: [u"Template talk"],
+ 12: [u"Help"],
+ 13: [u"Help talk"],
+ 14: [u"Category"],
+ 15: [u"Category talk"],
+ }
+ return
+# START HERE
+ self._mediawiki_messages = {}
+ self.nocapitalize = self._lang in self.family().nocapitalize
+ self._userData = [False, False]
+ self._userName = [None, None]
+ self._isLoggedIn = [None, None]
+ self._isBlocked = [None, None]
+ self._messages = [None, None]
+ self._rights = [None, None]
+ self._token = [None, None]
+ self._cookies = [None, None]
+ # Calculating valid languages took quite long, so we calculate it once
+ # in initialization instead of each time it is used.
+ self._validlanguages = []
+ for language in self.languages():
+ if not language[:1].upper() + language[1:] in self.namespaces():
+ self._validlanguages.append(language)
+
+ def logged_in(self, sysop=False):
+        """Return True if logged in with specified privileges, otherwise
+        False.
+
+ @param sysop: if True, require sysop privileges.
+
+ """
+ if not hasattr(self, '_userinfo'):
+ return False
+ if self._userinfo['name'] != self._username:
+ return False
+ return (not sysop) or 'sysop' in self._userinfo['groups']
+
+ def loggedInAs(self, sysop = False):
+        """Return the current username if logged in, otherwise return None.
+
+ DEPRECATED (use .user() method instead)
+ Checks if we're logged in by loading a page and looking for the login
+ link. We assume that we're not being logged out during a bot run, so
+ loading the test page is only required once.
+
+ """
+ logging.debug("Site.loggedInAs() method is deprecated.")
+ return self.logged_in(sysop) and self.user()
+
+ def login(self, sysop=False):
+ """Log the user in if not already logged in."""
+ if not self.logged_in(sysop):
+ loginMan = api.LoginManager(site=self, sysop=sysop)
+ if loginMan.login(retry = True):
+ self._username = loginMan.username
+ if hasattr(self, "_userinfo"):
+ del self._userinfo
+ self.getuserinfo()
+
+ forceLogin = login # alias for backward-compatibility
+
+ def getuserinfo(self):
+ """Retrieve userinfo from site and store in _userinfo attribute.
+
+ self._userinfo will be a dict with the following keys and values:
+
+ - id: user id (numeric str)
+ - name: username (if user is logged in)
+ - anon: present if user is not logged in
+ - groups: list of groups (could be empty)
+ - rights: list of rights (could be empty)
+ - message: present if user has a new message on talk page
+ - blockinfo: present if user is blocked (dict)
+
+ """
+ if not hasattr(self, "_userinfo"):
+ uirequest = api.Request(
+ site=self,
+ action="query",
+ meta="userinfo",
+ uiprop="blockinfo|hasmsg|groups|rights"
+ )
+ uidata = uirequest.submit()
+ assert 'query' in uidata, \
+ "API userinfo response lacks 'query' key"
+ uidata = uidata['query']
+ assert 'userinfo' in uidata, \
+ "API userinfo response lacks 'userinfo' key"
+ self._userinfo = uidata['userinfo']
+ return self._userinfo
+
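+    # Inspection sketch (assumes a logged-in session; the field names inside
+    # 'blockinfo' follow the MediaWiki API and are illustrative):
+    #
+    #     info = site.getuserinfo()
+    #     is_sysop = 'sysop' in info['groups']
+    #     if 'blockinfo' in info:
+    #         print u"blocked by %s" % info['blockinfo'].get('blockedby')
+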
+ def getsiteinfo(self):
+        """Retrieve siteinfo from site and store in _siteinfo attribute."""
+ if not hasattr(self, "_siteinfo"):
+ sirequest = api.Request(
+ site=self,
+ action="query",
+ meta="siteinfo",
+ siprop="general|namespaces|namespacealiases"
+ )
+ try:
+ sidata = sirequest.submit()
+ except api.APIError:
+ # hack for older sites that don't support 1.12 properties
+ sirequest = api.Request(
+ site=self,
+ action="query",
+ meta="siteinfo",
+ siprop="general|namespaces"
+ )
+ sidata = sirequest.submit()
+
+ assert 'query' in sidata, \
+ "API siteinfo response lacks 'query' key"
+ sidata = sidata['query']
+ assert 'general' in sidata, \
+ "API siteinfo response lacks 'general' key"
+ assert 'namespaces' in sidata, \
+ "API siteinfo response lacks 'namespaces' key"
+ self._siteinfo = sidata['general']
+ nsdata = sidata['namespaces']
+ for nskey in nsdata:
+ ns = int(nskey)
+ if ns in self._namespaces:
+ if nsdata[nskey]["*"] in self._namespaces[ns]:
+ continue
+ # this is the preferred form so it goes at front of list
+ self._namespaces[ns].insert(0, nsdata[nskey]["*"])
+ else:
+ self._namespaces[ns] = [nsdata[nskey]["*"]]
+ if 'namespacealiases' in sidata:
+ aliasdata = sidata['namespacealiases']
+ for item in aliasdata:
+ # this is a less preferred form so it goes at the end
+                    self._namespaces[int(item['id'])].append(item["*"])
+ return self._siteinfo
+
+    def case(self):
+        """Return this site's capitalization setting (from siteinfo)."""
+        return self.getsiteinfo()['case']
+
+    def namespace(self, num, all = False):
+        """Return string containing local name of namespace 'num'.
+
+        If optional argument 'all' is true, return a tuple of all recognized
+        values for this namespace.
+
+        """
+        if all:
+            return tuple(self._namespaces[num])
+        return self._namespaces[num][0]
+
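+    # Sketch: once getsiteinfo() has run, the localized name is first in the
+    # list and the canonical English name remains available as an alias:
+    #
+    #     site.namespace(14)            # e.g. u'Kategorie' on de.wikipedia
+    #     site.namespace(14, all=True)  # (u'Kategorie', u'Category', ...)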
+
+class NotImplementedYet:
+
+ def isBlocked(self, sysop = False):
+ """Check if the user is blocked."""
+ try:
+            text = self.getUrl(
+                u'%saction=query&meta=userinfo&uiprop=blockinfo'
+                % self.api_address(), sysop=sysop)
+ return text.find('blockedby=') > -1
+ except NotImplementedError:
+ return False
+
+ def isAllowed(self, right, sysop = False):
+ """Check if the user has a specific right.
+ Among possible rights:
+ * Actions: edit, move, delete, protect, upload
+ * User levels: autoconfirmed, sysop, bot, empty string (always true)
+ """
+ if right == '' or right == None:
+ return True
+ else:
+ self._load(sysop = sysop)
+ index = self._userIndex(sysop)
+ return right in self._rights[index]
+
+ def messages(self, sysop = False):
+        """Return True if the user has new messages, False otherwise."""
+ self._load(sysop = sysop)
+ index = self._userIndex(sysop)
+ return self._messages[index]
+
+ def cookies(self, sysop = False):
+        """Return a string containing the user's current cookies."""
+ self._loadCookies(sysop = sysop)
+ index = self._userIndex(sysop)
+ return self._cookies[index]
+
+ def _loadCookies(self, sysop = False):
+ """Retrieve session cookies for login"""
+ index = self._userIndex(sysop)
+ if self._cookies[index] is not None:
+ return
+ try:
+ if sysop:
+ try:
+ username = config.sysopnames[self.family().name][self.language()]
+ except KeyError:
+ raise NoUsername("""\
+You tried to perform an action that requires admin privileges, but you haven't
+entered your sysop name in your user-config.py. Please add
+sysopnames['%s']['%s']='name' to your user-config.py"""
+ % (self.family().name, self.language()))
+ else:
+ username = config.usernames[self.family().name][self.language()]
+ except KeyError:
+ self._cookies[index] = None
+ self._isLoggedIn[index] = False
+ else:
+ tmp = '%s-%s-%s-login.data' % (
+ self.family().name, self.language(), username)
+ fn = config.datafilepath('login-data', tmp)
+ if not os.path.exists(fn):
+ self._cookies[index] = None
+ self._isLoggedIn[index] = False
+ else:
+ f = open(fn)
+                    self._cookies[index] = '; '.join(
+                        [x.strip() for x in f.readlines()])
+ f.close()
+
+ def urlEncode(self, query):
+        """Encode a query so that it can be sent using an http POST request."""
+ if not query:
+ return None
+ if hasattr(query, 'iteritems'):
+ iterator = query.iteritems()
+ else:
+ iterator = iter(query)
+ l = []
+ wpEditToken = None
+ for key, value in iterator:
+ if isinstance(key, unicode):
+ key = key.encode('utf-8')
+ if isinstance(value, unicode):
+ value = value.encode('utf-8')
+ key = urllib.quote(key)
+ value = urllib.quote(value)
+ if key == 'wpEditToken':
+ wpEditToken = value
+ continue
+ l.append(key + '=' + value)
+
+ # wpEditToken is explicitly added as last value.
+ # If a premature connection abort occurs while putting, the server will
+ # not have received an edit token and thus refuse saving the page
+ if wpEditToken != None:
+ l.append('wpEditToken=' + wpEditToken)
+ return '&'.join(l)
+
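+    # Encoding sketch (for when this method is ported out of
+    # NotImplementedYet; values illustrative): values are utf-8 encoded and
+    # percent-quoted, and wpEditToken is deliberately moved to the end:
+    #
+    #     site.urlEncode({'wpEditToken': 'abc', 'title': u'Caf\xe9'})
+    #     # -> 'title=Caf%C3%A9&wpEditToken=abc'
+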
+ def postForm(self, address, predata, sysop=False, useCookie=True):
+ """Post http form data to the given address at this site.
+
+ address is the absolute path without hostname.
+ predata is a dict or any iterable that can be converted to a dict,
+ containing keys and values for the http form.
+
+ Return a (response, data) tuple, where response is the HTTP
+ response object and data is a Unicode string containing the
+ body of the response.
+
+ """
+ data = self.urlEncode(predata)
+ try:
+ return self.postData(address, data, sysop=sysop,
+ useCookie=useCookie)
+ except socket.error, e:
+ raise ServerError(e)
+
+ def postData(self, address, data,
+ contentType='application/x-www-form-urlencoded',
+ sysop=False, useCookie=True, compress=True):
+ """Post encoded data to the given http address at this site.
+
+ address is the absolute path without hostname.
+ data is an ASCII string that has been URL-encoded.
+
+ Returns a (response, data) tuple where response is the HTTP
+ response object and data is a Unicode string containing the
+ body of the response.
+ """
+
+ # TODO: add the authenticate stuff here
+
+ if False: #self.persistent_http:
+ conn = self.conn
+ else:
+ # Encode all of this into a HTTP request
+ if self.protocol() == 'http':
+ conn = httplib.HTTPConnection(self.hostname())
+ elif self.protocol() == 'https':
+ conn = httplib.HTTPSConnection(self.hostname())
+ # otherwise, it will crash, as other protocols are not supported
+
+ conn.putrequest('POST', address)
+ conn.putheader('Content-Length', str(len(data)))
+ conn.putheader('Content-type', contentType)
+ conn.putheader('User-agent', useragent)
+ if useCookie and self.cookies(sysop = sysop):
+ conn.putheader('Cookie', self.cookies(sysop = sysop))
+ if False: #self.persistent_http:
+ conn.putheader('Connection', 'Keep-Alive')
+ if compress:
+ conn.putheader('Accept-encoding', 'gzip')
+ conn.endheaders()
+ conn.send(data)
+
+ # Prepare the return values
+ # Note that this can raise network exceptions which are not
+ # caught here.
+ try:
+ response = conn.getresponse()
+ except httplib.BadStatusLine:
+ # Blub.
+ conn.close()
+ conn.connect()
+ return self.postData(address, data, contentType, sysop, useCookie)
+
+ data = response.read()
+
+        if compress and response.getheader('Content-Encoding') == 'gzip':
+            data = decompress_gzip(data)
+
+ data = data.decode(self.encoding())
+ response.close()
+
+ if True: #not self.persistent_http:
+ conn.close()
+
+ # If a wiki page, get user data
+ self._getUserData(data, sysop = sysop)
+
+ return response, data
+
+ def getUrl(self, path, retry = True, sysop = False, data = None, compress = True):
+ """
+ Low-level routine to get a URL from the wiki.
+
+ Parameters:
+ path - The absolute path, without the hostname.
+ retry - If True, retries loading the page when a network error
+ occurs.
+ sysop - If True, the sysop account's cookie will be used.
+ data - An optional dict providing extra post request parameters
+
+ Returns the HTML text of the page converted to unicode.
+ """
+ if False: #self.persistent_http and not data:
+ self.conn.putrequest('GET', path)
+ self.conn.putheader('User-agent', useragent)
+ self.conn.putheader('Cookie', self.cookies(sysop = sysop))
+ self.conn.putheader('Connection', 'Keep-Alive')
+ if compress:
+ self.conn.putheader('Accept-encoding', 'gzip')
+ self.conn.endheaders()
+
+ # Prepare the return values
+ # Note that this can raise network exceptions which are not
+ # caught here.
+ try:
+ response = self.conn.getresponse()
+ except httplib.BadStatusLine:
+ # Blub.
+ self.conn.close()
+ self.conn.connect()
+ return self.getUrl(path, retry, sysop, data, compress)
+
+ text = response.read()
+ headers = dict(response.getheaders())
+
+ else:
+ if self.hostname() in config.authenticate.keys():
+ uo = authenticateURLopener
+ else:
+ uo = MyURLopener()
+ if self.cookies(sysop = sysop):
+ uo.addheader('Cookie', self.cookies(sysop = sysop))
+ if compress:
+ uo.addheader('Accept-encoding', 'gzip')
+
+ url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
+ data = self.urlEncode(data)
+
+ # Try to retrieve the page until it was successfully loaded (just in
+ # case the server is down or overloaded).
+ # Wait for retry_idle_time minutes (growing!) between retries.
+ retry_idle_time = 1
+ retrieved = False
+ while not retrieved:
+ try:
+ if self.hostname() in config.authenticate.keys():
+ if False: # compress:
+ request = urllib2.Request(url, data)
+                            request.add_header('Accept-encoding', 'gzip')
+ opener = urllib2.build_opener()
+ f = opener.open(request)
+ else:
+ f = urllib2.urlopen(url, data)
+ else:
+ f = uo.open(url, data)
+ retrieved = True
+ except KeyboardInterrupt:
+ raise
+ except Exception, e:
+ if retry:
+                        # We assume that the server is down.
+                        # Wait some time, then try again.
+ output(u"%s" % e)
+ output(u"""\
+WARNING: Could not open '%s://%s%s'. Maybe the server or
+your connection is down. Retrying in %i minutes..."""
+ % (self.protocol(), self.hostname(), path,
+ retry_idle_time))
+ time.sleep(retry_idle_time * 60)
+ # Next time wait longer, but not longer than half an hour
+ retry_idle_time *= 2
+ if retry_idle_time > 30:
+ retry_idle_time = 30
+ else:
+ raise
+ text = f.read()
+
+ headers = f.info()
+
+ contentType = headers.get('content-type', '')
+ contentEncoding = headers.get('content-encoding', '')
+
+ # Ensure that all sent data is received
+            if 'content-length' in headers \
+                    and int(headers.get('content-length', '0')) != len(text):
+                output(u'Warning! len(text) does not match content-length: '
+                       u'%s != %s'
+                       % (len(text), headers.get('content-length')))
+ if False: #self.persistent_http
+ self.conn.close()
+ self.conn.connect()
+ return self.getUrl(path, retry, sysop, data, compress)
+
+ if compress and contentEncoding == 'gzip':
+ text = decompress_gzip(text)
+
+ R = re.compile('charset=([^\'\";]+)')
+ m = R.search(contentType)
+ if m:
+ charset = m.group(1)
+ else:
+ output(u"WARNING: No character set found.")
+ # UTF-8 as default
+ charset = 'utf-8'
+ # Check if this is the charset we expected
+ self.checkCharset(charset)
+ # Convert HTML to Unicode
+ try:
+ text = unicode(text, charset, errors = 'strict')
+ except UnicodeDecodeError, e:
+ print e
+            output(u'ERROR: Invalid characters found on %s://%s%s, '
+                   u'replaced by \\ufffd.'
+                   % (self.protocol(), self.hostname(), path))
+ # We use error='replace' in case of bad encoding.
+ text = unicode(text, charset, errors = 'replace')
+
+ # If a wiki page, get user data
+ self._getUserData(text, sysop = sysop)
+
+ return text
+
+ def _getUserData(self, text, sysop = False):
+ """
+        Get user data from the HTML text of a wiki page.
+
+ Parameters:
+ * text - the page text
+ * sysop - is the user a sysop?
+ """
+ if '<div id="globalWrapper">' not in text:
+ # Not a wiki page
+ return
+
+ index = self._userIndex(sysop)
+
+ # Check for blocks - but only if version is 1.11 (userinfo is available)
+ # and the user data was not yet loaded
+ if self.versionnumber() >= 11 and not self._userData[index]:
+ blocked = self.isBlocked(sysop = sysop)
+ if blocked and not self._isBlocked[index]:
+ # Write a warning if not shown earlier
+ if sysop:
+ account = 'Your sysop account'
+ else:
+ account = 'Your account'
+                output(u'WARNING: %s on %s is blocked. Editing using this '
+                       u'account will stop the run.' % (account, self))
+ self._isBlocked[index] = blocked
+
+ # Check for new messages
+ if '<div class="usermessage">' in text:
+ if not self._messages[index]:
+ # User has *new* messages
+ if sysop:
+                    output(u'NOTE: You have new messages in your sysop '
+                           u'account on %s' % self)
+ else:
+ output(u'NOTE: You have new messages on %s' % self)
+ self._messages[index] = True
+ else:
+ self._messages[index] = False
+
+ # Don't perform other checks if the data was already loaded
+ if self._userData[index]:
+ return
+
+        # Search for the user page link at the top.
+ # Note that the link of anonymous users (which doesn't exist at all
+ # in Wikimedia sites) has the ID pt-anonuserpage, and thus won't be
+ # found here.
+        userpageR = re.compile(
+            '<li id="pt-userpage"><a href=".+?">(?P<username>.+?)</a></li>')
+ m = userpageR.search(text)
+ if m:
+ self._isLoggedIn[index] = True
+ self._userName[index] = m.group('username')
+ else:
+ self._isLoggedIn[index] = False
+            # No idea what the user name is, and it isn't important
+ self._userName[index] = None
+
+ # Check user groups, if possible (introduced in 1.10)
+ groupsR = re.compile(r'var wgUserGroups = \[\"(.+)\"\];')
+ m = groupsR.search(text)
+ if m:
+ rights = m.group(1)
+ rights = rights.split('", "')
+ if '*' in rights:
+ rights.remove('*')
+ self._rights[index] = rights
+ # Warnings
+ # Don't show warnings for not logged in users, they will just fail to
+ # do any action
+ if self._isLoggedIn[index]:
+ if 'bot' not in self._rights[index]:
+ if sysop:
+                        output(u'Note: Your sysop account on %s does not '
+                               u'have a bot flag. Its edits will be visible '
+                               u'in the recent changes.' % self)
+                    else:
+                        output(u'WARNING: Your account on %s does not have '
+                               u'a bot flag. Its edits will be visible in '
+                               u'the recent changes and it may get blocked.'
+                               % self)
+ if sysop and 'sysop' not in self._rights[index]:
+                    output(u'WARNING: Your sysop account on %s does not '
+                           u'seem to have sysop rights. You may not be able '
+                           u'to perform any sysop-restricted actions using '
+                           u'it.' % self)
+ else:
+ # We don't have wgUserGroups, and can't check the rights
+ self._rights[index] = []
+ if self._isLoggedIn[index]:
+ # Logged in user
+ self._rights[index].append('user')
+ # Assume bot, and thus autoconfirmed
+ self._rights[index].extend(['bot', 'autoconfirmed'])
+ if sysop:
+ # Assume user reported as a sysop indeed has the sysop rights
+ self._rights[index].append('sysop')
+ # Assume the user has the default rights
+                self._rights[index].extend(['read', 'createaccount', 'edit',
+                                            'upload', 'createpage',
+                                            'createtalk', 'move'])
+            if 'bot' in self._rights[index] \
+                    or 'sysop' in self._rights[index]:
+ self._rights[index].append('apihighlimits')
+ if 'sysop' in self._rights[index]:
+                self._rights[index].extend(['delete', 'undelete', 'block',
+                                            'protect', 'import',
+                                            'deletedhistory',
+                                            'unwatchedpages'])
+
+ # Search for a token
+        tokenR = re.compile(r"\<input type='hidden' value=\"(.*?)\""
+                            r" name=\"wpEditToken\"")
+ tokenloc = tokenR.search(text)
+ if tokenloc:
+ self._token[index] = tokenloc.group(1)
+ if self._rights[index] is not None:
+ # In this case, token and rights are loaded - user data is now loaded
+ self._userData[index] = True
+ else:
+ # Token not found
+ # Possible reason for this is the user is blocked, don't show a
+ # warning in this case, otherwise do show a warning
+ # Another possible reason is that the page cannot be edited - ensure
+ # there is a textarea and the tab "view source" is not shown
+            if u'<textarea' in text \
+                    and u'<li id="ca-viewsource"' not in text \
+                    and not self._isBlocked[index]:
+ # Token not found
+                output(u'WARNING: Token not found on %s. You will not be '
+                       u'able to edit any page.' % self)
+
+ def mediawiki_message(self, key):
+        """Return the MediaWiki message text for key 'key'."""
+ global mwpage, tree
+ if key.lower() not in self._mediawiki_messages.keys() \
+ and not hasattr(self, "_phploaded"):
+ get_throttle()
+ mwpage = self.getUrl("%s?title=%s:%s&action=edit"
+ % (self.path(), urllib.quote(
+ self.namespace(8).replace(' ', '_').encode(
+ self.encoding())),
+ key))
+ tree = BeautifulSoup(mwpage,
+ convertEntities=BeautifulSoup.HTML_ENTITIES,
+ parseOnlyThese=SoupStrainer("textarea"))
+ if tree.textarea is not None and tree.textarea.string is not None:
+ value = tree.textarea.string.strip()
+ else:
+ value = None
+ if value:
+ self._mediawiki_messages[key.lower()] = value
+ else:
+ self._mediawiki_messages[key.lower()] = None
+ # Fallback in case MediaWiki: page method doesn't work
+ if verbose:
+                output(u"Retrieving mediawiki messages "
+                       u"from Special:Allmessages")
+ retry_idle_time = 1
+ while True:
+ get_throttle()
+                phppage = self.getUrl(
+                    self.get_address("Special:Allmessages") + "&ot=php")
+                Rphpvals = re.compile(r"(?ms)'([^']*)' => '(.*?[^\\])',")
+ count = 0
+ for (phpkey, phpval) in Rphpvals.findall(phppage):
+ count += 1
+ self._mediawiki_messages[str(phpkey).lower()] = phpval
+ if count == 0:
+ # No messages could be added.
+ # We assume that the server is down.
+ # Wait some time, then try again.
+                        output(u'WARNING: No messages found in '
+                               u'Special:Allmessages. Maybe the server is '
+                               u'down. Retrying in %i minutes...'
+                               % retry_idle_time)
+ time.sleep(retry_idle_time * 60)
+ # Next time wait longer, but not longer than half an hour
+ retry_idle_time *= 2
+ if retry_idle_time > 30:
+ retry_idle_time = 30
+ continue
+ break
+ self._phploaded = True
+
+ key = key.lower()
+ if self._mediawiki_messages[key] is None:
+ raise KeyError("MediaWiki key '%s' does not exist on %s"
+ % (key, self))
+ return self._mediawiki_messages[key]
+
+ def has_mediawiki_message(self, key):
+        """Return True iff this site defines a MediaWiki message for 'key'."""
+ try:
+ v = self.mediawiki_message(key)
+ return True
+ except KeyError:
+ return False
+
+ def _load(self, sysop = False):
+ """
+        Load user data.
+
+        This is only done if we haven't retrieved any page yet and the
+        information is requested; otherwise we should already have this data.
+
+ Parameters:
+ * sysop - Get sysop user data?
+ """
+ index = self._userIndex(sysop)
+ if self._userData[index]:
+ return
+
+ if verbose:
+ output(u'Getting information for site %s' % self)
+
+ # Get data
+ url = self.edit_address('Non-existing_page')
+ text = self.getUrl(url, sysop = sysop)
+
+ # Parse data
+ self._getUserData(text, sysop = sysop)
+
+ def search(self, query, number = 10, namespaces = None):
+        """Yield search results (using Special:Search page) for query."""
+ throttle = True
+ path = self.search_address(urllib.quote_plus(query),
+ n=number, ns=namespaces)
+ get_throttle()
+ html = self.getUrl(path)
+
+        entryR = re.compile(
+            ur'<li[^>]*><a href=".+?" title="(?P<title>.+?)">.+?</a>'
+            '<br />(?P<match>.*?)<span style="color[^>]*>.+?: '
+            '(?P<relevance>[0-9.]+)% - '
+#            '(?P<size>[0-9.]*) '
+#            '(?P<sizeunit>[A-Za-z]) '
+#            '\((?P<words>.+?) \w+\) - '
+#            '(?P<date>.+?)</span></li>'
+            , re.DOTALL)
+
+ for m in entryR.finditer(html):
+ page = Page(self, m.group('title'))
+ match = m.group('match')
+ relevance = m.group('relevance')
+ #size = m.group('size')
+ ## sizeunit appears to always be "KB"
+ #words = m.group('words')
+ #date = m.group('date')
+
+            #print "%s - %s %s (%s words) - %s" \
+            #    % (relevance, size, sizeunit, words, date)
+
+ #yield page, match, relevance, size, words, date
+ yield page, match, relevance, '', '', ''
+
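+    # Usage sketch (for when this method is ported; it screen-scrapes
+    # Special:Search, so results depend on the wiki's skin and version; the
+    # query string is hypothetical):
+    #
+    #     for page, match, relevance, _, _, _ in site.search(u"pywikibot",
+    #                                                        number=5):
+    #         print page.title(), relevance
+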
+ # TODO: avoid code duplication for the following methods
+ def newpages(self, number = 10, get_redirect = False, repeat = False):
+ """Yield new articles (as Page objects) from Special:Newpages.
+
+ Starts with the newest article and fetches the number of articles
+ specified in the first argument. If repeat is True, it fetches
+ Newpages again. If there is no new page, it blocks until there is
+ one, sleeping between subsequent fetches of Newpages.
+
+ The objects yielded are tuples composed of the Page object,
+ timestamp (unicode), length (int), an empty unicode string, username
+ or IP address (str), comment (unicode).
+
+ """
+ # TODO: in recent MW versions Special:Newpages takes a namespace parameter,
+ # and defaults to 0 if not specified.
+ # TODO: Detection of unregistered users is broken
+ # TODO: Repeat mechanism doesn't make much sense as implemented;
+ # should use both offset and limit parameters, and have an
+ # option to fetch older rather than newer pages
+ seen = set()
+ while True:
+ path = self.newpages_address(n=number)
+ # The throttling is important here, so always enabled.
+ get_throttle()
+ html = self.getUrl(path)
+
+ entryR = re.compile(
+            entryR = re.compile(
+                '<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"'
+                ' title="(?P<title>.+?)">.+?</a>.+?'
+                '[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]'
+                ' .?<a href=".+?" title=".+?:(?P<username>.+?)">'
+                )
+ for m in entryR.finditer(html):
+ date = m.group('date')
+ title = m.group('title')
+                title = title.replace('&quot;', '"')
+                length = int(re.sub("[,.]", "", m.group('length')))
+ loggedIn = u''
+ username = m.group('username')
+ comment = u''
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, date, length, loggedIn, username, comment
+ if not repeat:
+ break
+
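+    # Usage sketch (for when this method is ported): each hit is a
+    # (Page, timestamp, length, '', username, comment) tuple; with
+    # repeat=False this is a single fetch:
+    #
+    #     for page, date, length, _, username, _ in site.newpages(number=20):
+    #         print u"%s (%d bytes) by %s" % (page.title(), length, username)
+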
+ def longpages(self, number = 10, repeat = False):
+ """Yield Pages from Special:Longpages.
+
+ Return values are a tuple of Page object, length(int).
+
+ """
+ #TODO: should use offset and limit parameters; 'repeat' as now
+ # implemented is fairly useless
+ # this comment applies to all the XXXXpages methods following, as well
+ seen = set()
+ while True:
+ path = self.longpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+            entryR = re.compile(
+                ur'<li>\(<a href=".+?" title=".+?">hist</a>\) '
+                ur'<a href=".+?" title="(?P<title>.+?)">.+?</a> '
+                ur'\[(?P<length>\d+)(.+?)\]</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ length = int(m.group('length'))
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, length
+ if not repeat:
+ break
+
+ def shortpages(self, number = 10, repeat = False):
+        """Yield Pages and lengths from Special:Shortpages."""
+ throttle = True
+ seen = set()
+ while True:
+ path = self.shortpages_address(n = number)
+ get_throttle()
+ html = self.getUrl(path)
+            entryR = re.compile(
+                ur'<li>\(<a href=".+?" title=".+?">hist</a>\) '
+                ur'<a href=".+?" title="(?P<title>.+?)">.+?</a> '
+                ur'\[(?P<length>\d+)(.+?)\]</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ length = int(m.group('length'))
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, length
+ if not repeat:
+ break
+
+ def categories(self, number=10, repeat=False):
+        """Yield Category objects from Special:Categories."""
+ import catlib
+ seen = set()
+ while True:
+ path = self.categories_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.*?</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = catlib.Category(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def deadendpages(self, number = 10, repeat = False):
+        """Yield Page objects retrieved from Special:Deadendpages."""
+ seen = set()
+ while True:
+ path = self.deadendpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def ancientpages(self, number = 10, repeat = False):
+        """Yield Pages, datestamps from Special:Ancientpages."""
+ seen = set()
+ while True:
+ path = self.ancientpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a> '
+                '(?P<date>.+?)</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ date = m.group('date')
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page, date
+ if not repeat:
+ break
+
+ def lonelypages(self, number = 10, repeat = False):
+        """Yield Pages retrieved from Special:Lonelypages."""
+ throttle = True
+ seen = set()
+ while True:
+ path = self.lonelypages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def unwatchedpages(self, number = 10, repeat = False):
+        """Yield Pages from Special:Unwatchedpages (requires Admin privileges)."""
+ seen = set()
+ while True:
+ path = self.unwatchedpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path, sysop = True)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.+?</li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def uncategorizedcategories(self, number = 10, repeat = False):
+        """Yield Categories from Special:Uncategorizedcategories."""
+ import catlib
+ seen = set()
+ while True:
+ path = self.uncategorizedcategories_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = catlib.Category(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def newimages(self, number = 10, repeat = False):
+        """Yield ImagePages from Special:Log&type=upload."""
+
+ seen = set()
+        regexp = re.compile(
+            '<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+'
+            '\(.+?</a>\).*?<a href=".*?"(?P<new> class="new")? '
+            'title="(?P<image>.+?)"\s*>'
+            '(?:.*?<span class="comment">(?P<comment>.*?)</span>)?',
+            re.UNICODE)
+
+ while True:
+ path = self.log_address(number, mode = 'upload')
+ get_throttle()
+ html = self.getUrl(path)
+
+ for m in regexp.finditer(html):
+ image = m.group('image')
+
+ if image not in seen:
+ seen.add(image)
+
+ if m.group('new'):
+                        output(u"Image '%s' has been deleted." % image)
+ continue
+
+ date = m.group('date')
+ user = m.group('user')
+ comment = m.group('comment') or ''
+
+ yield ImagePage(self, image), date, user, comment
+ if not repeat:
+ break
+
+ def uncategorizedimages(self, number = 10, repeat = False):
+        """Yield ImagePages from Special:Uncategorizedimages."""
+ seen = set()
+ ns = self.image_namespace()
+ entryR = re.compile(
+            '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns)
+ while True:
+ path = self.uncategorizedimages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = ImagePage(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def uncategorizedpages(self, number = 10, repeat = False):
+ """Yield Pages from Special:Uncategorizedpages."""
+ seen = set()
+ while True:
+ path = self.uncategorizedpages_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def unusedcategories(self, number = 10, repeat = False):
+        """Yield Category objects from Special:Unusedcategories."""
+ import catlib
+ seen = set()
+ while True:
+ path = self.unusedcategories_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+            entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+
+ if title not in seen:
+ seen.add(title)
+ page = catlib.Category(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def unusedfiles(self, number = 10, repeat = False, extension = None):
+        """Yield ImagePage objects from Special:Unusedimages."""
+ seen = set()
+ ns = self.image_namespace()
+ entryR = re.compile(
+            '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns)
+ while True:
+ path = self.unusedfiles_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+ for m in entryR.finditer(html):
+ fileext = None
+ title = m.group('title')
+ if extension:
+                    fileext = title[-3:]
+ if title not in seen and fileext == extension:
+ ## Check whether the media is used in a Proofread page
+ # code disabled because it slows this method down, and
+ # because it is unclear what it's supposed to do.
+ #basename = title[6:]
+ #page = Page(self, 'Page:' + basename)
+
+ #if not page.exists():
+ seen.add(title)
+ image = ImagePage(self, title)
+ yield image
+ if not repeat:
+ break
+
+ def withoutinterwiki(self, number=10, repeat=False):
+        """Yield Pages without language links from Special:Withoutinterwiki."""
+ seen = set()
+ while True:
+ path = self.withoutinterwiki_address(n=number)
+ get_throttle()
+ html = self.getUrl(path)
+            entryR = re.compile(
+                '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
+ for m in entryR.finditer(html):
+ title = m.group('title')
+ if title not in seen:
+ seen.add(title)
+ page = Page(self, title)
+ yield page
+ if not repeat:
+ break
+
+ def allpages(self, start='!', namespace=0, includeredirects=True,
+ throttle=True):
+ """Yield all Pages from Special:Allpages.
+
+ Parameters:
+ start Start at this page. By default, it starts at '!', and yields
+ all pages.
+ namespace Yield all pages in this namespace; defaults to 0.
+ MediaWiki software will only return pages in one namespace
+ at a time.
+
+ If includeredirects is False, redirects will not be found.
+ If includeredirects equals the string 'only', only redirects
+ will be found. Note that this has not been tested on older
+ versions of the MediaWiki code.
+
+ It is advised not to use this directly, but to use the
+ AllpagesPageGenerator from pagegenerators.py instead.
+
+ """
+ while True:
+ # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
+ start = start.encode(self.encoding())
+ start = urllib.quote(start)
+ # load a list which contains a series of article names (always 480)
+ path = self.allpages_address(start, namespace)
+ output(u'Retrieving Allpages special page for %s from %s, namespace %i' % (repr(self), start, namespace))
+ returned_html = self.getUrl(path)
+ # Try to find begin and end markers
+ try:
+ # In 1.4, another table was added above the navigational links
+ if self.versionnumber() >= 4:
+ begin_s = '</table><hr /><table'
+ end_s = '</table'
+ else:
+ begin_s = '<table'
+ end_s = '</table'
+ ibegin = returned_html.index(begin_s)
+ iend = returned_html.index(end_s,ibegin + 3)
+ except ValueError:
+ raise ServerError(
+"Couldn't extract allpages special page. Make sure you're using MonoBook
skin.")
+ # remove the irrelevant sections
+ returned_html = returned_html[ibegin:iend]
+ if self.versionnumber()==2:
+ R = re.compile('/wiki/(.*?)\" *class=[\'\"]printable')
+ elif self.versionnumber()<5:
+ # Apparently the special code for redirects was added in 1.5
+ R = re.compile('title ?=\"(.*?)\"')
+ elif not includeredirects:
+ R = re.compile('\<td(?: width="33%")?\>\<a href=\"\S*\" +title ?="(.*?)"')
+ elif includeredirects == 'only':
+ R = re.compile('\<td(?: width="33%")?>\<[^\<\>]*allpagesredirect\"\>\<a href=\"\S*\" +title ?="(.*?)"')
+ else:
+ R = re.compile('title ?=\"(.*?)\"')
+ # Count the number of useful links on this page
+ n = 0
+ for hit in R.findall(returned_html):
+ # count how many articles we found on the current page
+ n = n + 1
+ if self.versionnumber()==2:
+ yield Page(self, url2link(hit, site = self, insite = self))
+ else:
+ yield Page(self, hit)
+ # save the last hit, so that we know where to continue when we
+ # finished all articles on the current page. Append a '!' so that
+ # we don't yield a page twice.
+ start = Page(self, hit).titleWithoutNamespace() + '!'
+ # A small shortcut: if there are less than 100 pages listed on this
+ # page, there is certainly no next. Probably 480 would do as well,
+ # but better be safe than sorry.
+ if n < 100:
+ if (not includeredirects) or includeredirects == 'only':
+ # Maybe there were only so few because the rest is or is not a redirect
+ R = re.compile('title ?=\"(.*?)\"')
+ allLinks = R.findall(returned_html)
+ if len(allLinks) < 100:
+ break
+ elif n == 0:
+ # In this special case, no pages of the requested type
+ # were found, and "start" will remain and be
double-encoded.
+ # Use the last page as the start of the next page.
+ start = Page(self, allLinks[-1]).titleWithoutNamespace() +
'!'
+ else:
+ break
+
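As the docstring advises, callers normally reach this method through
pagegenerators rather than calling it directly; a hedged sketch (the
generator's keyword names are assumed from the existing pagegenerators
module, not defined in this commit):

    import pagegenerators
    # main-namespace pages from 'M' onward, skipping redirects
    gen = pagegenerators.AllpagesPageGenerator(start=u'M', namespace=0,
                                               includeredirects=False)
    for page in gen:
        print page.title()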
+ def prefixindex(self, prefix, namespace=0, includeredirects=True):
+ """Yield all pages with a given prefix.
+
+ Parameters:
+ prefix The prefix of the pages.
+ namespace Namespace number; defaults to 0.
+ MediaWiki software will only return pages in one namespace
+ at a time.
+
+ If includeredirects is False, redirects will not be found.
+ If includeredirects equals the string 'only', only redirects
+ will be found. Note that this has not been tested on older
+ versions of the MediaWiki code.
+
+ It is advised not to use this directly, but to use the
+ PrefixingPageGenerator from pagegenerators.py instead.
+ """
+ for page in self.allpages(start=prefix, namespace=namespace, includeredirects=includeredirects):
+ if page.titleWithoutNamespace().startswith(prefix):
+ yield page
+ else:
+ break
+
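Since this simply filters allpages() output and stops at the first title
that no longer matches, direct use is straightforward (the prefix below
is illustrative, and 'site' is the assumed Site object from earlier):

    # all main-namespace pages whose titles start with the given prefix
    for page in site.prefixindex(u'List of lakes', namespace=0):
        print page.title()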
+ def linksearch(self, siteurl):
+ """Yield Pages from results of Special:Linksearch for
'siteurl'."""
+ if siteurl.startswith('*.'):
+ siteurl = siteurl[2:]
+ output(u'Querying [[Special:Linksearch]]...')
+ cache = []
+ for url in [siteurl, '*.' + siteurl]:
+ path = self.linksearch_address(url)
+ get_throttle()
+ html = self.getUrl(path)
+ loc = html.find('<div class="mw-spcontent">')
+ if loc > -1:
+ html = html[loc:]
+ loc = html.find('<div class="printfooter">')
+ if loc > -1:
+ html = html[:loc]
+ R = re.compile('title ?=\"(.*?)\"')
+ for title in R.findall(html):
+ if siteurl not in title:
+ # the links themselves have similar form
+ if title in cache:
+ continue
+ else:
+ cache.append(title)
+ yield Page(self, title)
+
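A hedged sketch; the method queries both the bare domain and its '*.'
subdomain form, de-duplicating titles across the two passes:

    # pages containing external links to example.org or its subdomains
    for page in site.linksearch('example.org'):
        print page.title()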
+ def linkto(self, title, othersite = None):
+ """Return unicode string in the form of a wikilink to
'title'
+
+ Use optional Site argument 'othersite' to generate an interwiki link.
+
+ """
+ if othersite and othersite.language() != self.language():
+ return u'[[%s:%s]]' % (self.language(), title)
+ else:
+ return u'[[%s]]' % title
+
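The interwiki form is prefixed with *this* site's language code, so the
result is a link that points back here from 'othersite'. A sketch,
assuming the English-language 'site' from earlier and a second Site
object de_site for German:

    site.linkto(u'Sandbox')                     # u'[[Sandbox]]'
    site.linkto(u'Sandbox', othersite=de_site)  # u'[[en:Sandbox]]'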
+ def isInterwikiLink(self, s):
+ """Return True if s is in the form of an interwiki link.
+
+ Interwiki links have the form "foo:bar" or ":foo:bar" where foo is a
+ known language code or family. Called recursively if the first part
+ of the link refers to this site's own family and/or language.
+
+ """
+ s = s.strip().lstrip(":")
+ if not ':' in s:
+ return False
+ first, rest = s.split(':',1)
+ # interwiki codes are case-insensitive
+ first = first.lower().strip()
+ # commons: forwards interlanguage links to wikipedia:, etc.
+ if self.family().interwiki_forward:
+ interlangTargetFamily = Family(self.family().interwiki_forward)
+ else:
+ interlangTargetFamily = self.family()
+ if self.getNamespaceIndex(first):
+ return False
+ if first in interlangTargetFamily.langs:
+ if first == self.language():
+ return self.isInterwikiLink(rest)
+ else:
+ return True
+ if first in self.family().get_known_families(site = self):
+ if first == self.family().name:
+ return self.isInterwikiLink(rest)
+ else:
+ return True
+ return False
+
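Expected behavior, illustrated for an (assumed) English-language site:

    site.isInterwikiLink(u'de:Hund')        # True: 'de' is a known code
    site.isInterwikiLink(u':en:Dog')        # False: resolves to this site
    site.isInterwikiLink(u'Category:Dogs')  # False: namespace, not a code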
+ def redirect(self, default = False):
+ """Return the localized redirect tag for the site.
+
+ If default is True, falls back to 'REDIRECT' if the site has no
+ special redirect tag.
+
+ """
+ if default:
+ if self.language() == 'ar':
+ # It won't work with REDIRECT[[]], but it works with the local
+ # keyword; if there are problems, try to find a workaround. FixMe!
+ return self.family().redirect.get(self.language(), [u"تحويل"])[0]
+ else:
+ return self.family().redirect.get(self.language(), [u"REDIRECT"])[0]
+ else:
+ return self.family().redirect.get(self.language(), None)
+
+ def redirectRegex(self):
+ """Return a compiled regular expression matching on redirect
pages.
+
+ Group 1 in the regex match object will be the target title.
+
+ """
+ redDefault = 'redirect'
+ red = 'redirect'
+ if self.language() == 'ar':
+ red = u"تحويل"
+ try:
+ if redDefault == red:
+ redirKeywords = [red] + self.family().redirect[self.language()]
+ redirKeywordsR = r'(?:' + '|'.join(redirKeywords) + ')'
+ else:
+ redirKeywords = [red] + self.family().redirect[self.language()]
+ redirKeywordsR = r'(?:' + redDefault + '|' + '|'.join(redirKeywords) + ')'
+ except KeyError:
+ # no localized keyword for redirects
+ if redDefault == red:
+ redirKeywordsR = r'%s' % red
+ else:
+ redirKeywordsR = r'(?:%s|%s)' % (red, redDefault)
+ # A redirect starts with hash (#), followed by a keyword, then
+ # arbitrary stuff, then a wikilink. The wikilink may contain
+ # a label, although this is not useful.
+ return re.compile(r'#' + redirKeywordsR +
+ '.*?\[\[(.*?)(?:\|.*?)?\]\]',
+ re.IGNORECASE | re.UNICODE | re.DOTALL)
+
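A sketch of matching with the compiled expression; group 1 carries the
target title, and matching is case-insensitive:

    regex = site.redirectRegex()
    m = regex.match(u'#REDIRECT [[Main Page]]')
    if m:
        print m.group(1)    # prints: Main Page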
+ # The following methods are for convenience, so that you can access
+ # methods of the Family class easily.
+ def encoding(self):
+ """Return the current encoding for this site."""
+ return self.family().code2encoding(self.language())
+
+ def encodings(self):
+ """Return a list of all historical encodings for this
site."""
+ return self.family().code2encodings(self.language())
+
+ def category_namespace(self):
+ """Return the canonical name of the Category namespace on this
site."""
+ # equivalent to self.namespace(14)?
+ return self.family().category_namespace(self.language())
+
+ def category_namespaces(self):
+ """Return a list of all valid names for the Category
namespace."""
+ return self.family().category_namespaces(self.language())
+
+ def image_namespace(self, fallback = '_default'):
+ """Return the canonical name of the Image namespace on this
site."""
+ # equivalent to self.namespace(6)?
+ return self.family().image_namespace(self.language(), fallback)
+
+ def template_namespace(self, fallback = '_default'):
+ """Return the canonical name of the Template namespace on this
site."""
+ # equivalent to self.namespace(10)?
+ return self.family().template_namespace(self.language(), fallback)
+
+ def export_address(self):
+ """Return URL path for Special:Export."""
+ return self.family().export_address(self.language())
+
+ def query_address(self):
+ """Return URL path + '?' for query.php (if enabled on this
Site)."""
+ return self.family().query_address(self.language())
+
+ def api_address(self):
+ """Return URL path + '?' for api.php (if enabled on this
Site)."""
+ return self.family().api_address(self.language())
+
+ def apipath(self):
+ """Return URL path for api.php (if enabled on this
Site)."""
+ return self.family().apipath(self.language())
+
+ def protocol(self):
+ """Return protocol ('http' or 'https') for access
to this site."""
+ return self.family().protocol(self.language())
+
+ def hostname(self):
+ """Return host portion of site URL."""
+ return self.family().hostname(self.language())
+
+ def path(self):
+ """Return URL path for index.php on this Site."""
+ return self.family().path(self.language())
+
+ def dbName(self):
+ """Return MySQL database name."""
+ return self.family().dbName(self.language())
+
+ def move_address(self):
+ """Return URL path for Special:Movepage."""
+ return self.family().move_address(self.language())
+
+ def delete_address(self, s):
+ """Return URL path to delete title 's'."""
+ return self.family().delete_address(self.language(), s)
+
+ def undelete_view_address(self, s, ts=''):
+ """Return URL path to view Special:Undelete for title 's'
+
+ Optional argument 'ts' returns path to view specific deleted version.
+
+ """
+ return self.family().undelete_view_address(self.language(), s, ts)
+
+ def undelete_address(self):
+ """Return URL path to Special:Undelete."""
+ return self.family().undelete_address(self.language())
+
+ def protect_address(self, s):
+ """Return URL path to protect title
's'."""
+ return self.family().protect_address(self.language(), s)
+
+ def unprotect_address(self, s):
+ """Return URL path to unprotect title
's'."""
+ return self.family().unprotect_address(self.language(), s)
+
+ def put_address(self, s):
+ """Return URL path to submit revision to page titled
's'."""
+ return self.family().put_address(self.language(), s)
+
+ def get_address(self, s):
+ """Return URL path to retrieve page titled
's'."""
+ return self.family().get_address(self.language(), s)
+
+ def nice_get_address(self, s):
+ """Return shorter URL path to retrieve page titled
's'."""
+ return self.family().nice_get_address(self.language(), s)
+
+ def edit_address(self, s):
+ """Return URL path for edit form for page titled
's'."""
+ return self.family().edit_address(self.language(), s)
+
+ def purge_address(self, s):
+ """Return URL path to purge cache and retrieve page
's'."""
+ return self.family().purge_address(self.language(), s)
+
+ def block_address(self):
+ """Return path to block an IP address."""
+ return self.family().block_address(self.language())
+
+ def unblock_address(self):
+ """Return path to unblock an IP address."""
+ return self.family().unblock_address(self.language())
+
+ def blocksearch_address(self, s):
+ """Return path to search for blocks on IP address
's'."""
+ return self.family().blocksearch_address(self.language(), s)
+
+ def linksearch_address(self, s, limit=500, offset=0):
+ """Return path to Special:Linksearch for target
's'."""
+ return self.family().linksearch_address(self.language(), s, limit=limit,
offset=offset)
+
+ def search_address(self, q, n=50, ns=0):
+ """Return path to Special:Search for query
'q'."""
+ return self.family().search_address(self.language(), q, n, ns)
+
+ def allpages_address(self, s, ns = 0):
+ """Return path to Special:Allpages."""
+ return self.family().allpages_address(self.language(), start=s, namespace = ns)
+
+ def log_address(self, n=50, mode = ''):
+ """Return path to Special:Log."""
+ return self.family().log_address(self.language(), n, mode)
+
+ def newpages_address(self, n=50):
+ """Return path to Special:Newpages."""
+ return self.family().newpages_address(self.language(), n)
+
+ def longpages_address(self, n=500):
+ """Return path to Special:Longpages."""
+ return self.family().longpages_address(self.language(), n)
+
+ def shortpages_address(self, n=500):
+ """Return path to Special:Shortpages."""
+ return self.family().shortpages_address(self.language(), n)
+
+ def unusedfiles_address(self, n=500):
+ """Return path to Special:Unusedimages."""
+ return self.family().unusedfiles_address(self.language(), n)
+
+ def categories_address(self, n=500):
+ """Return path to Special:Categories."""
+ return self.family().categories_address(self.language(), n)
+
+ def deadendpages_address(self, n=500):
+ """Return path to Special:Deadendpages."""
+ return self.family().deadendpages_address(self.language(), n)
+
+ def ancientpages_address(self, n=500):
+ """Return path to Special:Ancientpages."""
+ return self.family().ancientpages_address(self.language(), n)
+
+ def lonelypages_address(self, n=500):
+ """Return path to Special:Lonelypages."""
+ return self.family().lonelypages_address(self.language(), n)
+
+ def unwatchedpages_address(self, n=500):
+ """Return path to Special:Unwatchedpages."""
+ return self.family().unwatchedpages_address(self.language(), n)
+
+ def uncategorizedcategories_address(self, n=500):
+ """Return path to
Special:Uncategorizedcategories."""
+ return self.family().uncategorizedcategories_address(self.language(), n)
+
+ def uncategorizedimages_address(self, n=500):
+ """Return path to Special:Uncategorizedimages."""
+ return self.family().uncategorizedimages_address(self.language(), n)
+
+ def uncategorizedpages_address(self, n=500):
+ """Return path to Special:Uncategorizedpages."""
+ return self.family().uncategorizedpages_address(self.language(), n)
+
+ def unusedcategories_address(self, n=500):
+ """Return path to Special:Unusedcategories."""
+ return self.family().unusedcategories_address(self.language(), n)
+
+ def withoutinterwiki_address(self, n=500):
+ """Return path to Special:Withoutinterwiki."""
+ return self.family().withoutinterwiki_address(self.language(), n)
+
+ def references_address(self, s):
+ """Return path to Special:Whatlinksere for page
's'."""
+ return self.family().references_address(self.language(), s)
+
+ def allmessages_address(self):
+ """Return path to Special:Allmessages."""
+ return self.family().allmessages_address(self.language())
+
+ def upload_address(self):
+ """Return path to Special:Upload."""
+ return self.family().upload_address(self.language())
+
+ def double_redirects_address(self, default_limit = True):
+ """Return path to Special:Doubleredirects."""
+ return self.family().double_redirects_address(self.language(), default_limit)
+
+ def broken_redirects_address(self, default_limit = True):
+ """Return path to Special:Brokenredirects."""
+ return self.family().broken_redirects_address(self.language(), default_limit)
+
+ def login_address(self):
+ """Return path to Special:Userlogin."""
+ return self.family().login_address(self.language())
+
+ def captcha_image_address(self, id):
+ """Return path to Special:Captcha for image
'id'."""
+ return self.family().captcha_image_address(self.language(), id)
+
+ def watchlist_address(self):
+ """Return path to Special:Watchlist editor."""
+ return self.family().watchlist_address(self.language())
+
+ def contribs_address(self, target, limit=500, offset=''):
+ """Return path to Special:Contributions for user
'target'."""
+ return self.family().contribs_address(self.language(),target,limit,offset)
+
+ def __hash__(self):
+ return hash(repr(self))
+
+ def version(self):
+ """Return MediaWiki version number as a string."""
+ return self.family().version(self.language())
+
+ def versionnumber(self):
+ """Return an int identifying MediaWiki version.
+
+ Currently this is implemented as returning the minor version
+ number; i.e., 'X' in version '1.X.Y'
+
+ """
+ return self.family().versionnumber(self.language())
+
+ def live_version(self):
+ """Return the 'real' version number found on
[[Special:Version]]
+
+ Return value is a tuple (int, int, str) of the major and minor
+ version numbers and any other text contained in the version.
+
+ """
+ if not hasattr(self, "_mw_version"):
+ versionpage = self.getUrl(self.get_address("Special:Version"))
+ htmldata = BeautifulSoup(versionpage, convertEntities="html")
+ versionstring = htmldata.findAll(text="MediaWiki"
+ )[1].parent.nextSibling
+ m = re.match(r"^: ([0-9]+)\.([0-9]+)(.*)$", str(versionstring))
+ if m:
+ self._mw_version = (int(m.group(1)), int(m.group(2)),
+ m.group(3))
+ else:
+ self._mw_version = self.family().version(self.language()).split(".")
+ return self._mw_version
+
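Callers can branch on the returned tuple; note, as a caveat visible in
the code above, that the fallback path returns the family file's version
string split on '.', whose components are strings rather than ints. A
hedged sketch that guards against both forms:

    major, minor, trailing = site.live_version()
    if (int(major), int(minor)) >= (1, 12):
        print "wiki runs MediaWiki 1.12 or later"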
+ def checkCharset(self, charset):
+ """Warn if charset returned by wiki doesn't match family
file."""
+ if not hasattr(self,'charset'):
+ self.charset = charset
+ assert self.charset.lower() == charset.lower(), \
+ "charset for %s changed from %s to %s" \
+ % (repr(self), self.charset, charset)
+ if self.encoding().lower() != charset.lower():
+ raise ValueError(
+"code2encodings has wrong charset for %s. It should be %s, but is %s"
+ % (repr(self), charset, self.encoding()))
+
+ def shared_image_repository(self):
+ """Return a tuple of image repositories used by this
site."""
+ return self.family().shared_image_repository(self.language())
+
+ def __cmp__(self, other):
+ """Perform equality and inequality tests on Site
objects."""
+ if not isinstance(other, Site):
+ return 1
+ if self.family() == other.family:
+ return cmp(self.language() ,other.lang)
+ return cmp(self.family().name, other.family.name)
+
+ def category_on_one_line(self):
+ """Return True if this site wants all category links on one
line."""
+ return self.language() in self.family().category_on_one_line
+
+ def interwiki_putfirst(self):
+ """Return list of language codes for ordering of interwiki
links."""
+ return self.family().interwiki_putfirst.get(self.language(), None)
+
+ def interwiki_putfirst_doubled(self, list_of_links):
+ # TODO: is this even needed? No family in the framework has this
+ # dictionary defined!
+ if self.family().interwiki_putfirst_doubled.has_key(self.language()):
+ if len(list_of_links) >= self.family().interwiki_putfirst_doubled[self.language()][0]:
+ list_of_links2 = []
+ for lang in list_of_links:
+ list_of_links2.append(lang.language())
+ links = []
+ for lang in self.family().interwiki_putfirst_doubled[self.language()][1]:
+ try:
+ links.append(list_of_links[list_of_links2.index(lang)])
+ except ValueError:
+ pass
+ return links
+ else:
+ return False
+ else:
+ return False
+
+ def getSite(self, code):
+ """Return Site object for language 'code' in this
Family."""
+ return getSite(code = code, fam = self.family(), user=self.user)
+
+ def namespace(self, num, all = False):
+ """Return string containing local name of namespace
'num'.
+
+ If optional argument 'all' is true, return a tuple of all recognized
+ values for this namespace.
+
+ """
+ return self.family().namespace(self.language(), num, all = all)
+
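Illustrative values only; the actual names come from the family files and
vary by language:

    site.namespace(14)            # e.g. u'Category'
    site.namespace(14, all=True)  # every recognized name for namespace 14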
+ def normalizeNamespace(self, value):
+ """Return canonical name for namespace 'value' in this
Site's language.
+
+ 'Value' should be a string or unicode.
+ If no match, return 'value' unmodified.
+
+ """
+ if not self.nocapitalize and value[0].islower():
+ value = value[0].upper() + value[1:]
+ return self.family().normalizeNamespace(self.language(), value)
+
+ def namespaces(self):
+ """Return list of canonical namespace names for this
Site."""
+
+ # n.b.: this does not return namespace numbers; to determine which
+ # numeric namespaces the framework recognizes for this Site (which
+ # may or may not actually exist on the wiki), use
+ # self.family().namespaces.keys()
+
+ if _namespaceCache.has_key(self):
+ return _namespaceCache[self]
+ else:
+ nslist = []
+ for n in self.family().namespaces:
+ try:
+ ns = self.family().namespace(self.language(), n)
+ except KeyError:
+ # No default namespace defined
+ continue
+ if ns is not None:
+ nslist.append(self.family().namespace(self.language(), n))
+ _namespaceCache[self] = nslist
+ return nslist
+
+ def validLanguageLinks(self):
+ """Return list of language codes that can be used in interwiki
links."""
+ return self._validlanguages
+
+ def disambcategory(self):
+ """Return Category in which disambig pages are
listed."""
+ import catlib
+ try:
+ return catlib.Category(self,
+ self.namespace(14) + ':' + self.family().disambcatname[self.language()])
+ except KeyError:
+ raise NoPage
+
+ def getToken(self, getalways = True, getagain = False, sysop = False):
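+ """Return edit token for this user; False if none is available."""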
+ index = self._userIndex(sysop)
+ if getagain or (getalways and self._token[index] is None):
+ output(u'Getting a token.')
+ self._load(sysop = sysop)
+ if self._token[index] is not None:
+ return self._token[index]
+ else:
+ return False
+
Modified: branches/rewrite/pywikibot/tests/api_tests.py
===================================================================
--- branches/rewrite/pywikibot/tests/api_tests.py 2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/tests/api_tests.py 2008-02-27 20:08:48 UTC (rev 5088)
@@ -25,7 +25,7 @@
self.assert_(all(len(item) == 2 for item in req.items()))
-class TestListGenerator(unittest.TestCase):
+class TestPageGenerator(unittest.TestCase):
def setUp(self):
self.gen = api.PageGenerator(site=mysite,
generator="links",