Revision: 5088
Author: russblau
Date: 2008-02-27 20:08:48 +0000 (Wed, 27 Feb 2008)

Log Message:
-----------
Committing page and site modules, related tests

Modified Paths:
--------------
    branches/rewrite/pywikibot/__init__.py
    branches/rewrite/pywikibot/config.py
    branches/rewrite/pywikibot/data/api.py
    branches/rewrite/pywikibot/family.py
    branches/rewrite/pywikibot/login.py
    branches/rewrite/pywikibot/tests/api_tests.py

Added Paths:
-----------
    branches/rewrite/pywikibot/exceptions.py
    branches/rewrite/pywikibot/page.py
    branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/__init__.py
===================================================================
--- branches/rewrite/pywikibot/__init__.py  2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/__init__.py  2008-02-27 20:08:48 UTC (rev 5088)
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+The initialization file for the Pywikibot framework.
+"""
+#
+# (C) Pywikipedia bot team, 2008
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id: $'
+
+
+from exceptions import *
+
+from page import Page, ImagePage, Category
+
+import config
+
+_sites = {}
+default_family = config.family
+default_code = config.mylang
+
+def Site(code=None, fam=None, user=None, interface=None):
+    """Return the specified Site object.
+
+    Returns a cached object if possible, otherwise instantiates a new one.
+
+    @param code: language code
+    @type code: string
+    @param fam: family name or object
+    @type fam: string or Family
+    @param user: bot user name to use on this site
+    @type user: unicode
+
+    """
+    if code == None:
+        code = default_code
+    if fam == None:
+        fam = default_family
+    if user == None:
+        try:
+            user = config.usernames[fam][code]
+        except KeyError:
+            user = None
+    if interface is None:
+        interface = config.site_interface
+    try:
+        exec "from site import %s as __Site" % interface
+    except ImportError:
+        raise ValueError("Invalid interface name '%s'" % interface)
+    key = '%s:%s:%s' % (fam, code, user)
+    if not _sites.has_key(key):
+        _sites[key] = __Site(code=code, fam=fam, user=user)
+    return _sites[key]
+
+getSite = Site # alias for backwards-compatibility
+
+# DEBUG
+
+def output(text):
+    print text
+
+def input(prompt, password=False):
+    if password:
+        import getpass
+        return getpass.getpass(prompt)
+    return raw_input(prompt)
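The new Site() factory caches one Site object per (family, code, user) combination. A minimal usage sketch, assuming a working user-config.py and that the APISite class in the newly added site.py (not shown here) can be instantiated for these wikis; "NoSuchSiteClass" is only a placeholder name:

    import pywikibot

    site = pywikibot.Site()            # family/code taken from config.family, config.mylang
    dewiki = pywikibot.Site(code="de", fam="wikipedia")
    # repeated calls with the same parameters return the cached object
    assert pywikibot.Site("de", "wikipedia") is dewiki

    # asking for an interface name that site.py does not define raises ValueError
    try:
        pywikibot.Site("en", "wikipedia", interface="NoSuchSiteClass")
    except ValueError, err:
        pywikibot.output(u"%s" % err)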
Modified: branches/rewrite/pywikibot/config.py =================================================================== --- branches/rewrite/pywikibot/config.py 2008-02-27 20:05:28 UTC (rev 5087) +++ branches/rewrite/pywikibot/config.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -26,7 +26,9 @@ family = 'wikipedia' # The language code of the site we're working on. mylang = 'language' - +# The default interface for communicating with the site +# currently the only defined interface is 'APISite', so don't change this! +site_interface = 'APISite' # The dictionary usernames should contain a username for each site where you # have a bot account. Please set your usernames by adding such lines to your # user-config.py: @@ -71,8 +73,49 @@
# Get the names of all known families, and initialize # with empty dictionaries -import wikipediatools as _wt -_base_dir = _wt.get_base_dir() +def _get_base_dir(): + """Return the directory in which user-specific information is stored. + + This is determined in the following order - + 1. If the script was called with a -dir: argument, use the directory + provided in this argument + 2. If the user has a PYWIKIBOT_DIR environment variable, use the value + of it + 3. If the script was started from a directory that contains a + user-config.py file, use this directory as the base + 4. If all else fails, use the directory from which this module was + loaded. + + """ + for arg in __sys.argv[1:]: + if arg.startswith("-dir:"): + base_dir = arg[5:] + __sys.argv.remove(arg) + break + else: + if os.environ.has_key("PYWIKIBOT_DIR"): + base_dir = os.environ["PYWIKIBOT_DIR"] + else: + if os.path.exists('user-config.py'): + base_dir = '.' + else: + try: + base_dir = os.path.split( + __sys.modules['wikipediatools'].__file__)[0] + except KeyError: + print sys.modules + base_dir = '.' + if not os.path.isabs(base_dir): + base_dir = os.path.normpath(os.path.join(os.getcwd(), base_dir)) + # make sure this path is valid and that it contains user-config file + if not os.path.isdir(base_dir): + raise RuntimeError("Directory '%s' does not exist." % base_dir) + if not os.path.exists(os.path.join(base_dir, "user-config.py")): + raise RuntimeError("No user-config.py found in directory '%s'." + % base_dir) + return base_dir + +_base_dir = _get_base_dir() _RfamilyFile = re.compile('(?P<name>.+)_family.py$') for _filename in os.listdir(os.path.join(_base_dir, 'families')): _m = _RfamilyFile.match(_filename) @@ -477,12 +520,13 @@ """Return an absolute path to a data file in a standard location.
Argument(s) are zero or more directory names, optionally followed by a - data file name. The return path is offset to config.base_dir. Any - directories in the path that do not already exist are created. + data file name. The return path is offset to the "data" subdirectory of + config.base_dir. Any directories in the path that do not already exist + are created.
""" import os - return makepath(os.path.join(base_dir, *filename)) + return makepath(os.path.join(os.path.join(base_dir, "data"), *filename))
def shortpath(path): """Return a file path relative to config.base_dir."""
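Two explicit ways to point the framework at the directory holding user-config.py, plus the new "data" subdirectory behaviour of datafilepath(); the login-data filename below is only an illustrative placeholder:

    # per run:          python somescript.py -dir:/home/bots/pywiki
    # or per session:   export PYWIKIBOT_DIR=/home/bots/pywiki
    # otherwise the current directory (if it holds a user-config.py) or the
    # directory this module was loaded from is used.
    from pywikibot import config

    # datafilepath() now resolves below <base_dir>/data and creates any missing
    # directories along the way:
    path = config.datafilepath('wikipedia-en-ExampleBot-login.data')
    print config.shortpath(path)    # e.g. data/wikipedia-en-ExampleBot-login.data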
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-02-27 20:05:28 UTC (rev 5087) +++ branches/rewrite/pywikibot/data/api.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -10,6 +10,7 @@ __version__ = '$Id: $'
from UserDict import DictMixin +from datetime import datetime, timedelta import http import simplejson as json import logging @@ -17,10 +18,10 @@ import traceback import time import urllib -# TODO - replace when Page object is written -from pywikibot.tests.dummy import TestPage as Page
+from pywikibot import login
+ lagpattern = re.compile(r"Waiting for [\d.]+: (?P<lag>\d+) seconds? lagged")
@@ -127,7 +128,7 @@ if self.params['format'] != 'json': raise TypeError("Query format '%s' cannot be parsed." % self.params['format']) - uri = self.site.script_path() + "api.php" + uri = self.site.scriptpath() + "/api.php" params = urllib.urlencode(self.params) while True: # TODO wait on errors @@ -143,6 +144,7 @@ rawdata = http.request(self.site, uri) except Exception, e: #TODO: what exceptions can occur here? logging.warning(traceback.format_exc()) + print uri, params self.wait() continue if rawdata.startswith(u"unknown_action"): @@ -257,9 +259,44 @@ del self.data
+class LoginManager(login.LoginManager):
+    """Supplies getCookie() method to use API interface."""
+    def getCookie(self, remember=True, captchaId=None, captchaAnswer=None):
+        """
+        Log in to the site.
+
+        Parameters are all ignored.
+
+        Returns cookie data if successful, None otherwise.
+        """
+        if hasattr(self, '_waituntil'):
+            if datetime.now() < self._waituntil:
+                time.sleep((self._waituntil - datetime.now()).seconds)
+        login_request = Request(site=self.site,
+                                action="login",
+                                lgname=self.username,
+                                lgpassword=self.password
+                                )
+        login_result = login_request.submit()
+        if u"login" not in login_result:
+            raise RuntimeError("API login response does not have 'login' key.")
+        if login_result['login']['result'] != u'Success':
+            self._waituntil = datetime.now() + timedelta(seconds=60)
+            return None
+
+        prefix = login_result['login']['cookieprefix']
+        cookies = []
+        for key in ('Token', 'UserID', 'UserName'):
+            cookies.append("%s%s=%s"
+                           % (prefix, key,
+                              login_result['login']['lg'+key.lower()]))
+        self.username = login_result['login']['lgusername']
+        return "\n".join(cookies)
+
+
 if __name__ == "__main__":
-    from pywikibot.tests.dummy import TestSite as Site, TestPage as Page
-    mysite = Site("en.wikipedia.org")
+    from pywikibot import Site
+    mysite = Site("en", "wikipedia")
     logging.getLogger().setLevel(logging.DEBUG)
     def _test():
         import doctest
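A small sketch of driving the module directly, following the same Request pattern the new LoginManager uses (keyword arguments become API parameters; whether the query succeeds depends on the APISite implementation and on the usual MediaWiki response layout):

    import logging
    from pywikibot import Site
    from pywikibot.data import api

    mysite = Site("en", "wikipedia")
    logging.getLogger().setLevel(logging.DEBUG)

    # Request is a dict-like container of API parameters; submit() retries until
    # it gets parseable JSON back from scriptpath() + "/api.php"
    req = api.Request(site=mysite, action="query", meta="siteinfo")
    data = req.submit()
    print data['query']['general']['sitename']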
Added: branches/rewrite/pywikibot/exceptions.py =================================================================== --- branches/rewrite/pywikibot/exceptions.py (rev 0) +++ branches/rewrite/pywikibot/exceptions.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +""" +Exception classes used throughout the framework. +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id: $' + + +# TODO: These are copied from wikipedia.py; not certain that all of them +# will be needed in the rewrite. + +class Error(Exception): + """Wikipedia error""" + +class NoUsername(Error): + """Username is not in user-config.py""" + +class NoPage(Error): + """Page does not exist""" + +class NoSuchSite(Error): + """Site does not exist""" + +class IsRedirectPage(Error): + """Page is a redirect page""" + +class IsNotRedirectPage(Error): + """Page is not a redirect page""" + +class LockedPage(Error): + """Page is locked""" + +class SectionError(Error): + """The section specified by # does not exist""" + +class PageNotSaved(Error): + """Saving the page has failed""" + +class EditConflict(PageNotSaved): + """There has been an edit conflict while uploading the page""" + +class SpamfilterError(PageNotSaved): + """Saving the page has failed because the MediaWiki spam filter detected a blacklisted URL.""" + def __init__(self, arg): + self.url = arg + self.args = arg, + +class ServerError(Error): + """Got unexpected server response""" + +class BadTitle(Error): + """Server responded with BadTitle.""" + +# UserBlocked exceptions should in general not be caught. If the bot has +# been blocked, the bot operator should address the reason for the block +# before continuing. +class UserBlocked(Error): + """Your username or IP has been blocked""" + +class PageNotFound(Error): + """Page not found in list""" + +class CaptchaError(Error): + """Captcha is asked and config.solve_captcha == False.""" +
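Because every class derives from Error, calling code can catch narrowly or broadly. A typical pattern around Page.get(), which the page.py added below documents as raising NoPage and IsRedirectPage (the page title is a placeholder):

    import pywikibot
    from pywikibot.exceptions import Error, NoPage, IsRedirectPage

    page = pywikibot.Page(pywikibot.Site(), u"Some example title")
    try:
        text = page.get()
    except NoPage:
        text = u""                    # page does not exist yet
    except IsRedirectPage, err:
        target_title = err.args[0]    # get() passes the redirect target as the argument
    except Error, err:
        pywikibot.output(u"unexpected framework error: %s" % err)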
Modified: branches/rewrite/pywikibot/family.py =================================================================== --- branches/rewrite/pywikibot/family.py 2008-02-27 20:05:28 UTC (rev 5087) +++ branches/rewrite/pywikibot/family.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -2963,6 +2963,16 @@ wiki""" return self.code2encoding(code),
+ # aliases + def encoding(self, code): + """Return the encoding for a specific language wiki""" + return self.code2encoding(code) + + def encodings(self, code): + """Return a list of historical encodings for a specific language + wiki""" + return self.code2encodings(code) + def __cmp__(self, otherfamily): try: return cmp(self.name, otherfamily.name) @@ -2972,6 +2982,9 @@ def __hash__(self): return hash(self.name)
+ def __repr__(self): + return 'Family("%s")' % self.name + def RversionTab(self, code): """Change this to some regular expression that shows the page we found is an existing page, in case the normal regexp does not work."""
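The aliases and __repr__ are small conveniences; roughly, assuming fam is a Family instance (for example obtained from a Site object's family() method):

    print repr(fam)             # -> Family("wikipedia")
    print fam.encoding('en')    # same as fam.code2encoding('en')
    print fam.encodings('ru')   # same as fam.code2encodings('ru'); may include historical encodings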
Modified: branches/rewrite/pywikibot/login.py =================================================================== --- branches/rewrite/pywikibot/login.py 2008-02-27 20:05:28 UTC (rev 5087) +++ branches/rewrite/pywikibot/login.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -46,7 +46,10 @@
import re import urllib2 -import wikipedia, config +import config +import pywikibot +from pywikibot import Page +from pywikibot.exceptions import *
# On some wikis you are only allowed to run a bot if there is a link to # the bot's user page in a specific list. @@ -65,17 +68,17 @@
class LoginManager: def __init__(self, password = None, sysop = False, site = None): - self.site = site or wikipedia.getSite() + self.site = site or pywikibot.Site() if sysop: try: - self.username = config.sysopnames[self.site.family.name][self.site.lang] + self.username = config.sysopnames[self.site.family().name][self.site.language()] except: - raise wikipedia.NoUsername(u'ERROR: Sysop username for %s:%s is undefined.\nIf you have a sysop account for that site, please add such a line to user-config.py:\n\nsysopnames['%s']['%s'] = 'myUsername'' % (self.site.family.name, self.site.lang, self.site.family.name, self.site.lang)) + raise NoUsername(u'ERROR: Sysop username for %s:%s is undefined.\nIf you have a sysop account for that site, please add such a line to user-config.py:\n\nsysopnames['%s']['%s'] = 'myUsername'' % (self.site.family.name, self.site.lang, self.site.family.name, self.site.lang)) else: try: - self.username = config.usernames[self.site.family.name][self.site.lang] + self.username = config.usernames[self.site.family().name][self.site.language()] except: - raise wikipedia.NoUsername(u'ERROR: Username for %s:%s is undefined.\nIf you have an account for that site, please add such a line to user-config.py:\n\nusernames['%s']['%s'] = 'myUsername'' % (self.site.family.name, self.site.lang, self.site.family.name, self.site.lang)) + raise NoUsername(u'ERROR: Username for %s:%s is undefined.\nIf you have an account for that site, please add such a line to user-config.py:\n\nusernames['%s']['%s'] = 'myUsername'' % (self.site.family.name, self.site.lang, self.site.family.name, self.site.lang)) self.password = password if getattr(config, 'password_file', ''): self.readPassword() @@ -85,9 +88,10 @@ Checks whether the bot is listed on a specific page to comply with the policy on the respective wiki. """ + return True # DEBUG if botList.has_key(self.site.family.name) and botList[self.site.family.name].has_key(self.site.language()): botListPageTitle = botList[self.site.family.name][self.site.language()] - botListPage = wikipedia.Page(self.site, botListPageTitle) + botListPage = Page(self.site, botListPageTitle) for linkedPage in botListPage.linkedPages(): if linkedPage.titleWithoutNamespace() == self.username: return True @@ -171,10 +175,11 @@
The argument data is the raw data, as returned by getCookie().
- Returns nothing.""" - filename = wikipedia.config.datafilepath('login-data', - '%s-%s-%s-login.data' - % (self.site.family.name, self.site.lang, self.username)) + """ + filename = config.datafilepath('%s-%s-%s-login.data' + % (self.site.family().name, + self.site.language(), + self.username)) f = open(filename, 'w') f.write(data) f.close() @@ -211,21 +216,21 @@ if not self.password: # As we don't want the password to appear on the screen, we set # password = True - self.password = wikipedia.input(u'Password for user %s on %s:' % (self.username, self.site), password = True) + self.password = pywikibot.input(u'Password for user %s on %s:' % (self.username, self.site), password = True)
- self.password = self.password.encode(self.site.encoding()) +# self.password = self.password.encode(self.site.encoding())
- wikipedia.output(u"Logging in to %s as %s" % (self.site, self.username)) + pywikibot.output(u"Logging in to %s as %s" % (self.site, self.username)) cookiedata = self.getCookie() if cookiedata: self.storecookiedata(cookiedata) - wikipedia.output(u"Should be logged in now") + pywikibot.output(u"Should be logged in now") # Show a warning according to the local bot policy if not self.botAllowed(): - wikipedia.output(u'*** Your username is not listed on [[%s]].\n*** Please make sure you are allowed to use the robot before actually using it!' % botList[self.site.family.name][self.site.lang]) + pywikibot.output(u'*** Your username is not listed on [[%s]].\n*** Please make sure you are allowed to use the robot before actually using it!' % botList[self.site.family.name][self.site.lang]) return True else: - wikipedia.output(u"Login failed. Wrong password or CAPTCHA answer?") + pywikibot.output(u"Login failed. Wrong password or CAPTCHA answer?") if retry: self.password = None return self.login(retry = True)
Added: branches/rewrite/pywikibot/page.py =================================================================== --- branches/rewrite/pywikibot/page.py (rev 0) +++ branches/rewrite/pywikibot/page.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -0,0 +1,1579 @@ +# -*- coding: utf-8 -*- +""" +Objects representing various types of MediaWiki pages. +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id: $' + +import pywikibot +from pywikibot.exceptions import * + +import htmlentitydefs +import logging +import re +import unicodedata +import urllib + +reNamespace = re.compile("^(.+?) *: *(.*)$") + + +class Page(object): + """Page: A MediaWiki page + + This object only implements internally methods that do not require + reading from or writing to the wiki. All other methods are delegated + to the Site object. + + Methods available: + - site: The wiki this page is in + - title: The name of the page, with various presentation options + - namespace: The namespace in which the page is found + - section: The section of the page (the part of the title after '#', if + any) + - isAutoTitle: Title can be translated using the autoFormat method + - autoFormat: Auto-format certain dates and other standard format page + titles + - isCategory: True if the page is a category + - isDisambig (*): True if the page is a disambiguation page + - isImage: True if the page is an image + - isRedirectPage (*): True if the page is a redirect, false otherwise + - getRedirectTarget (*): The page the page redirects to + - isTalkPage: True if the page is in any "talk" namespace + - toggleTalkPage: Return the talk page (if this is one, return the + non-talk page) + - get (*): The text of the page + - latestRevision (*): The page's current revision id + - userName: Last user to edit page + - isIpEdit: True if last editor was unregistered + - editTime: Timestamp of the last revision to the page + - previousRevision (*): The revision id of the previous version + - permalink (*): The url of the permalink of the current version + - getOldVersion(id) (*): The text of a previous version of the page + - getVersionHistory: Load the version history information from wiki + - getVersionHistoryTable: Create a wiki table from the history data + - fullVersionHistory: Return all past versions including wikitext + - contributingUsers: Return set of users who have edited page + - exists (*): True if the page actually exists, false otherwise + - isEmpty (*): True if the page has 4 characters or less content, not + counting interwiki and category links + - interwiki (*): The interwiki links from the page (list of Pages) + - categories (*): The categories the page is in (list of Pages) + - linkedPages (*): The normal pages linked from the page (list of + Pages) + - imagelinks (*): The pictures on the page (list of ImagePages) + - templates (*): All templates referenced on the page (list of Pages) + - templatesWithParams(*): All templates on the page, with list of + parameters + - isDisambig (*): True if the page is a disambiguation page + - getReferences: List of pages linking to the page + - canBeEdited (*): True if page is unprotected or user has edit + privileges + - botMayEdit (*): True if bot is allowed to edit page + - put(newtext): Saves the page + - put_async(newtext): Queues the page to be saved asynchronously + - move: Move the page to another title + - delete: Deletes the page (requires being logged in) + - protect: Protect or unprotect a page (requires sysop status) + - removeImage: 
Remove all instances of an image from this page + - replaceImage: Replace all instances of an image with another + - loadDeletedRevisions: Load all deleted versions of this page + - getDeletedRevision: Return a particular deleted revision + - markDeletedRevision: Mark a version to be undeleted, or not + - undelete: Undelete past version(s) of the page + + Deprecated methods (preserved for backwards-compatibility): + - urlname: Title, in a form suitable for a URL + - titleWithoutNamespace: Title, with the namespace part removed + - sectionFreeTitle: Title, without the section part + - aslink: Title in the form [[Title]] or [[lang:Title]] + - encoding: The encoding of the page + + (*) This loads the page if it has not been loaded before; permalink might + even reload it if it has been loaded before + + """ + def __init__(self, site, title, insite=None, + defaultNamespace=0): + """Parameters: + + @param site: the wikimedia Site on which the page resides + @param title: title of the page + @type title: unicode + @param insite: (optional) a wikimedia Site where this link was found + (to help decode interwiki links) + @param defaultNamespace: (optional) A namespace to use if the link + does not contain one + @type defaultNamespace: int + + """ + if site == None: + self._site = pywikibot.Site() + elif isinstance(site, basestring): + self._site = pywikibot.Site(site) + else: + self._site = site + + if not insite: insite = self._site + + # parse the title + # this can throw various exceptions if the title is invalid + link = Link(title, insite, defaultNamespace) + self._site = link.site + self._section = link.section + self._ns = link.namespace + self._title = link.title + # reassemble the canonical title from components + if self._section is not None: + self._title = self._title + "#" + self._section + if self._ns: + self._title = self.site().namespace(self._ns) + ":" + self._title + self._revisions = {} + + def site(self): + """Return the Site object for the wiki on which this Page resides.""" + return self._site + + def namespace(self): + """Return the number of the namespace of the page. + + Only recognizes those namespaces defined in family.py. + If not defined, it will return 0 (the main namespace). + + @return: int + + """ + return self._ns + + def title(self, underscore=False, savetitle=False, withNamespace=True, + withSection=True, asUrl=False, asLink=False, + allowInterwiki=True, forceInterwiki=False, textlink=False): + """Return the title of this Page, as a Unicode string. + + @param underscore: if true, replace all ' ' characters with '_' + @param savetitle: if true, try to quote all non-ASCII characters. 
+ (DEPRECATED: use asURL instead) + @param withNamespace: if false, omit the namespace prefix + @param withSection: if false, omit the section + @param asUrl: if true, quote title as if in an URL + @param asLink: if true, return the title in the form of a wikilink + @param allowInterwiki: (only used if asLink is true) if true, format + the link as an interwiki link if necessary + @param forceInterwiki: (only used if asLink is true) if true, always + format the link as an interwiki link + @param textlink: (only used if asLink is true) if true, place a ':' + before Category: and Image: links + + """ + title = self._title + if not withNamespace and self._ns != 0: + title = title.split(u':', 1)[1] + if not withSection and self._section: + title = title.split(u'#', 1)[0] + if underscore or asUrl: + title = title.replace(u' ', u'_') + if savetitle: + logging.debug( + u"Page.title(savetitle=...) is deprecated.") + if savetitle or asUrl: + encodedTitle = title.encode(self.site().encoding()) + title = urllib.quote(encodedTitle) + if asLink: + if forceInterwiki or ( + allowInterwiki and self.site() != pywikibot.Site()): + if self.site().family() != pywikibot.Site().family() \ + and self.site().family().name != self.site().language(): +# FIXME: Interwiki links shouldn't be fully urlencoded + return u'[[%s:%s:%s]]' % (self.site().family().name, + self.site().language(), + self.title(asUrl=True)) + else: + return u'[[%s:%s]]' % (self.site().language(), + self.title(asUrl=True)) + elif textlink and (self.isImage() or self.isCategory()): + return u'[[:%s]]' % title + else: + return u'[[%s]]' % title + return title + + def section(self, underscore = False): + """Return the name of the section this Page refers to. + + The section is the part of the title following a '#' character, if + any. If no section is present, return None. + + @param underscore: unused, but maintained for backwards compatibility + + """ + if underscore: + logging.debug( + u"Page.section(underscore=...) is deprecated.") + if self._section: + return self._section + else: + return None + + def __str__(self): + """Return a console representation of the pagelink.""" + return self.title(asLink=True, forceInterwiki=True) + + def __repr__(self): + """Return a more complete string representation.""" + return u"%s(%s)" % (self.__class__.__name__, self.title()) + + def __cmp__(self, other): + """Test for equality and inequality of Page objects""" + if not isinstance(other, Page): + # especially, return -1 if other is None + return -1 + if not self.site() == other.site(): + return cmp(self.site(), other.site()) + owntitle = self.title() + othertitle = other.title() + return cmp(owntitle, othertitle) + + def __hash__(self): + # Pseudo method that makes it possible to store Page objects as keys + # in hash-tables. This relies on the fact that the string + # representation of an instance can not change after the construction. + return hash(str(self)) + + def autoFormat(self): + """Return L{date.autoFormat} dictName and value, if any. + + Value can be a year, date, etc., and dictName is 'YearBC', + 'Year_December', or another dictionary name. Please note that two + entries may have exactly the same autoFormat, but be in two + different namespaces, as some sites have categories with the + same names. Regular titles return (None, None). 
+ + """ + if not hasattr(self, '_autoFormat'): + from pywikibot import date + self._autoFormat = date.getAutoFormat( + self.site().language(), + self.title(withNamespace=False) + ) + return self._autoFormat + + def isAutoTitle(self): + """Return True if title of this Page is in the autoFormat dictionary.""" + return self.autoFormat()[0] is not None + + def get(self, force=False, get_redirect=False, throttle=None, + sysop=False, nofollow_redirects=None, change_edit_time=None): + """Return the wiki-text of the page. + + This will retrieve the page from the server if it has not been + retrieved yet, or if force is True. This can raise the following + exceptions that should be caught by the calling code: + + - NoPage: The page does not exist + - IsRedirectPage: The page is a redirect. The argument of the + exception is the title of the page it redirects to. + - SectionError: The section does not exist on a page with a # + link + + @param force: reload all page attributes, including errors. + @param get_redirect: return the redirect text, do not follow the + redirect, do not raise an exception. + @param sysop: if the user has a sysop account, use it to retrieve + this page + @param throttle: DEPRECATED and unused + @param nofollow_redirects: DEPRECATED and unused + @param change_edit_time: DEPRECATED and unused + + """ + if throttle is not None: + logging.debug("Page.get(throttle) option is deprecated.") + if nofollow_redirects is not None: + logging.debug("Page.get(nofollow_redirects) option is deprecated.") + if change_edit_time is not None: + logging.debug("Page.get(change_edit_time) option is deprecated.") + if force: + # When forcing, we retry the page no matter what. Old exceptions + # do not apply any more. + for attr in ['_redirarg', '_getexception']: + if hasattr(self, attr): + delattr(self,attr) + else: + # Make sure we re-raise an exception we got on an earlier attempt + if hasattr(self, '_redirarg') and not get_redirect: + raise IsRedirectPage, self._redirarg + elif hasattr(self, '_getexception'): + raise self._getexception + if force or not hasattr(self, "_revid") \ + or not self._revid in self._revisions: + self.site().getrevisions(self, getText=True, ids=None, sysop=sysop) + # TODO: Exception handling for no-page, redirects, etc. + + return self._revisions[self._revid].text + + def getOldVersion(self, oldid, force=False, get_redirect=False, + throttle=None, sysop=False, nofollow_redirects=None, + change_edit_time=None): + """Return text of an old revision of this page; same options as get(). + + @param oldid: The revid of the revision desired. 
+ + """ + if throttle is not None: + logging.debug( + "Page.getOldVersion(throttle) option is deprecated.") + if nofollow_redirects is not None: + logging.debug( + "Page.getOldVersion(nofollow_redirects) option is deprecated.") + if change_edit_time is not None: + logging.debug( + "Page.getOldVersion(change_edit_time) option is deprecated.") + if force or not oldid in self._revisions: + self.site().getrevisions(self, getText=True, ids=oldid, + redirs=get_redirect, sysop=sysop) + return self._revisions[oldid].text + + def permalink(self): + """Return the permalink URL for current revision of this page.""" + return "%s://%s/%sindex.php?title=%s&oldid=%s" \ + % (self.site().protocol(), + self.site().hostname(), + self.site().script_path(), + self.title(asUrl=True), + self.latestRevision()) + + def latestRevision(self): + """Return the current revision id for this page.""" + if not hasattr(self, '_revid'): + self.site().getrevisions(self) + return self._revid + + def userName(self): + """Return name or IP address of last user to edit page.""" + return self._revisions[self.latestRevision()].user + + def isIpEdit(self): + """Return True if last editor was unregistered.""" + return self._revisions[self.latestRevision()].anon + + def editTime(self): + """Return timestamp (in MediaWiki format) of last revision to page.""" + return self._revisions[self.latestRevision()].timestamp + + def previousRevision(self): + """Return the revision id for the previous revision of this Page.""" + vh = self.getVersionHistory(revCount=2) + return vh[1][0] + + def exists(self): + """Return True if page exists on the wiki, even if it's a redirect. + + If the title includes a section, return False if this section isn't + found. + + """ + return self.site().page_exists(self) + + def isRedirectPage(self): + """Return True if this is a redirect, False if not or not existing.""" + return self.site().page_isredirect(self) + + def isEmpty(self): + """Return True if the page text has less than 4 characters. + + Character count ignores language links and category links. + Can raise the same exceptions as get(). + + """ + txt = self.get() + txt = pywikibot.removeLanguageLinks(txt, site = self.site()) + txt = pywikibot.removeCategoryLinks(txt, site = self.site()) + if len(txt) < 4: + return True + else: + return False + + def isTalkPage(self): + """Return True if this page is in any talk namespace.""" + ns = self.namespace() + return ns >= 0 and ns % 2 == 1 + + def toggleTalkPage(self): + """Return other member of the article-talk page pair for this Page. + + If self is a talk page, returns the associated content page; + otherwise, returns the associated talk page. + Returns None if self is a special page. + + """ + ns = self.namespace() + if ns < 0: # Special page + return None + if self.isTalkPage(): + if self.namespace() == 1: + return Page(self.site(), self.title(withNamespace=False)) + else: + return Page(self.site(), + self.site().namespace(ns - 1) + ':' + + self.title(withNamespace=False)) + else: + return Page(self.site(), + self.site().namespace(ns + 1) + ':' + + self.title(withNamespace=False)) + + def isCategory(self): + """Return True if the page is a Category, False otherwise.""" + return self.namespace() == 14 + + def isImage(self): + """Return True if this is an image description page, False otherwise.""" + return self.namespace() == 6 + + def isDisambig(self): + """Return True if this is a disambiguation page, False otherwise. 
+ + Relies on the presence of specific templates, identified in the Family + file, to identify disambiguation pages. + + """ + if not hasattr(self, '_isDisambig'): + locdis = self.site().family.disambig( self.site().lang ) + for template in self.templates(): + tn = template.title(withNamespace=False) + if tn in locdis: + _isDisambig = True + break + else: + _isDisambig = False + return _isDisambig + + def getReferences(self, follow_redirects=True, withTemplateInclusion=True, + onlyTemplateInclusion=False, redirectsOnly=False): + """Yield all pages that link to the page. + + If you need a full list of referring pages, use + C{pages = list(s.getReferences())} + + @param follow_redirects: if True, also return pages that link to a + redirect pointing to the page. + @param withTemplateInclusion: if True, also return pages where self + is used as a template. + @param onlyTemplateInclusion: if True, only return pages where self + is used as a template. + @param redirectsOnly: if True, only return redirects to self. + + """ + # N.B.: this method intentionally overlaps with backlinks() and + # embeddedin(). Depending on the interface, it may be more efficient + # to implement those methods in the site interface and then combine + # the results for this method, or to implement this method and then + # split up the results for the others. + return self.site().getreferences(self, follow_redirects, + withTemplateInclusion, + onlyTemplateInclusion, + redirectsOnly) + + def backlinks(self, followRedirects=True, filterRedirects=None): + """Yield all pages that contain ordinary wikilinks to this page. + + @param followRedirects: if True, also return pages that link to a + redirect pointing to the page. + @param filterRedirects: if True, only return redirects; if False, + omit redirects; if None, do not filter + + """ + return self.site().getbacklinks(self, followRedirects, filterRedirects) + + def embeddedin(self): + """Yield all pages that embed this page as a template.""" + return self.site().getembeddedin(self) + + def canBeEdited(self): + """Return bool indicating whether this page can be edited. + + This returns True if and only if: + - page is unprotected, and bot has an account for this site, or + - page is protected, and bot has a sysop account for this site. + + """ + return self.site().page_can_be_edited(self) + + def botMayEdit(self): + """Return True if this page allows bots to edit it. + + This will be True if the page doesn't contain {{bots}} or + {{nobots}}, or it contains them and the active bot is allowed to + edit this page. (This method is only useful on those sites that + recognize the bot-exclusion protocol; on other sites, it will always + return True.) + + The framework enforces this restriction by default. It is possible + to override this by setting ignore_bot_templates=True in + user_config.py, or using page.put(force=True). + + """ # TODO: move this to Site object? 
+ if config.ignore_bot_templates: #Check the "master ignore switch" + return True + try: + templates = self.templatesWithParams(); + except (NoPage, IsRedirectPage, SectionError): + return True + for template in templates: + title = template[0].title(withNamespace=False) + if title == 'Nobots': + return False + elif title == 'Bots': + if len(template[1]) == 0: + return True + else: + (ttype, bots) = template[1][0].split('=', 1) + bots = bots.split(',') + if ttype == 'allow': + if 'all' in bots or username in bots: + return True + else: + return False + if ttype == 'deny': + if 'all' in bots or username in bots: + return False + else: + return True + # no restricting template found + return True + + + def put(self, newtext, comment=None, watchArticle=None, minorEdit=True, + force=False): + """Save the page with the contents of the first argument as the text. + + @param newtext: The complete text of the revised page. + @type newtext: unicode + @param comment: The edit summary for the modification (optional, + but most wikis strongly encourage its use) + @type comment: unicode + @param watchArticle: if True, add or if False, remove this Page + to/from bot user's watchlist; if None, leave watchlist status + unchanged + @type watchArticle: bool or None + @param minorEdit: if True, mark this edit as minor + @type minorEdit: bool + @param force: if True, ignore botMayEdit() setting + @type force: bool + + """ + return self.site().put(self, newtext, comment, watchArticle, + minorEdit, force) + + def put_async(self, newtext, + comment=None, watchArticle=None, minorEdit=True, force=False, + callback=None): + """Put page on queue to be saved to wiki asynchronously. + + Asynchronous version of put (takes the same arguments), which places + pages on a queue to be saved by a daemon thread. All arguments are + the same as for .put(), except: + + @param callback: a callable object that will be called after the + page put operation. This object must take two arguments: (1) a + Page object, and (2) an exception instance, which will be None + if the page was saved successfully. The callback is intended for + use by bots that need to keep track of which saves were + successful. + + """ + return self.site().put(self, newtext, comment, watchArticle, + minorEdit, force, callback, async=True) + + def linkedPages(self): + """Iterate Pages that this Page links to. + + Only returns pages from "normal" internal links. Image and category + links are omitted unless prefixed with ":"; embedded templates are + omitted (but links within them are returned); all interwiki and + external links are omitted. + + @return: a generator that yields Page objects. + + """ + return self.site().getlinks(self) + + def interwiki(self): + """Iterate interwiki links in the page text. + + @return: a generator that yields Link objects. + + """ + return self.site().getinterwiki(self) + + def langlinks(self): + """Iterate all interlanguage links on this page. + + Note that the links yielded by this method will be a subset of + the results of self.interwiki(). + + @return: a generator that yields Link objects. + + """ + return self.site().getlanglinks(self) + + def imagelinks(self, followRedirects=False, loose=None): + """Iterate ImagePage objects for images displayed on this Page. + + @param followRedirects: if an image link redirects to another page, + yield the redirect target instead of the original link + @param loose: DEPRECATED and ignored + @return: a generator that yields ImagePage objects. 
+ + """ + if loose is not None: + logging.debug( + u"Page.imagelinks(loose) option is deprecated.") + return self.site().getimages(followRedirects) + + def templates(self): + """Iterate Page objects for templates used on this Page. + + Template parameters are ignored. This method only returns embedded + templates, not template pages that happen to be referenced through + a normal link. + + """ + return self.site().gettemplates(self) + + def templatesWithParams(self): + """Iterate templates used on this Page. + + @return: a generator that yields a tuple for each use of a template + in the page, with the template Page as the first entry and a list of + parameters as the second entry. + + """ + return self.site().templates_with_params(self) + + def categories(self, nofollow_redirects=None, withSortKey=False): + """Iterate categories that the article is in. + + @param nofollow_redirects: DEPRECATED and ignored + @param withSortKey: if True, include the sort key in each Category. + @return: a generator that yields Category objects. + + """ + # follow_redirects makes no sense here because category membership + # doesn't follow redirects + if nofollow_redirects is not None: + logging.debug( + u"Page.categories(nofollow_redirects) option is deprecated.") + return self.site().categories(withSortKey=withSortKey) + + def extlinks(self): + """Iterate all external URLs (not interwiki links) from this page. + + @return: a generator that yields unicode objects containing URLs. + + """ + return self.site().getextlinks(self) + + def getRedirectTarget(self): + """Return a Page object for the target this Page redirects to. + + If this page is not a redirect page, will raise an IsNotRedirectPage + exception. This method also can raise a NoPage exception. + + """ + return self.site().follow_redirect(self) + + def getVersionHistory(self, forceReload=False, reverseOrder=False, + getAll=False, revCount=500): + """Load the version history page and return history information. + + Return value is a list of tuples, where each tuple represents one + edit and is built of revision id, edit date/time, user name, and + edit summary. Starts with the most current revision, unless + reverseOrder is True. Defaults to getting the first revCount edits, + unless getAll is True. + + """ + if getAll: + limit = None + else: + limit = revCount + return self.site().getrevisions(self, withText=False, + older=reverseOrder, limit=limit) + + def getVersionHistoryTable(self, forceReload=False, reverseOrder=False, + getAll=False, revCount=500): + """Return the version history as a wiki table.""" + result = '{| border="1"\n' + result += '! oldid || date/time || username || edit summary\n' + for oldid, time, username, summary \ + in self.getVersionHistory(forceReload=forceReload, + reverseOrder=reverseOrder, + getAll=getAll, revCount=revCount): + result += '|----\n' + result += '| %s || %s || %s || <nowiki>%s</nowiki>\n'\ + % (oldid, time, username, summary) + result += '|}\n' + return result + + def fullVersionHistory(self): + """Iterate all previous versions including wikitext. 
+ + @return: A generator that yields tuples consisting of revision ID, + edit date/time, user name and content + """ + return self.site().getrevisions(self, withText=True, + older=reverseOrder, limit=None) + + def contributingUsers(self): + """Return a set of usernames (or IPs) of users who edited this page.""" + edits = self.getVersionHistory() + users = set([edit[2] for edit in edits]) + return users + + def move(self, newtitle, reason=None, movetalkpage=True, sysop=False, + throttle=None, deleteAndMove=False, safe=True): + """Move this page to a new title. + + @param newtitle: The new page title. + @param reason: The edit summary for the move. + @param movetalkpage: If true, move this page's talk page (if it exists) + @param sysop: Try to move using sysop account, if available + @param throttle: DEPRECATED + @param deleteAndMove: if move succeeds, delete the old page + (requires sysop privileges) + @param safe: If false, attempt to delete existing page at newtitle + (if there is one) and then move this page to that title + + """ + if throttle is not None: + logging.debug( + u"Page.move: throttle option is deprecated.") + if reason is None: + pywikibot.output(u'Moving %s to [[%s]].' + % (self.title(asLink=True), newtitle)) + reason = pywikibot.input(u'Please enter a reason for the move:') + return self.site().move(self, newtitle, reason, + movetalkpage=movetalkpage, sysop=sysop, + deleteAndMove=deleteAndMove, safe=safe) + + def delete(self, reason=None, prompt=True, throttle=None, mark=False): + """Deletes the page from the wiki. Requires administrator status. + + @param reason: The edit summary for the deletion. + @param prompt: If true, prompt user for confirmation before deleting. + @param mark: if true, and user does not have sysop rights, place a + speedy-deletion request on the page instead. + + """ + if throttle is not None: + logging.debug( + u"Page.delete: throttle option is deprecated.") + if reason is None: + pywikibot.output(u'Deleting %s.' % (self.title(asLink=True))) + reason = pywikibot.input(u'Please enter a reason for the deletion:') + answer = u'y' + if prompt and not hasattr(self.site(), '_noDeletePrompt'): + answer = pywikibot.inputChoice(u'Do you want to delete %s?' + % self.title(asLink = True, forceInterwiki = True), + ['Yes', 'No', 'All'], + ['Y', 'N', 'A'], + 'N') + if answer in ['a', 'A']: + answer = 'y' + self.site()._noDeletePrompt = True + if answer in ['y', 'Y']: + return self.site().delete(self, reason, mark=mark) + + def loadDeletedRevisions(self): + """Retrieve all deleted revisions for this Page from Special/Undelete. + + Stores all revisions' timestamps, dates, editors and comments in + self._deletedRevs attribute. + + @return: list of timestamps (which can be used to retrieve revisions + later on). + + """ + return self.site().loadDeletedRevisions(self) + + def getDeletedRevision(self, timestamp, retrieveText=False): + """Return a particular deleted revision by timestamp. + + @return: a list of [date, editor, comment, text, restoration + marker]. text will be None, unless retrieveText is True (or has + been retrieved earlier). If timestamp is not found, returns + None. + + """ + return self.site().getDeletedRevision(self, timestamp, + getText=retrieveText) + + def markDeletedRevision(self, timestamp, undelete=True): + """Mark the revision identified by timestamp for undeletion. + + @param undelete: if False, mark the revision to remain deleted. 
+ + """ + if self._deletedRevs == None: + self.loadDeletedRevisions() + if not self._deletedRevs.has_key(timestamp): + #TODO: Throw an exception? + return None + self._deletedRevs[timestamp][4] = undelete + self._deletedRevsModified = True + + def undelete(self, comment=None, throttle=None): + """Undelete revisions based on the markers set by previous calls. + + If no calls have been made since loadDeletedRevisions(), everything + will be restored. + + Simplest case:: + Page(...).undelete('This will restore all revisions') + + More complex:: + pg = Page(...) + revs = pg.loadDeletedRevsions() + for rev in revs: + if ... #decide whether to undelete a revision + pg.markDeletedRevision(rev) #mark for undeletion + pg.undelete('This will restore only selected revisions.') + + @param comment: The undeletion edit summary. + @param throttle: DEPRECATED + + """ + if throttle is not None: + logging.debug( + u"Page.undelete: throttle option is deprecated.") + if comment is None: + pywikibot.output(u'Preparing to undelete %s.' + % (self.title(asLink=True))) + comment = pywikibot.input( + u'Please enter a reason for the undeletion:') + return self.site().undelete(self, comment) + + def protect(self, edit='sysop', move='sysop', create='sysop', + unprotect=False, reason=None, prompt=True, throttle=None): + """(Un)protect a wiki page. Requires administrator status. + + Valid protection levels (in MediaWiki 1.12) are '' (equivalent to + 'none'), 'autoconfirmed', and 'sysop'. + + @param edit: Level of edit protection + @param move: Level of move protection + @param create: Level of create protection + @param unprotect: If true, unprotect the page (equivalent to setting + all protection levels to '') + @param reason: Edit summary. + @param prompt: If true, ask user for confirmation. + @param throttle: DEPRECATED + + """ + if throttle is not None: + logging.debug( + u"Page.protect: throttle option is deprecated.") + if reason is None: + if unprotect: + un = u'un' + else: + un = u'' + pywikibot.output(u'Preparing to %sprotect %s.' + % (un, self.title(asLink=True))) + reason = pywikibot.input(u'Please enter a reason for the action:') + if unprotect: + edit = move = create = "" + answer = 'y' + if prompt and not hasattr(self.site(), '_noProtectPrompt'): + answer = pywikibot.inputChoice( + u'Do you want to change the protection level of %s?' + % self.title(asLink=True, forceInterwiki = True), + ['Yes', 'No', 'All'], ['Y', 'N', 'A'], 'N') + if answer in ['a', 'A']: + answer = 'y' + self.site()._noProtectPrompt = True + if answer in ['y', 'Y']: + return self.site().protect(self, edit, move, create, reason) + +######## DEPRECATED METHODS ######## + + def encoding(self): + """Return the character encoding used on this Page's wiki Site. + + DEPRECATED: use Site.encoding() instead + + """ + logging.debug(u"Page.encoding() is deprecated; use Site.encoding().") + return self.site().encoding() + + def titleWithoutNamespace(self, underscore=False): + """Return title of Page without namespace and without section. + + DEPRECATED: use self.title(withNamespace=False) instead. + + """ + logging.debug( + u"Page.titleWithoutNamespace() method is deprecated.") + return self.title(underscore=underscore, withNamespace=False, + withSection=False) + + def sectionFreeTitle(self, underscore=False): + """Return the title of this Page, without the section (if any). + + DEPRECATED: use self.title(withSection=False) instead. 
+ + """ + logging.debug( + u"Page.sectionFreeTitle() method is deprecated.") + return self.title(underscore=underscore, withSection=False) + + def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False): + """Return a string representation in the form of a wikilink. + + DEPRECATED: use self.title(asLink=True) instead. + + """ + logging.debug(u"Page.aslink() method is deprecated.") + return self.title(asLink=True, forceInterwiki=forceInterwiki, + allowInterwiki=not noInterwiki, textlink=textlink) + + def urlname(self): + """Return the Page title encoded for use in an URL. + + DEPRECATED: use self.title(asUrl=True) instead. + + """ + logging.debug(u"Page.urlname() method is deprecated.") + return self.title(asUrl=True) + +####### DISABLED METHODS (warnings provided) ###### + # these methods are easily replaced by editing the page's text using + # textlib methods and then using put() on the result. + + def removeImage(self, image, put=False, summary=None, safe=True): + """Old method to remove all instances of an image from page.""" + logging.warning(u"Page.removeImage() is no longer supported.") + + def replaceImage(self, image, replacement=None, put=False, summary=None, + safe=True): + """Old method to replace all instances of an image with another.""" + logging.warning(u"Page.replaceImage() is no longer supported.") + + +class ImagePage(Page): + """A subclass of Page representing an image descriptor wiki page. + + Supports the same interface as Page, with the following added methods: + + getImagePageHtml : Download image page and return raw HTML text. + fileURL : Return the URL for the image described on this + page. + fileIsOnCommons : Return True if image stored on Wikimedia + Commons. + fileIsShared : Return True if image stored on Wikitravel + shared repository. + getFileMd5Sum : Return image file's MD5 checksum. + getFileVersionHistory : Return the image file's version history. + getFileVersionHistoryTable: Return the version history in the form of a + wiki table. + usingPages : Iterate Pages on which the image is displayed. + + """ + def __init__(self, site, title, insite = None): + Page.__init__(self, site, title, insite, defaultNamespace=6) + if self.namespace() != 6: + raise ValueError(u"'%s' is not in the image namespace!" % title) + + def getImagePageHtml(self): + """ + Download the image page, and return the HTML, as a unicode string. + + Caches the HTML code, so that if you run this method twice on the + same ImagePage object, the page will only be downloaded once. + """ + if not hasattr(self, '_imagePageHtml'): + from pywikibot.data import http + path = "%s/index.php?title=%s" \ + % (self.site().scriptpath(), self.title(asUrl=True)) + self._imagePageHtml = http.request(self.site(), path) + return self._imagePageHtml + + def fileUrl(self): + """Return the URL for the image described on this page.""" + # TODO add scaling option? 
+ if not hasattr(self, '_imageinfo'): + self._imageinfo = self.site().getimageinfo(self) + return self._imageinfo['url'] + + def fileIsOnCommons(self): + """Return True if the image is stored on Wikimedia Commons""" + return self.fileUrl().startswith( + 'http://upload.wikimedia.org/wikipedia/commons/') + + def fileIsShared(self): + """Return True if image is stored on any known shared repository.""" + # as of now, the only known repositories are commons and wikitravel + if 'wikitravel_shared' in self.site().shared_image_repository(): + return self.fileUrl().startswith( + u'http://wikitravel.org/upload/shared/') + return self.fileIsOnCommons() + + def getFileMd5Sum(self): + """Return image file's MD5 checksum.""" + logging.debug( + "ImagePage.getFileMd5Sum() is deprecated; use getFileSHA1Sum().") +# FIXME: MD5 might be performed on incomplete file due to server disconnection +# (see bug #1795683). + import md5, urllib + f = urllib.urlopen(self.fileUrl()) + # TODO: check whether this needs a User-Agent header added + md5Checksum = md5.new(f.read()).hexdigest() + f.close() + return md5Checksum + + def getFileSHA1Sum(self): + """Return image file's SHA1 checksum.""" + if not hasattr(self, '_imageinfo'): + self._imageinfo = self.site().getimageinfo(self) + return self._imageinfo['sha1'] + + def getFileVersionHistory(self): + """Return the image file's version history. + + @return: An iterator yielding tuples containing (timestamp, + username, resolution, filesize, comment). + + """ + #TODO; return value may need to change + return self.site().getimageinfo(self, history=True) + + def getFileVersionHistoryTable(self): + """Return the version history in the form of a wiki table.""" + lines = [] + #TODO: if getFileVersionHistory changes, make sure this follows it + for (datetime, username, resolution, size, comment) \ + in self.getFileVersionHistory(): + lines.append('| %s || %s || %s || %s || <nowiki>%s</nowiki>' \ + % (datetime, username, resolution, size, comment)) + return u'{| border="1"\n! date/time || username || resolution || size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}' + + def usingPages(self): + """Yield Pages on which the image is displayed.""" + return self.site().getimageusage(self) + +class Category(Page): + """A page in the Category: namespace""" + + def __init__(self, site, title, insite=None, sortKey=None): + """All parameters are the same as for Page() constructor, except: + + @param sortKey: DEPRECATED (use .aslink() method instead) + + """ + Page.__init__(self, site=site, title=title, insite=insite, + defaultNamespace=14) + if sortKey is not None: + logging.debug( + "The 'sortKey' option in Category constructor is deprecated.") + if self.namespace() != 14: + raise ValueError(u"'%s' is not in the category namespace!" + % title) + + def aslink(self, sortKey=u'', forceInterwiki=None, textlink=None, + noInterwiki=None): + """Return a link to place a page in this Category. + + Use this only to generate a "true" category link, not for interwikis + or text links to category pages. + + Parameters are deprecated and preserved for backwards-compatibility, + except: + + @param sortKey: The sort key for the article to be placed in this + Category; if omitted, default sort key is used. 
+ @type sortKey: (optional) unicode + + """ + if forceInterwiki is not None \ + or textlink is not None or noInterwiki is not None: + logging.debug("All arguments to Category.aslink() are deprecated.") + if sortKey: + titleWithSortKey = '%s|%s' % (self.title(withSection=False), + self.sortKey) + else: + titleWithSortKey = self.title(withSection=False) + return '[[%s]]' % titleWithSortKey + + def subcategories(self, recurse=False): + """Iterate all subcategories of the current category. + + @param recurse: if not False or 0, also iterate subcategories of + subcategories. If an int, limit recursion to this number of + levels. (Example: recurse=1 will iterate direct subcats and + first-level sub-sub-cats, but no deeper.) + @type recurse: int or bool + + """ + if not isinstance(recurse, bool) and recurse: + recurse = recurse - 1 + if not hasattr(self, "_subcats"): + self._subcats = [] + for member in self.site().categorymembers(self, namespaces=[14]): + subcat = Category(self.site(), member.title()) + self.subcats.append(subcat) + yield subcat + if recurse: + for item in subcat.subcategories(recurse): + yield item + else: + for subcat in self._subcats: + yield subcat + if recurse: + for item in subcat.subcategories(recurse): + yield item + + def articles(self, recurse=False, startFrom=None): + """ + Yields all articles in the current category. + + @param recurse: if not False or 0, also iterate articles in + subcategories. If an int, limit recursion to this number of + levels. (Example: recurse=1 will iterate articles in first-level + subcats, but no deeper.) + @type recurse: int or bool + + """ + namespaces = self.site().namespaces() + namespaces.remove(14) + for member in self.site().categorymembers(self, namespaces=namespaces): + yield member + if recurse: + if not isinstance(recurse, bool) and recurse: + recurse = recurse - 1 + for subcat in self.subcategories(): + for article in subcat.articles(recurse): + yield article + + def isEmptyCategory(self): + """Return True if category has no members (including subcategories).""" + for member in self.site().categorymembers(self, limit=1): + return False + return True + + def copyTo(self, catname): + """ + Copy text of category page to a new page. Does not move contents. + + @param catname: New category title (without namespace) + @return: True if copying was successful, False if target page + already existed. + + """ + # This seems far too specialized to be in the top-level framework + catname = self.site().category_namespace() + ':' + catname + targetCat = Category(self.site(), catname) + if targetCat.exists(): + pywikibot.output('Target page %s already exists!' + % targetCat.title()) + return False + else: + pywikibot.output('Moving text from %s to %s.' + % (self.title(), targetCat.title())) + authors = ', '.join(self.contributingUsers()) + creationSummary = pywikibot.translate( + self.site(), msg_created_for_renaming + ) % (self.title(), authors) + targetCat.put(self.get(), creationSummary) + return True + + def copyAndKeep(self, catname, cfdTemplates): + """Copy partial category page text (not contents) to a new title. + + Like copyTo above, except this removes a list of templates (like + deletion templates) that appear in the old category text. It also + removes all text between the two HTML comments BEGIN CFD TEMPLATE + and END CFD TEMPLATE. (This is to deal with CFD templates that are + substituted.) + + Returns true if copying was successful, false if target page already + existed. 
+ + @param catname: New category title (without namespace) + @param cfdTemplates: A list (or iterator) of templates to be removed + from the page text + @return: True if copying was successful, False if target page + already existed. + + """ + # I don't see why we need this as part of the framework either + catname = self.site().category_namespace() + ':' + catname + targetCat = Category(self.site(), catname) + if targetCat.exists(): + pywikibot.output('Target page %s already exists!' + % targetCat.title()) + return False + else: + pywikibot.output('Moving text from %s to %s.' + % (self.title(), targetCat.title())) + authors = ', '.join(self.contributingUsers()) + creationSummary = pywikibot.translate( + self.site(), msg_created_for_renaming + ) % (self.title(), authors) + newtext = self.get() + for regexName in cfdTemplates: + matchcfd = re.compile(r"{{%s.*?}}" % regexName, re.IGNORECASE) + newtext = matchcfd.sub('',newtext) + matchcomment = re.compile( + r"<!--BEGIN CFD TEMPLATE-->.*?<!--END CFD TEMPLATE-->", + re.IGNORECASE | re.MULTILINE | re.DOTALL) + newtext = matchcomment.sub('', newtext) + pos = 0 + while (newtext[pos:pos+1] == "\n"): + pos = pos + 1 + newtext = newtext[pos:] + targetCat.put(newtext, creationSummary) + return True + +#### DEPRECATED METHODS #### + def subcategoriesList(self, recurse=False): + """DEPRECATED: Equivalent to list(self.subcategories(...))""" + logging.debug("Category.subcategoriesList() method is deprecated.") + return sorted(list(set(self.subcategories(recurse)))) + + def articlesList(self, recurse=False): + """DEPRECATED: equivalent to list(self.articles(...))""" + logging.debug("Category.articlesList() method is deprecated.") + return sorted(list(set(self.articles(recurse)))) + + def supercategories(self): + """DEPRECATED: equivalent to self.categories()""" + logging.debug("Category.supercategories() method is deprecated.") + return self.categories() + + def supercategoriesList(self): + """DEPRECATED: equivalent to list(self.categories(...))""" + logging.debug("Category.articlesList() method is deprecated.") + return sorted(list(set(self.categories()))) + + +class Revision(object): + """A structure holding information about a single revision of a Page.""" + def __init__(self, revid, timestamp, user, anon=False, comment=u"", + text=None, minor=False): + """All parameters correspond to object attributes (e.g., revid + parameter is stored as self.revid) + + @param revid: Revision id number + @type revid: int + @param text: Revision wikitext. 
+ @type text: unicode, or None if text not yet retrieved + @param timestamp: Revision time stamp (in MediaWiki text format) + @type timestamp: unicode + @param user: user who edited this revision + @type user: unicode + @param anon: user is unregistered + @type anon: bool + @param comment: edit comment text + @type comment: unicode + @param minor: edit flagged as minor + @type minor: bool + + """ + self.revid = revid + self.text = text + self.timestamp = timestamp + self.user = user + self.anon = anon + self.comment = comment + self.minor = minor + + +class Link(object): + """A Mediawiki link (local or interwiki) + + Has the following attributes: + + - site: The Site object for the wiki linked to + - namespace: The namespace of the page linked to (int) + - title: The title of the page linked to (unicode); does not include + namespace or section + - section: The section of the page linked to (unicode or None); this + contains any text following a '#' character in the title + - anchor: The anchor text (unicode or None); this contains any text + following a '|' character inside the link + + """ + illegal_titles_pattern = re.compile( + # Matching titles will be held as illegal. + u'''[^ %!"$&'()*,\-.\/0-9:;=?@A-Z\\^_`a-z~\x80-\xFF+]''' + # URL percent encoding sequences interfere with the ability + # to round-trip titles -- you can't link to them consistently. + u'|%[0-9A-Fa-f]{2}' + # XML/HTML character references produce similar issues. + u'|&[A-Za-z0-9\x80-\xff]+;' + u'|&#[0-9]+;' + u'|&#x[0-9A-Fa-f]+;' + ) + namespace_pattern = re.compile("^(.+?)_*:_*(.*)$") + + def __init__(self, text, source=None, defaultNamespace=0): + """Parse text into a Link object. + + @param text: the link text (everything appearing between [[ and ]] + on a wiki page) + @type text: unicode + @param source: the Site on which the link was found (not necessarily + the site to which the link refers) + @type source: Site + @param defaultNamespace: a namespace to use if the link does not + contain one (defaults to 0) + @type defaultNamespace: int + + """ + # First remove the anchor, which is stored unchanged, if there is one + if u"|" in text: + text, self.anchor = text.split(u"|", 1) + else: + self.anchor = None + + if source is None: + source = pywikibot.Site() + self.source = self.site = source + + # Clean up the name, it can come from anywhere. + # Convert HTML entities to unicode + t = html2unicode(text) + + # Convert URL-encoded characters to unicode + t = url2unicode(t, site=self.site) + + # Normalize unicode string to a NFC (composed) format to allow proper + # string comparisons. According to + # http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/no... + # the mediawiki code normalizes everything to NFC, not NFKC (which + # might result in information loss). + t = unicodedata.normalize('NFC', t) + + # This code was adapted from Title.php : secureAndSplit() + # + if u'\ufffd' in t: + raise Error("Title contains illegal char (\uFFFD)") + self.namespace = defaultNamespace + + # Replace underscores by spaces + t = t.replace(u'_', u' ') + # replace multiple spaces and underscores with a single space + while u" " in t: t = t.replace(u" ", u" ") + # Strip spaces at both ends + t = t.strip() + # Remove left-to-right and right-to-left markers. 
+ t = t.replace(u'\u200e', u'').replace(u'\u200f', u'') + + # Initial colon indicates main namespace rather than specified default + if t.startswith(u':'): + self.namespace = 0 + # remove the colon but continue processing + # remove any subsequent whitespace + t = t[1:].strip() + + # Namespace or interwiki prefix + firstPass = True + while True: + fam = self.site.family + + m = Link.namespace_pattern.match(t) + if m: + pre = m.group(1).lower() + ns = self.site.getNamespaceIndex(pre) + if ns: + # Ordinary namespace + t = m.group(2) + self.namespace = ns + elif pre in fam.langs.keys()\ + or pre in fam.get_known_families(site=self.site): + + if not firstPass: + # Can't make a local interwiki link to an interwiki link. + # That's just crazy! + raise Error("Improperly formatted interwiki link '%s'" + % text) + + # Interwiki link + t = m.group(2) + if pre in fam.langs.keys(): + newsite = pywikibot.Site(pre, fam) + else: + otherlang = self.site.lang + familyName = fam.get_known_families(site=self.site)[pre] + if familyName in ['commons', 'meta']: + otherlang = familyName + try: + newsite = pywikibot.Site(otherlang, familyName) + except ValueError: + raise Error("""\ +%s is not a local page on %s, and the %s family is +not supported by PyWikiBot!""" + % (title, self.site(), familyName)) + + # Redundant interwiki prefix to the local wiki + if newsite == self.site: + if not t: + # Can't have an empty self-link + raise Error("Invalid link title: '%s'" % text) + firstPass = False + continue + self.site = newsite + # If there's an initial colon after the interwiki, that also + # resets the default namespace + if t.startswith(":"): + self.namespace = 0 + t = t[1:] + break + + if u"#" in t: + t, sec = t.split(u'#', 1) + t, self.section = t.rstrip(), sec.lstrip() + else: + self.section = None + + # Reject illegal characters. + if Link.illegal_titles_pattern.search(t): + raise Error("Invalid title (contains illegal char(s)): '%s'" % text) + + # Pages with "/./" or "/../" appearing in the URLs will + # often be unreachable due to the way web browsers deal + #* with 'relative' URLs. Forbid them explicitly. + + if u'.' in t and ( + t == u'.' or t == u'..' + or t.startswith(u"./") + or t.startswith(u"../") + or u"/./" in t + or u"/../" in t + or t.endswith(u"/.") + or t.endswith(u"/..") + ): + raise Error("Invalid title (contains . / combinations): '%s'" + % text) + + # Magic tilde sequences? Nu-uh! + if u"~~~" in t: + raise Error("Invalid title (contains ~~~): '%s'" % text) + + if self.namespace != -1 and len(t) > 255: + raise Error("Invalid title (over 255 bytes): '%s'" % t) + + if self.site.case() == 'first-letter': + t = t[:1].upper() + t[1:] + + # Can't make a link to a namespace alone... + # "empty" local links can only be self-links + # with a fragment identifier. + if not t and self.site == self.source and self.namespace != 0: + raise ValueError("Invalid link (no page title): '%s'" % text) + + self.title = t + + +# Utility functions for parsing page titles + +def html2unicode(text, ignore = []): + """Return text, replacing HTML entities by equivalent unicode characters.""" + # This regular expression will match any decimal and hexadecimal entity and + # also entities that might be named entities. 
+ entityR = re.compile( + r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));') + # These characters are Html-illegal, but sadly you *can* find some of + # these and converting them to unichr(decimal) is unsuitable + convertIllegalHtmlEntities = { + 128 : 8364, # € + 130 : 8218, # ‚ + 131 : 402, # ƒ + 132 : 8222, # „ + 133 : 8230, # … + 134 : 8224, # † + 135 : 8225, # ‡ + 136 : 710, # ˆ + 137 : 8240, # ‰ + 138 : 352, # Š + 139 : 8249, # ‹ + 140 : 338, # Œ + 142 : 381, # Ž + 145 : 8216, # ‘ + 146 : 8217, # ’ + 147 : 8220, # “ + 148 : 8221, # ” + 149 : 8226, # • + 150 : 8211, # – + 151 : 8212, # — + 152 : 732, # ˜ + 153 : 8482, # ™ + 154 : 353, # š + 155 : 8250, # › + 156 : 339, # œ + 158 : 382, # ž + 159 : 376 # Ÿ + } + #ensuring that illegal   and , which have no known values, + #don't get converted to unichr(129), unichr(141) or unichr(157) + ignore = set(ignore) | set([129, 141, 157]) + result = u'' + i = 0 + found = True + while found: + text = text[i:] + match = entityR.search(text) + if match: + unicodeCodepoint = None + if match.group('decimal'): + unicodeCodepoint = int(match.group('decimal')) + elif match.group('hex'): + unicodeCodepoint = int(match.group('hex'), 16) + elif match.group('name'): + name = match.group('name') + if htmlentitydefs.name2codepoint.has_key(name): + # We found a known HTML entity. + unicodeCodepoint = htmlentitydefs.name2codepoint[name] + result += text[:match.start()] + try: + unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint] + except KeyError: + pass + if unicodeCodepoint and unicodeCodepoint not in ignore and (WIDEBUILD or unicodeCodepoint < 65534): + result += unichr(unicodeCodepoint) + else: + # Leave the entity unchanged + result += text[match.start():match.end()] + i = match.end() + else: + result += text + found = False + return result + +def url2unicode(title, site, site2 = None): + """Convert url-encoded text to unicode using site's encoding. + + If site2 is provided, try its encodings as well. Uses the first encoding + that doesn't cause an error. + + """ + # create a list of all possible encodings for both hint sites + encList = [site.encoding()] + list(site.encodings()) + if site2 and site2 <> site: + encList.append(site2.encoding()) + encList += list(site2.encodings()) + firstException = None + # try to handle all encodings (will probably retry utf-8) + for enc in encList: + try: + t = title.encode(enc) + t = urllib.unquote(t) + return unicode(t, enc) + except UnicodeError, ex: + if not firstException: + firstException = ex + pass + # Couldn't convert, raise the original exception + raise firstException +
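For illustration only (not part of the committed diff): the Link parser and the html2unicode helper above are expected to be used roughly as in the following Python 2 sketch. It assumes a working user-config.py for the 'wikipedia' family; the expected values in the comments depend on the family files (e.g. that the "Category" prefix resolves to namespace 14), so treat them as approximate.

    import pywikibot
    from pywikibot.page import Link, html2unicode

    site = pywikibot.Site('en', 'wikipedia')
    # Anchor text after '|' is split off first, then the namespace prefix
    # and the '#' section are parsed out of the remaining title.
    link = Link(u"Category:Python_programming#History|the docs", source=site)
    print link.namespace   # 14 (Category namespace)
    print link.title       # u'Python programming' (underscores become spaces)
    print link.section     # u'History'
    print link.anchor      # u'the docs'
    # HTML entities in raw link text are resolved before parsing:
    print html2unicode(u"Caf&eacute;")   # u'Café'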
Added: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py (rev 0) +++ branches/rewrite/pywikibot/site.py 2008-02-27 20:08:48 UTC (rev 5088) @@ -0,0 +1,1947 @@ +# -*- coding: utf-8 -*- +""" +Objects representing MediaWiki sites (wikis) and families (groups of wikis +on the same topic in different languages). +""" +# +# (C) Pywikipedia bot team, 2008 +# +# Distributed under the terms of the MIT license. +# +__version__ = '$Id: $' + +import pywikibot +from pywikibot.exceptions import * +from pywikibot.data import api + +import os + +def Family(fam=None, fatal=True): + """Import the named family. + + @param fam: family name (if omitted, uses the configured default) + @type fam: str + @param fatal: if True, the bot will stop running if the given family is + unknown. If False, it will only raise a ValueError exception. + @param fatal: bool + @return: a Family instance configured for the named family. + + """ + if fam == None: + fam = pywikibot.default_family + try: + # first try the built-in families + exec "import pywikibot.families.%s_family as myfamily" % fam + except ImportError: + # next see if user has defined a local family module + try: + sys.path.append(pywikibot.config.datafilepath('families')) + exec "import %s_family as myfamily" % fam + except ImportError: + if fatal: + output(u"""\ +Error importing the %s family. This probably means the family +does not exist. Also check your configuration file.""" + % fam) + import traceback + traceback.print_stack() + sys.exit(1) + else: + raise ValueError("Family %s does not exist" % repr(fam)) + return myfamily.Family() + + +class BaseSite(object): + """Site methods that are independent of the communication interface.""" + # to implement a specific interface, define a Site class that inherits + # from this + def __init__(self, code, fam=None, user=None): + """ + @param code: the site's language code + @type code: str + @param fam: wiki family name (optional) + @type fam: str or Family + @param user: bot user name (optional) + @type user: str + + """ + self._lang = code.lower() + if isinstance(fam, basestring) or fam is None: + self._family = Family(fam, fatal=False) + else: + self._family = fam + +## # if we got an outdated language code, use the new one instead. +## if self._family.obsolete.has_key(self._lang): +## if self._family.obsolete[self._lang] is not None: +## self._lang = self._family.obsolete[self._lang] +## else: +## # no such language anymore +## raise NoSuchSite("Language %s in family %s is obsolete" +## % (self._lang, self._family.name)) +## +## if self._lang not in self.languages(): +## if self._lang == 'zh-classic' and 'zh-classical' in self.languages(): +## self._lang = 'zh-classical' +## # database hack (database is varchar[10] -> zh-classical +## # is cut to zh-classic. +## else: +## raise NoSuchSite("Language %s does not exist in family %s" +## % (self._lang, self._family.name)) + self._username = user + + def family(self): + """Return the associated Family object.""" + return self._family + + def language(self): + """Return the site's language code.""" + # N.B. 
this code does not always identify a language as such, but + # may identify a wiki that is part of any family grouping + return self._lang + + def user(self): + """Return the currently-logged in bot user, or None.""" + if self.logged_in(): + return self._username + return None + + def __getattr__(self, attr): + """Calls to methods not defined in this object are passed to Family.""" + try: + method = getattr(self.family(), attr) + return lambda self=self: method(self.language()) + except AttributeError: + raise AttributeError("%s instance has no attribute '%s'" + % (self.__class__.__name__, attr) + ) + + def sitename(self): + """Return string representing this Site's name and language.""" + return self.family().name+':'+self.language() + + __str__ = sitename + + def __repr__(self): + return 'Site("%s", "%s")' % (self.language(), self.family().name) + + def linktrail(self): + """Return regex for trailing chars displayed as part of a link.""" + return self.family().linktrail(self.language()) + + def languages(self): + """Return list of all valid language codes for this site's Family.""" + return self.family().langs.keys() + + def getNamespaceIndex(self, namespace): + """Given a namespace name, return its int index, or None if invalid.""" + return self.family().getNamespaceIndex(self.language(), namespace) + + +class APISite(BaseSite): + """API interface to MediaWiki site. + + Do not use directly; use pywikibot.Site function. + + """ +## Site methods from version 1.0 (as these are implemented in this file, +## or declared deprecated/obsolete, they will be removed from this list) +########## +## validLanguageLinks: A list of language codes that can be used in interwiki +## links. +## +## messages: return True if there are new messages on the site +## cookies: return user's cookies as a string +## +## getUrl: retrieve an URL from the site +## urlEncode: Encode a query to be sent using an http POST request. +## postForm: Post form data to an address at this site. +## postData: Post encoded form data to an http address at this site. +## +## namespace(num): Return local name of namespace 'num'. +## normalizeNamespace(value): Return preferred name for namespace 'value' in +## this Site's language. +## namespaces: Return list of canonical namespace names for this Site. +## getNamespaceIndex(name): Return the int index of namespace 'name', or None +## if invalid. +## +## redirect: Return the localized redirect tag for the site. +## redirectRegex: Return compiled regular expression matching on redirect +## pages. +## mediawiki_message: Retrieve the text of a specified MediaWiki message +## has_mediawiki_message: True if this site defines specified MediaWiki +## message +## +## shared_image_repository: Return tuple of image repositories used by this +## site. +## category_on_one_line: Return True if this site wants all category links +## on one line. +## interwiki_putfirst: Return list of language codes for ordering of +## interwiki links. +## linkto(title): Return string in the form of a wikilink to 'title' +## isInterwikiLink(s): Return True if 's' is in the form of an interwiki +## link. +## getSite(lang): Return Site object for wiki in same family, language +## 'lang'. +## version: Return MediaWiki version string from Family file. +## versionnumber: Return int identifying the MediaWiki version. +## live_version: Return version number read from Special:Version. +## checkCharset(charset): Warn if charset doesn't match family file. 
+## +## linktrail: Return regex for trailing chars displayed as part of a link. +## disambcategory: Category in which disambiguation pages are listed. +## +## Methods that yield Page objects derived from a wiki's Special: pages +## (note, some methods yield other information in a tuple along with the +## Pages; see method docs for details) -- +## +## search(query): query results from Special:Search +## allpages(): Special:Allpages +## prefixindex(): Special:Prefixindex +## newpages(): Special:Newpages +## newimages(): Special:Log&type=upload +## longpages(): Special:Longpages +## shortpages(): Special:Shortpages +## categories(): Special:Categories (yields Category objects) +## deadendpages(): Special:Deadendpages +## ancientpages(): Special:Ancientpages +## lonelypages(): Special:Lonelypages +## unwatchedpages(): Special:Unwatchedpages (sysop accounts only) +## uncategorizedcategories(): Special:Uncategorizedcategories (yields +## Category objects) +## uncategorizedpages(): Special:Uncategorizedpages +## uncategorizedimages(): Special:Uncategorizedimages (yields +## ImagePage objects) +## unusedcategories(): Special:Unusuedcategories (yields Category) +## unusedfiles(): Special:Unusedimages (yields ImagePage) +## withoutinterwiki: Special:Withoutinterwiki +## linksearch: Special:Linksearch +## +## Convenience methods that provide access to properties of the wiki Family +## object; all of these are read-only and return a unicode string unless +## noted -- +## +## encoding: The current encoding for this site. +## encodings: List of all historical encodings for this site. +## category_namespace: Canonical name of the Category namespace on this +## site. +## category_namespaces: List of all valid names for the Category +## namespace. +## image_namespace: Canonical name of the Image namespace on this site. +## template_namespace: Canonical name of the Template namespace on this +## site. +## protocol: Protocol ('http' or 'https') for access to this site. +## hostname: Host portion of site URL. +## path: URL path for index.php on this Site. +## dbName: MySQL database name. +## +## Methods that return addresses to pages on this site (usually in +## Special: namespace); these methods only return URL paths, they do not +## interact with the wiki -- +## +## export_address: Special:Export. +## query_address: URL path + '?' for query.php +## api_address: URL path + '?' for api.php +## apipath: URL path for api.php +## move_address: Special:Movepage. +## delete_address(s): Delete title 's'. +## undelete_view_address(s): Special:Undelete for title 's' +## undelete_address: Special:Undelete. +## protect_address(s): Protect title 's'. +## unprotect_address(s): Unprotect title 's'. +## put_address(s): Submit revision to page titled 's'. +## get_address(s): Retrieve page titled 's'. +## nice_get_address(s): Short URL path to retrieve page titled 's'. +## edit_address(s): Edit form for page titled 's'. +## purge_address(s): Purge cache and retrieve page 's'. +## block_address: Block an IP address. +## unblock_address: Unblock an IP address. +## blocksearch_address(s): Search for blocks on IP address 's'. +## linksearch_address(s): Special:Linksearch for target 's'. +## search_address(q): Special:Search for query 'q'. +## allpages_address(s): Special:Allpages. +## newpages_address: Special:Newpages. +## longpages_address: Special:Longpages. +## shortpages_address: Special:Shortpages. +## unusedfiles_address: Special:Unusedimages. +## categories_address: Special:Categories. 
+## deadendpages_address: Special:Deadendpages. +## ancientpages_address: Special:Ancientpages. +## lonelypages_address: Special:Lonelypages. +## unwatchedpages_address: Special:Unwatchedpages. +## uncategorizedcategories_address: Special:Uncategorizedcategories. +## uncategorizedimages_address: Special:Uncategorizedimages. +## uncategorizedpages_address: Special:Uncategorizedpages. +## unusedcategories_address: Special:Unusedcategories. +## withoutinterwiki_address: Special:Withoutinterwiki. +## references_address(s): Special:Whatlinksere for page 's'. +## allmessages_address: Special:Allmessages. +## upload_address: Special:Upload. +## double_redirects_address: Special:Doubleredirects. +## broken_redirects_address: Special:Brokenredirects. +## login_address: Special:Userlogin. +## captcha_image_address(id): Special:Captcha for image 'id'. +## watchlist_address: Special:Watchlist editor. +## contribs_address(target): Special:Contributions for user 'target'. + + def __init__(self, code, fam=None, user=None): + BaseSite.__init__(self, code, fam, user) + self._namespaces = { + # these are the MediaWiki built-in names, which always work + # localized names are loaded later upon accessing the wiki + -2: [u"Media"], + -1: [u"Special"], + 0: [u""], + 1: [u"Talk"], + 2: [u"User"], + 3: [u"User talk"], + 4: [u"Project"], + 5: [u"Project talk"], + 6: [u"Image"], + 7: [u"Image talk"], + 8: [u"MediaWiki"], + 9: [u"MediaWiki talk"], + 10: [u"Template"], + 11: [u"Template talk"], + 12: [u"Help"], + 13: [u"Help talk"], + 14: [u"Category"], + 15: [u"Category talk"], + } + return +# START HERE + self._mediawiki_messages = {} + self.nocapitalize = self._lang in self.family().nocapitalize + self._userData = [False, False] + self._userName = [None, None] + self._isLoggedIn = [None, None] + self._isBlocked = [None, None] + self._messages = [None, None] + self._rights = [None, None] + self._token = [None, None] + self._cookies = [None, None] + # Calculating valid languages took quite long, so we calculate it once + # in initialization instead of each time it is used. + self._validlanguages = [] + for language in self.languages(): + if not language[:1].upper() + language[1:] in self.namespaces(): + self._validlanguages.append(language) + + def logged_in(self, sysop=False): + """Return True if logged in with specified privileges, otherwise False. + + @param sysop: if True, require sysop privileges. + + """ + if not hasattr(self, '_userinfo'): + return False + if self._userinfo['name'] != self._username: + return False + return (not sysop) or 'sysop' in self._userinfo['groups'] + + def loggedInAs(self, sysop = False): + """Return the current username if logged in, otherwise return None. + + DEPRECATED (use .user() method instead) + Checks if we're logged in by loading a page and looking for the login + link. We assume that we're not being logged out during a bot run, so + loading the test page is only required once. + + """ + logging.debug("Site.loggedInAs() method is deprecated.") + return self.logged_in(sysop) and self.user() + + def login(self, sysop=False): + """Log the user in if not already logged in.""" + if not self.logged_in(sysop): + loginMan = api.LoginManager(site=self, sysop=sysop) + if loginMan.login(retry = True): + self._username = loginMan.username + if hasattr(self, "_userinfo"): + del self._userinfo + self.getuserinfo() + + forceLogin = login # alias for backward-compatibility + + def getuserinfo(self): + """Retrieve userinfo from site and store in _userinfo attribute. 
+ + self._userinfo will be a dict with the following keys and values: + + - id: user id (numeric str) + - name: username (if user is logged in) + - anon: present if user is not logged in + - groups: list of groups (could be empty) + - rights: list of rights (could be empty) + - message: present if user has a new message on talk page + - blockinfo: present if user is blocked (dict) + + """ + if not hasattr(self, "_userinfo"): + uirequest = api.Request( + site=self, + action="query", + meta="userinfo", + uiprop="blockinfo|hasmsg|groups|rights" + ) + uidata = uirequest.submit() + assert 'query' in uidata, \ + "API userinfo response lacks 'query' key" + uidata = uidata['query'] + assert 'userinfo' in uidata, \ + "API userinfo response lacks 'userinfo' key" + self._userinfo = uidata['userinfo'] + return self._userinfo + + def getsiteinfo(self): + """Retrieve siteinfo from site and store in _siteinfo attribute.""" + if not hasattr(self, "_siteinfo"): + sirequest = api.Request( + site=self, + action="query", + meta="siteinfo", + siprop="general|namespaces|namespacealiases" + ) + try: + sidata = sirequest.submit() + except api.APIError: + # hack for older sites that don't support 1.12 properties + sirequest = api.Request( + site=self, + action="query", + meta="siteinfo", + siprop="general|namespaces" + ) + sidata = sirequest.submit() + + assert 'query' in sidata, \ + "API siteinfo response lacks 'query' key" + sidata = sidata['query'] + assert 'general' in sidata, \ + "API siteinfo response lacks 'general' key" + assert 'namespaces' in sidata, \ + "API siteinfo response lacks 'namespaces' key" + self._siteinfo = sidata['general'] + nsdata = sidata['namespaces'] + for nskey in nsdata: + ns = int(nskey) + if ns in self._namespaces: + if nsdata[nskey]["*"] in self._namespaces[ns]: + continue + # this is the preferred form so it goes at front of list + self._namespaces[ns].insert(0, nsdata[nskey]["*"]) + else: + self._namespaces[ns] = [nsdata[nskey]["*"]] + if 'namespacealiases' in sidata: + aliasdata = sidata['namespacealiases'] + for item in aliasdata: + # this is a less preferred form so it goes at the end + self._namespaces[int(item['id'])].append(item["*"]) + return self._siteinfo + + def case(self): + return self.getsiteinfo()['case'] + + def namespace(self, num, all = False): + """Return string containing local name of namespace 'num'. + + If optional argument 'all' is true, return a tuple of all recognized + values for this namespace. + + """ + return self._namespaces[num][0] + + +class NotImplementedYet: + + def isBlocked(self, sysop = False): + """Check if the user is blocked.""" + try: + text = self.getUrl(u'%saction=query&meta=userinfo&uiprop=blockinfo' + % self.api_address(), sysop=sysop) + return text.find('blockedby=') > -1 + except NotImplementedError: + return False + + def isAllowed(self, right, sysop = False): + """Check if the user has a specific right. 
+ Among possible rights: + * Actions: edit, move, delete, protect, upload + * User levels: autoconfirmed, sysop, bot, empty string (always true) + """ + if right == '' or right == None: + return True + else: + self._load(sysop = sysop) + index = self._userIndex(sysop) + return right in self._rights[index] + + def messages(self, sysop = False): + """Returns true if the user has new messages, and false otherwise.""" + self._load(sysop = sysop) + index = self._userIndex(sysop) + return self._messages[index] + + def cookies(self, sysop = False): + """Return a string containing the user's current cookies.""" + self._loadCookies(sysop = sysop) + index = self._userIndex(sysop) + return self._cookies[index] + + def _loadCookies(self, sysop = False): + """Retrieve session cookies for login""" + index = self._userIndex(sysop) + if self._cookies[index] is not None: + return + try: + if sysop: + try: + username = config.sysopnames[self.family().name][self.language()] + except KeyError: + raise NoUsername("""\ +You tried to perform an action that requires admin privileges, but you haven't +entered your sysop name in your user-config.py. Please add +sysopnames['%s']['%s']='name' to your user-config.py""" + % (self.family().name, self.language())) + else: + username = config.usernames[self.family().name][self.language()] + except KeyError: + self._cookies[index] = None + self._isLoggedIn[index] = False + else: + tmp = '%s-%s-%s-login.data' % ( + self.family().name, self.language(), username) + fn = config.datafilepath('login-data', tmp) + if not os.path.exists(fn): + self._cookies[index] = None + self._isLoggedIn[index] = False + else: + f = open(fn) + self._cookies[index] = '; '.join([x.strip() for x in f.readlines()]) + f.close() + + def urlEncode(self, query): + """Encode a query so that it can be sent using an http POST request.""" + if not query: + return None + if hasattr(query, 'iteritems'): + iterator = query.iteritems() + else: + iterator = iter(query) + l = [] + wpEditToken = None + for key, value in iterator: + if isinstance(key, unicode): + key = key.encode('utf-8') + if isinstance(value, unicode): + value = value.encode('utf-8') + key = urllib.quote(key) + value = urllib.quote(value) + if key == 'wpEditToken': + wpEditToken = value + continue + l.append(key + '=' + value) + + # wpEditToken is explicitly added as last value. + # If a premature connection abort occurs while putting, the server will + # not have received an edit token and thus refuse saving the page + if wpEditToken != None: + l.append('wpEditToken=' + wpEditToken) + return '&'.join(l) + + def postForm(self, address, predata, sysop=False, useCookie=True): + """Post http form data to the given address at this site. + + address is the absolute path without hostname. + predata is a dict or any iterable that can be converted to a dict, + containing keys and values for the http form. + + Return a (response, data) tuple, where response is the HTTP + response object and data is a Unicode string containing the + body of the response. + + """ + data = self.urlEncode(predata) + try: + return self.postData(address, data, sysop=sysop, + useCookie=useCookie) + except socket.error, e: + raise ServerError(e) + + def postData(self, address, data, + contentType='application/x-www-form-urlencoded', + sysop=False, useCookie=True, compress=True): + """Post encoded data to the given http address at this site. + + address is the absolute path without hostname. + data is an ASCII string that has been URL-encoded. 
+ + Returns a (response, data) tuple where response is the HTTP + response object and data is a Unicode string containing the + body of the response. + """ + + # TODO: add the authenticate stuff here + + if False: #self.persistent_http: + conn = self.conn + else: + # Encode all of this into a HTTP request + if self.protocol() == 'http': + conn = httplib.HTTPConnection(self.hostname()) + elif self.protocol() == 'https': + conn = httplib.HTTPSConnection(self.hostname()) + # otherwise, it will crash, as other protocols are not supported + + conn.putrequest('POST', address) + conn.putheader('Content-Length', str(len(data))) + conn.putheader('Content-type', contentType) + conn.putheader('User-agent', useragent) + if useCookie and self.cookies(sysop = sysop): + conn.putheader('Cookie', self.cookies(sysop = sysop)) + if False: #self.persistent_http: + conn.putheader('Connection', 'Keep-Alive') + if compress: + conn.putheader('Accept-encoding', 'gzip') + conn.endheaders() + conn.send(data) + + # Prepare the return values + # Note that this can raise network exceptions which are not + # caught here. + try: + response = conn.getresponse() + except httplib.BadStatusLine: + # Blub. + conn.close() + conn.connect() + return self.postData(address, data, contentType, sysop, useCookie) + + data = response.read() + + if compress and response.getheader('Content-Encoding') == 'gzip': + data = decompress_gzip(data) + + data = data.decode(self.encoding()) + response.close() + + if True: #not self.persistent_http: + conn.close() + + # If a wiki page, get user data + self._getUserData(data, sysop = sysop) + + return response, data + + def getUrl(self, path, retry = True, sysop = False, data = None, compress = True): + """ + Low-level routine to get a URL from the wiki. + + Parameters: + path - The absolute path, without the hostname. + retry - If True, retries loading the page when a network error + occurs. + sysop - If True, the sysop account's cookie will be used. + data - An optional dict providing extra post request parameters + + Returns the HTML text of the page converted to unicode. + """ + if False: #self.persistent_http and not data: + self.conn.putrequest('GET', path) + self.conn.putheader('User-agent', useragent) + self.conn.putheader('Cookie', self.cookies(sysop = sysop)) + self.conn.putheader('Connection', 'Keep-Alive') + if compress: + self.conn.putheader('Accept-encoding', 'gzip') + self.conn.endheaders() + + # Prepare the return values + # Note that this can raise network exceptions which are not + # caught here. + try: + response = self.conn.getresponse() + except httplib.BadStatusLine: + # Blub. + self.conn.close() + self.conn.connect() + return self.getUrl(path, retry, sysop, data, compress) + + text = response.read() + headers = dict(response.getheaders()) + + else: + if self.hostname() in config.authenticate.keys(): + uo = authenticateURLopener + else: + uo = MyURLopener() + if self.cookies(sysop = sysop): + uo.addheader('Cookie', self.cookies(sysop = sysop)) + if compress: + uo.addheader('Accept-encoding', 'gzip') + + url = '%s://%s%s' % (self.protocol(), self.hostname(), path) + data = self.urlEncode(data) + + # Try to retrieve the page until it was successfully loaded (just in + # case the server is down or overloaded). + # Wait for retry_idle_time minutes (growing!) between retries. 
+ retry_idle_time = 1 + retrieved = False + while not retrieved: + try: + if self.hostname() in config.authenticate.keys(): + if False: # compress: + request = urllib2.Request(url, data) + request.add_header('Accept-encoding', 'gzip') + opener = urllib2.build_opener() + f = opener.open(request) + else: + f = urllib2.urlopen(url, data) + else: + f = uo.open(url, data) + retrieved = True + except KeyboardInterrupt: + raise + except Exception, e: + if retry: + # We assume that the server is down. Wait some time, then try again. + output(u"%s" % e) + output(u"""\ +WARNING: Could not open '%s://%s%s'. Maybe the server or +your connection is down. Retrying in %i minutes...""" + % (self.protocol(), self.hostname(), path, + retry_idle_time)) + time.sleep(retry_idle_time * 60) + # Next time wait longer, but not longer than half an hour + retry_idle_time *= 2 + if retry_idle_time > 30: + retry_idle_time = 30 + else: + raise + text = f.read() + + headers = f.info() + + contentType = headers.get('content-type', '') + contentEncoding = headers.get('content-encoding', '') + + # Ensure that all sent data is received + if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers: + output(u'Warning! len(text) does not match content-length: %s != %s' % \ + (len(text), headers.get('content-length'))) + if False: #self.persistent_http + self.conn.close() + self.conn.connect() + return self.getUrl(path, retry, sysop, data, compress) + + if compress and contentEncoding == 'gzip': + text = decompress_gzip(text) + + R = re.compile('charset=([^\'\";]+)') + m = R.search(contentType) + if m: + charset = m.group(1) + else: + output(u"WARNING: No character set found.") + # UTF-8 as default + charset = 'utf-8' + # Check if this is the charset we expected + self.checkCharset(charset) + # Convert HTML to Unicode + try: + text = unicode(text, charset, errors = 'strict') + except UnicodeDecodeError, e: + print e + output(u'ERROR: Invalid characters found on %s://%s%s, replaced by \ufffd.' % (self.protocol(), self.hostname(), path)) + # We use errors='replace' in case of bad encoding. + text = unicode(text, charset, errors = 'replace') + + # If a wiki page, get user data + self._getUserData(text, sysop = sysop) + + return text + + def _getUserData(self, text, sysop = False): + """ + Get the user data from wiki page data. + + Parameters: + * text - the page text + * sysop - is the user a sysop? + """ + if '<div id="globalWrapper">' not in text: + # Not a wiki page + return + + index = self._userIndex(sysop) + + # Check for blocks - but only if version is 1.11 (userinfo is available) + # and the user data was not yet loaded + if self.versionnumber() >= 11 and not self._userData[index]: + blocked = self.isBlocked(sysop = sysop) + if blocked and not self._isBlocked[index]: + # Write a warning if not shown earlier + if sysop: + account = 'Your sysop account' + else: + account = 'Your account' + output(u'WARNING: %s on %s is blocked. Editing using this account will stop the run.'
% (account, self)) + self._isBlocked[index] = blocked + + # Check for new messages + if '<div class="usermessage">' in text: + if not self._messages[index]: + # User has *new* messages + if sysop: + output(u'NOTE: You have new messages in your sysop account on %s' % self) + else: + output(u'NOTE: You have new messages on %s' % self) + self._messages[index] = True + else: + self._messages[index] = False + + # Don't perform other checks if the data was already loaded + if self._userData[index]: + return + + # Search for the user page link at the top. + # Note that the link of anonymous users (which doesn't exist at all + # in Wikimedia sites) has the ID pt-anonuserpage, and thus won't be + # found here. + userpageR = re.compile('<li id="pt-userpage"><a href=".+?">(?P<username>.+?)</a></li>') + m = userpageR.search(text) + if m: + self._isLoggedIn[index] = True + self._userName[index] = m.group('username') + else: + self._isLoggedIn[index] = False + # No idea what is the user name, and it isn't important + self._userName[index] = None + + # Check user groups, if possible (introduced in 1.10) + groupsR = re.compile(r'var wgUserGroups = \["(.+)"\];') + m = groupsR.search(text) + if m: + rights = m.group(1) + rights = rights.split('", "') + if '*' in rights: + rights.remove('*') + self._rights[index] = rights + # Warnings + # Don't show warnings for not logged in users, they will just fail to + # do any action + if self._isLoggedIn[index]: + if 'bot' not in self._rights[index]: + if sysop: + output(u'Note: Your sysop account on %s does not have a bot flag. Its edits will be visible in the recent changes.' % self) + else: + output(u'WARNING: Your account on %s does not have a bot flag. Its edits will be visible in the recent changes and it may get blocked.' % self) + if sysop and 'sysop' not in self._rights[index]: + output(u'WARNING: Your sysop account on %s does not seem to have sysop rights. You may not be able to perform any sysop-restricted actions using it.'
% self) + else: + # We don't have wgUserGroups, and can't check the rights + self._rights[index] = [] + if self._isLoggedIn[index]: + # Logged in user + self._rights[index].append('user') + # Assume bot, and thus autoconfirmed + self._rights[index].extend(['bot', 'autoconfirmed']) + if sysop: + # Assume user reported as a sysop indeed has the sysop rights + self._rights[index].append('sysop') + # Assume the user has the default rights + self._rights[index].extend(['read', 'createaccount', 'edit', 'upload', 'createpage', 'createtalk', 'move', 'upload']) + if 'bot' in self._rights[index] or 'sysop' in self._rights[index]: + self._rights[index].append('apihighlimits') + if 'sysop' in self._rights[index]: + self._rights[index].extend(['delete', 'undelete', 'block', 'protect', 'import', 'deletedhistory', 'unwatchedpages']) + + # Search for a token + tokenR = re.compile(r"<input type='hidden' value=\"(.*?)\" name=\"wpEditToken\"") + tokenloc = tokenR.search(text) + if tokenloc: + self._token[index] = tokenloc.group(1) + if self._rights[index] is not None: + # In this case, token and rights are loaded - user data is now loaded + self._userData[index] = True + else: + # Token not found + # Possible reason for this is the user is blocked, don't show a + # warning in this case, otherwise do show a warning + # Another possible reason is that the page cannot be edited - ensure + # there is a textarea and the tab "view source" is not shown + if u'<textarea' in text and u'<li id="ca-viewsource"' not in text and not self._isBlocked[index]: + # Token not found + output(u'WARNING: Token not found on %s. You will not be able to edit any page.' % self) + + def mediawiki_message(self, key): + """Return the MediaWiki message text for key "key" """ + global mwpage, tree + if key.lower() not in self._mediawiki_messages.keys() \ + and not hasattr(self, "_phploaded"): + get_throttle() + mwpage = self.getUrl("%s?title=%s:%s&action=edit" + % (self.path(), urllib.quote( + self.namespace(8).replace(' ', '_').encode( + self.encoding())), + key)) + tree = BeautifulSoup(mwpage, + convertEntities=BeautifulSoup.HTML_ENTITIES, + parseOnlyThese=SoupStrainer("textarea")) + if tree.textarea is not None and tree.textarea.string is not None: + value = tree.textarea.string.strip() + else: + value = None + if value: + self._mediawiki_messages[key.lower()] = value + else: + self._mediawiki_messages[key.lower()] = None + # Fallback in case MediaWiki: page method doesn't work + if verbose: + output( + u"Retrieving mediawiki messages from Special:Allmessages") + retry_idle_time = 1 + while True: + get_throttle() + phppage = self.getUrl(self.get_address("Special:Allmessages") + + "&ot=php") + Rphpvals = re.compile(r"(?ms)'([^']*)' => '(.*?[^\\])',") + count = 0 + for (phpkey, phpval) in Rphpvals.findall(phppage): + count += 1 + self._mediawiki_messages[str(phpkey).lower()] = phpval + if count == 0: + # No messages could be added. + # We assume that the server is down. + # Wait some time, then try again. + output('WARNING: No messages found in Special:Allmessages. Maybe the server is down. Retrying in %i minutes...'
% retry_idle_time) + time.sleep(retry_idle_time * 60) + # Next time wait longer, but not longer than half an hour + retry_idle_time *= 2 + if retry_idle_time > 30: + retry_idle_time = 30 + continue + break + self._phploaded = True + + key = key.lower() + if self._mediawiki_messages[key] is None: + raise KeyError("MediaWiki key '%s' does not exist on %s" + % (key, self)) + return self._mediawiki_messages[key] + + def has_mediawiki_message(self, key): + """Return True iff this site defines a MediaWiki message for 'key'.""" + try: + v = self.mediawiki_message(key) + return True + except KeyError: + return False + + def _load(self, sysop = False): + """ + Loads user data. + This is only done if we didn't get any page yet and the information + is requested, otherwise we should already have this data. + + Parameters: + * sysop - Get sysop user data? + """ + index = self._userIndex(sysop) + if self._userData[index]: + return + + if verbose: + output(u'Getting information for site %s' % self) + + # Get data + url = self.edit_address('Non-existing_page') + text = self.getUrl(url, sysop = sysop) + + # Parse data + self._getUserData(text, sysop = sysop) + + def search(self, query, number = 10, namespaces = None): + """Yield search results (using Special:Search page) for query.""" + throttle = True + path = self.search_address(urllib.quote_plus(query), + n=number, ns=namespaces) + get_throttle() + html = self.getUrl(path) + + entryR = re.compile(ur'<li[^>]*><a href=".+?" title="(?P<title>.+?)">.+?</a>' + '<br />(?P<match>.*?)<span style="color[^>]*>.+?: ' + '(?P<relevance>[0-9.]+)% - ' +# '(?P<size>[0-9.]*) ' +# '(?P<sizeunit>[A-Za-z]) ' +# '((?P<words>.+?) \w+) - ' +# '(?P<date>.+?)</span></li>' + , re.DOTALL) + + for m in entryR.finditer(html): + page = Page(self, m.group('title')) + match = m.group('match') + relevance = m.group('relevance') + #size = m.group('size') + ## sizeunit appears to always be "KB" + #words = m.group('words') + #date = m.group('date') + + #print "%s - %s %s (%s words) - %s" % (relevance, size, sizeunit, words, date) + + #yield page, match, relevance, size, words, date + yield page, match, relevance, '', '', '' + + # TODO: avoid code duplication for the following methods + def newpages(self, number = 10, get_redirect = False, repeat = False): + """Yield new articles (as Page objects) from Special:Newpages. + + Starts with the newest article and fetches the number of articles + specified in the first argument. If repeat is True, it fetches + Newpages again. If there is no new page, it blocks until there is + one, sleeping between subsequent fetches of Newpages. + + The objects yielded are tuples composed of the Page object, + timestamp (unicode), length (int), an empty unicode string, username + or IP address (str), comment (unicode). + + """ + # TODO: in recent MW versions Special:Newpages takes a namespace parameter, + # and defaults to 0 if not specified. + # TODO: Detection of unregistered users is broken + # TODO: Repeat mechanism doesn't make much sense as implemented; + # should use both offset and limit parameters, and have an + # option to fetch older rather than newer pages + seen = set() + while True: + path = self.newpages_address(n=number) + # The throttling is important here, so always enabled. + get_throttle() + html = self.getUrl(path) + + entryR = re.compile( +'<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"' +' title="(?P<title>.+?)">.+?</a>.+?[\(\[](?P<length>[\d,.]+)[^\)\]]*[\)\]]' +' .?<a href=".+?"
title=".+?:(?P<username>.+?)">' + ) + for m in entryR.finditer(html): + date = m.group('date') + title = m.group('title') + title = title.replace('&quot;', '"') + length = int(re.sub("[,.]", "", m.group('length'))) + loggedIn = u'' + username = m.group('username') + comment = u'' + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, date, length, loggedIn, username, comment + if not repeat: + break + + def longpages(self, number = 10, repeat = False): + """Yield Pages from Special:Longpages. + + Return values are a tuple of Page object, length(int). + + """ + #TODO: should use offset and limit parameters; 'repeat' as now + # implemented is fairly useless + # this comment applies to all the XXXXpages methods following, as well + seen = set() + while True: + path = self.longpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile(ur'<li>\(<a href=".+?" title=".+?">hist</a>\) <a href=".+?" title="(?P<title>.+?)">.+?</a> \[(?P<length>\d+)(.+?)\]</li>') + for m in entryR.finditer(html): + title = m.group('title') + length = int(m.group('length')) + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, length + if not repeat: + break + + def shortpages(self, number = 10, repeat = False): + """Yield Pages and lengths from Special:Shortpages.""" + throttle = True + seen = set() + while True: + path = self.shortpages_address(n = number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile(ur'<li>\(<a href=".+?" title=".+?">hist</a>\) <a href=".+?" title="(?P<title>.+?)">.+?</a> \[(?P<length>\d+)(.+?)\]</li>') + for m in entryR.finditer(html): + title = m.group('title') + length = int(m.group('length')) + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, length + if not repeat: + break + + def categories(self, number=10, repeat=False): + """Yield Category objects from Special:Categories""" + import catlib + seen = set() + while True: + path = self.categories_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.*?</li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = catlib.Category(self, title) + yield page + if not repeat: + break + + def deadendpages(self, number = 10, repeat = False): + """Yield Page objects retrieved from Special:Deadendpages.""" + seen = set() + while True: + path = self.deadendpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def ancientpages(self, number = 10, repeat = False): + """Yield Pages, datestamps from Special:Ancientpages.""" + seen = set() + while True: + path = self.ancientpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( +'<li><a href=".+?"
title="(?P<title>.+?)">.+?</a> (?P<date>.+?)</li>') + for m in entryR.finditer(html): + title = m.group('title') + date = m.group('date') + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, date + if not repeat: + break + + def lonelypages(self, number = 10, repeat = False): + """Yield Pages retrieved from Special:Lonelypages.""" + throttle = True + seen = set() + while True: + path = self.lonelypages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def unwatchedpages(self, number = 10, repeat = False): + """Yield Pages from Special:Unwatchedpages (requires Admin privileges).""" + seen = set() + while True: + path = self.unwatchedpages_address(n=number) + get_throttle() + html = self.getUrl(path, sysop = True) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.+?</li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def uncategorizedcategories(self, number = 10, repeat = False): + """Yield Categories from Special:Uncategorizedcategories.""" + import catlib + seen = set() + while True: + path = self.uncategorizedcategories_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = catlib.Category(self, title) + yield page + if not repeat: + break + + def newimages(self, number = 10, repeat = False): + """Yield ImagePages from Special:Log&type=upload""" + + seen = set() + regexp = re.compile('<li[^>]*>(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+(.+?</a>).*?<a href=".*?"(?P<new> class="new")? title="(?P<image>.+?)"\s*>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?', re.UNICODE) + + while True: + path = self.log_address(number, mode = 'upload') + get_throttle() + html = self.getUrl(path) + + for m in regexp.finditer(html): + image = m.group('image') + + if image not in seen: + seen.add(image) + + if m.group('new'): + output(u"Image '%s' has been deleted." % image) + continue + + date = m.group('date') + user = m.group('user') + comment = m.group('comment') or '' + + yield ImagePage(self, image), date, user, comment + if not repeat: + break + + def uncategorizedimages(self, number = 10, repeat = False): + """Yield ImagePages from Special:Uncategorizedimages.""" + seen = set() + ns = self.image_namespace() + entryR = re.compile( + '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns) + while True: + path = self.uncategorizedimages_address(n=number) + get_throttle() + html = self.getUrl(path) + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = ImagePage(self, title) + yield page + if not repeat: + break + + def uncategorizedpages(self, number = 10, repeat = False): + """Yield Pages from Special:Uncategorizedpages.""" + seen = set() + while True: + path = self.uncategorizedpages_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile( + '<li><a href=".+?" 
title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def unusedcategories(self, number = 10, repeat = False): + """Yield Category objects from Special:Unusedcategories.""" + import catlib + seen = set() + while True: + path = self.unusedcategories_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + + if title not in seen: + seen.add(title) + page = catlib.Category(self, title) + yield page + if not repeat: + break + + def unusedfiles(self, number = 10, repeat = False, extension = None): + """Yield ImagePage objects from Special:Unusedimages.""" + seen = set() + ns = self.image_namespace() + entryR = re.compile( + '<a href=".+?" title="(?P<title>%s:.+?)">.+?</a>' % ns) + while True: + path = self.unusedfiles_address(n=number) + get_throttle() + html = self.getUrl(path) + for m in entryR.finditer(html): + fileext = None + title = m.group('title') + if extension: + fileext = title[len(title)-3:] + if title not in seen and fileext == extension: + ## Check whether the media is used in a Proofread page + # code disabled because it slows this method down, and + # because it is unclear what it's supposed to do. + #basename = title[6:] + #page = Page(self, 'Page:' + basename) + + #if not page.exists(): + seen.add(title) + image = ImagePage(self, title) + yield image + if not repeat: + break + + def withoutinterwiki(self, number=10, repeat=False): + """Yield Pages without language links from Special:Withoutinterwiki.""" + seen = set() + while True: + path = self.withoutinterwiki_address(n=number) + get_throttle() + html = self.getUrl(path) + entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>') + for m in entryR.finditer(html): + title = m.group('title') + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page + if not repeat: + break + + def allpages(self, start='!', namespace=0, includeredirects=True, + throttle=True): + """Yield all Pages from Special:Allpages. + + Parameters: + start Start at this page. By default, it starts at '!', and yields + all pages. + namespace Yield all pages in this namespace; defaults to 0. + MediaWiki software will only return pages in one namespace + at a time. + + If includeredirects is False, redirects will not be found. + If includeredirects equals the string 'only', only redirects + will be found. Note that this has not been tested on older + versions of the MediaWiki code. + + It is advised not to use this directly, but to use the + AllpagesPageGenerator from pagegenerators.py instead. + + """ + while True: + # encode Non-ASCII characters in hexadecimal format (e.g. 
%F6) + start = start.encode(self.encoding()) + start = urllib.quote(start) + # load a list which contains a series of article names (always 480) + path = self.allpages_address(start, namespace) + output(u'Retrieving Allpages special page for %s from %s, namespace %i' % (repr(self), start, namespace)) + returned_html = self.getUrl(path) + # Try to find begin and end markers + try: + # In 1.4, another table was added above the navigational links + if self.versionnumber() >= 4: + begin_s = '</table><hr /><table' + end_s = '</table' + else: + begin_s = '<table' + end_s = '</table' + ibegin = returned_html.index(begin_s) + iend = returned_html.index(end_s,ibegin + 3) + except ValueError: + raise ServerError( +"Couldn't extract allpages special page. Make sure you're using MonoBook skin.") + # remove the irrelevant sections + returned_html = returned_html[ibegin:iend] + if self.versionnumber()==2: + R = re.compile('/wiki/(.*?)\" *class=[\'\"]printable') + elif self.versionnumber()<5: + # Apparently the special code for redirects was added in 1.5 + R = re.compile('title ?="(.*?)"') + elif not includeredirects: + R = re.compile('<td(?: width="33%")?><a href="\S*" +title ?="(.*?)"') + elif includeredirects == 'only': + R = re.compile('<td(?: width="33%")?><[^<>]*allpagesredirect"><a href="\S*" +title ?="(.*?)"') + else: + R = re.compile('title ?="(.*?)"') + # Count the number of useful links on this page + n = 0 + for hit in R.findall(returned_html): + # count how many articles we found on the current page + n = n + 1 + if self.versionnumber()==2: + yield Page(self, url2link(hit, site = self, insite = self)) + else: + yield Page(self, hit) + # save the last hit, so that we know where to continue when we + # finished all articles on the current page. Append a '!' so that + # we don't yield a page twice. + start = Page(self,hit).titleWithoutNamespace() + '!' + # A small shortcut: if there are less than 100 pages listed on this + # page, there is certainly no next. Probably 480 would do as well, + # but better be safe than sorry. + if n < 100: + if (not includeredirects) or includeredirects == 'only': + # Maybe there were only so few because the rest is or is not a redirect + R = re.compile('title ?="(.*?)"') + allLinks = R.findall(returned_html) + if len(allLinks) < 100: + break + elif n == 0: + # In this special case, no pages of the requested type + # were found, and "start" will remain and be double-encoded. + # Use the last page as the start of the next page. + start = Page(self, allLinks[-1]).titleWithoutNamespace() + '!' + else: + break + + def prefixindex(self, prefix, namespace=0, includeredirects=True): + """Yield all pages with a given prefix. + + Parameters: + prefix The prefix of the pages. + namespace Namespace number; defaults to 0. + MediaWiki software will only return pages in one namespace + at a time. + + If includeredirects is False, redirects will not be found. + If includeredirects equals the string 'only', only redirects + will be found. Note that this has not been tested on older + versions of the MediaWiki code. + + It is advised not to use this directly, but to use the + PrefixingPageGenerator from pagegenerators.py instead.
+        """
+        for page in self.allpages(start = prefix, namespace = namespace, includeredirects = includeredirects):
+            if page.titleWithoutNamespace().startswith(prefix):
+                yield page
+            else:
+                break
+
+    def linksearch(self, siteurl):
+        """Yield Pages from results of Special:Linksearch for 'siteurl'."""
+        if siteurl.startswith('*.'):
+            siteurl = siteurl[2:]
+        output(u'Querying [[Special:Linksearch]]...')
+        cache = []
+        for url in [siteurl, '*.' + siteurl]:
+            path = self.linksearch_address(url)
+            get_throttle()
+            html = self.getUrl(path)
+            loc = html.find('<div class="mw-spcontent">')
+            if loc > -1:
+                html = html[loc:]
+            loc = html.find('<div class="printfooter">')
+            if loc > -1:
+                html = html[:loc]
+            R = re.compile('title ?="(.*?)"')
+            for title in R.findall(html):
+                if not siteurl in title:
+                    # the links themselves have a similar form
+                    if title in cache:
+                        continue
+                    else:
+                        cache.append(title)
+                        yield Page(self, title)
+
+    def linkto(self, title, othersite = None):
+        """Return unicode string in the form of a wikilink to 'title'
+
+        Use optional Site argument 'othersite' to generate an interwiki link.
+
+        """
+        if othersite and othersite.language() != self.language():
+            return u'[[%s:%s]]' % (self.language(), title)
+        else:
+            return u'[[%s]]' % title
+
+    def isInterwikiLink(self, s):
+        """Return True if s is in the form of an interwiki link.
+
+        Interwiki links have the form "foo:bar" or ":foo:bar" where foo is a
+        known language code or family. Called recursively if the first part
+        of the link refers to this site's own family and/or language.
+
+        """
+        s = s.strip().lstrip(":")
+        if not ':' in s:
+            return False
+        first, rest = s.split(':',1)
+        # interwiki codes are case-insensitive
+        first = first.lower().strip()
+        # commons: forwards interlanguage links to wikipedia:, etc.
+        if self.family().interwiki_forward:
+            interlangTargetFamily = Family(self.family().interwiki_forward)
+        else:
+            interlangTargetFamily = self.family()
+        if self.getNamespaceIndex(first):
+            return False
+        if first in interlangTargetFamily.langs:
+            if first == self.language():
+                return self.isInterwikiLink(rest)
+            else:
+                return True
+        if first in self.family().get_known_families(site = self):
+            if first == self.family().name:
+                return self.isInterwikiLink(rest)
+            else:
+                return True
+        return False
+
+    def redirect(self, default = False):
+        """Return the localized redirect tag for the site.
+
+        If default is True, falls back to 'REDIRECT' if the site has no
+        special redirect tag.
+
+        """
+        if default:
+            if self.language() == 'ar':
+                # It won't work with REDIRECT[[]], but it works with the local
+                # tag; if there are problems, try to find a workaround. FixMe!
+                return self.family().redirect.get(self.language(), [u"تحويل"])[0]
+            else:
+                return self.family().redirect.get(self.language(), [u"REDIRECT"])[0]
+        else:
+            return self.family().redirect.get(self.language(), None)
+
+    def redirectRegex(self):
+        """Return a compiled regular expression matching on redirect pages.
+
+        Group 1 in the regex match object will be the target title.
+
+        """
+        redDefault = 'redirect'
+        red = 'redirect'
+        if self.language() == 'ar':
+            red = u"تحويل"
+        try:
+            if redDefault == red:
+                redirKeywords = [red] + self.family().redirect[self.language()]
+                redirKeywordsR = r'(?:' + '|'.join(redirKeywords) + ')'
+            else:
+                redirKeywords = [red] + self.family().redirect[self.language()]
+                redirKeywordsR = r'(?:' + redDefault + '|' + '|'.join(redirKeywords) + ')'
+        except KeyError:
+            # no localized keyword for redirects
+            if redDefault == red:
+                redirKeywordsR = r'%s' % red
+            else:
+                redirKeywordsR = r'(?:%s|%s)' % (red, redDefault)
+        # A redirect starts with hash (#), followed by a keyword, then
+        # arbitrary stuff, then a wikilink. The wikilink may contain
+        # a label, although this is not useful.
+        return re.compile(r'#' + redirKeywordsR
+                          + '.*?\[\[(.*?)(?:\|.*?)?\]\]',
+                          re.IGNORECASE | re.UNICODE | re.DOTALL)
+
+    # The following methods are for convenience, so that you can access
+    # methods of the Family class easily.
+    def encoding(self):
+        """Return the current encoding for this site."""
+        return self.family().code2encoding(self.language())
+
+    def encodings(self):
+        """Return a list of all historical encodings for this site."""
+        return self.family().code2encodings(self.language())
+
+    def category_namespace(self):
+        """Return the canonical name of the Category namespace on this site."""
+        # equivalent to self.namespace(14)?
+        return self.family().category_namespace(self.language())
+
+    def category_namespaces(self):
+        """Return a list of all valid names for the Category namespace."""
+        return self.family().category_namespaces(self.language())
+
+    def image_namespace(self, fallback = '_default'):
+        """Return the canonical name of the Image namespace on this site."""
+        # equivalent to self.namespace(6)?
+        return self.family().image_namespace(self.language(), fallback)
+
+    def template_namespace(self, fallback = '_default'):
+        """Return the canonical name of the Template namespace on this site."""
+        # equivalent to self.namespace(10)?
+        return self.family().template_namespace(self.language(), fallback)
+
+    def export_address(self):
+        """Return URL path for Special:Export."""
+        return self.family().export_address(self.language())
+
+    def query_address(self):
+        """Return URL path + '?' for query.php (if enabled on this Site)."""
+        return self.family().query_address(self.language())
+
+    def api_address(self):
+        """Return URL path + '?' for api.php (if enabled on this Site)."""
+        return self.family().api_address(self.language())
+
+    def apipath(self):
+        """Return URL path for api.php (if enabled on this Site)."""
+        return self.family().apipath(self.language())
+
+    def protocol(self):
+        """Return protocol ('http' or 'https') for access to this site."""
+        return self.family().protocol(self.language())
+
+    def hostname(self):
+        """Return host portion of site URL."""
+        return self.family().hostname(self.language())
+
+    def path(self):
+        """Return URL path for index.php on this Site."""
+        return self.family().path(self.language())
+
+    def dbName(self):
+        """Return MySQL database name."""
+        return self.family().dbName(self.language())
+
+    def move_address(self):
+        """Return URL path for Special:Movepage."""
+        return self.family().move_address(self.language())
+
+    def delete_address(self, s):
+        """Return URL path to delete title 's'."""
+        return self.family().delete_address(self.language(), s)
+
+    def undelete_view_address(self, s, ts=''):
+        """Return URL path to view Special:Undelete for title 's'
+
+        Optional argument 'ts' returns path to view specific deleted version.
+
+        """
+        return self.family().undelete_view_address(self.language(), s, ts)
+
+    def undelete_address(self):
+        """Return URL path to Special:Undelete."""
+        return self.family().undelete_address(self.language())
+
+    def protect_address(self, s):
+        """Return URL path to protect title 's'."""
+        return self.family().protect_address(self.language(), s)
+
+    def unprotect_address(self, s):
+        """Return URL path to unprotect title 's'."""
+        return self.family().unprotect_address(self.language(), s)
+
+    def put_address(self, s):
+        """Return URL path to submit revision to page titled 's'."""
+        return self.family().put_address(self.language(), s)
+
+    def get_address(self, s):
+        """Return URL path to retrieve page titled 's'."""
+        return self.family().get_address(self.language(), s)
+
+    def nice_get_address(self, s):
+        """Return shorter URL path to retrieve page titled 's'."""
+        return self.family().nice_get_address(self.language(), s)
+
+    def edit_address(self, s):
+        """Return URL path for edit form for page titled 's'."""
+        return self.family().edit_address(self.language(), s)
+
+    def purge_address(self, s):
+        """Return URL path to purge cache and retrieve page 's'."""
+        return self.family().purge_address(self.language(), s)
+
+    def block_address(self):
+        """Return path to block an IP address."""
+        return self.family().block_address(self.language())
+
+    def unblock_address(self):
+        """Return path to unblock an IP address."""
+        return self.family().unblock_address(self.language())
+
+    def blocksearch_address(self, s):
+        """Return path to search for blocks on IP address 's'."""
+        return self.family().blocksearch_address(self.language(), s)
+
+    def linksearch_address(self, s, limit=500, offset=0):
+        """Return path to Special:Linksearch for target 's'."""
+        return self.family().linksearch_address(self.language(), s, limit=limit, offset=offset)
+
+    def search_address(self, q, n=50, ns=0):
+        """Return path to Special:Search for query 'q'."""
+        return self.family().search_address(self.language(), q, n, ns)
+
+    def allpages_address(self, s, ns = 0):
+        """Return path to Special:Allpages."""
+        return self.family().allpages_address(self.language(), start=s, namespace = ns)
+
+    def log_address(self, n=50, mode = ''):
+        """Return path to Special:Log."""
+        return self.family().log_address(self.language(), n, mode)
+
+    def newpages_address(self, n=50):
+        """Return path to Special:Newpages."""
+        return self.family().newpages_address(self.language(), n)
+
+    def longpages_address(self, n=500):
+        """Return path to Special:Longpages."""
+        return self.family().longpages_address(self.language(), n)
+
+    def shortpages_address(self, n=500):
+        """Return path to Special:Shortpages."""
+        return self.family().shortpages_address(self.language(), n)
+
+    def unusedfiles_address(self, n=500):
+        """Return path to Special:Unusedimages."""
+        return self.family().unusedfiles_address(self.language(), n)
+
+    def categories_address(self, n=500):
+        """Return path to Special:Categories."""
+        return self.family().categories_address(self.language(), n)
+
+    def deadendpages_address(self, n=500):
+        """Return path to Special:Deadendpages."""
+        return self.family().deadendpages_address(self.language(), n)
+
+    def ancientpages_address(self, n=500):
+        """Return path to Special:Ancientpages."""
+        return self.family().ancientpages_address(self.language(), n)
+
+    def lonelypages_address(self, n=500):
+        """Return path to Special:Lonelypages."""
+        return self.family().lonelypages_address(self.language(), n)
+
+    def unwatchedpages_address(self, n=500):
+        """Return path to Special:Unwatchedpages."""
+        return self.family().unwatchedpages_address(self.language(), n)
+
+    def uncategorizedcategories_address(self, n=500):
+        """Return path to Special:Uncategorizedcategories."""
+        return self.family().uncategorizedcategories_address(self.language(), n)
+
+    def uncategorizedimages_address(self, n=500):
+        """Return path to Special:Uncategorizedimages."""
+        return self.family().uncategorizedimages_address(self.language(), n)
+
+    def uncategorizedpages_address(self, n=500):
+        """Return path to Special:Uncategorizedpages."""
+        return self.family().uncategorizedpages_address(self.language(), n)
+
+    def unusedcategories_address(self, n=500):
+        """Return path to Special:Unusedcategories."""
+        return self.family().unusedcategories_address(self.language(), n)
+
+    def withoutinterwiki_address(self, n=500):
+        """Return path to Special:Withoutinterwiki."""
+        return self.family().withoutinterwiki_address(self.language(), n)
+
+    def references_address(self, s):
+        """Return path to Special:Whatlinkshere for page 's'."""
+        return self.family().references_address(self.language(), s)
+
+    def allmessages_address(self):
+        """Return path to Special:Allmessages."""
+        return self.family().allmessages_address(self.language())
+
+    def upload_address(self):
+        """Return path to Special:Upload."""
+        return self.family().upload_address(self.language())
+
+    def double_redirects_address(self, default_limit = True):
+        """Return path to Special:Doubleredirects."""
+        return self.family().double_redirects_address(self.language(), default_limit)
+
+    def broken_redirects_address(self, default_limit = True):
+        """Return path to Special:Brokenredirects."""
+        return self.family().broken_redirects_address(self.language(), default_limit)
+
+    def login_address(self):
+        """Return path to Special:Userlogin."""
+        return self.family().login_address(self.language())
+
+    def captcha_image_address(self, id):
+        """Return path to Special:Captcha for image 'id'."""
+        return self.family().captcha_image_address(self.language(), id)
+
+    def watchlist_address(self):
+        """Return path to Special:Watchlist editor."""
+        return self.family().watchlist_address(self.language())
+
+    def contribs_address(self, target, limit=500, offset=''):
+        """Return path to Special:Contributions for user 'target'."""
+        return self.family().contribs_address(self.language(), target, limit, offset)
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def version(self):
+        """Return MediaWiki version number as a string."""
+        return self.family().version(self.language())
+
+    def versionnumber(self):
+        """Return an int identifying MediaWiki version.
+
+        Currently this is implemented as returning the minor version
+        number; i.e., 'X' in version '1.X.Y'
+
+        """
+        return self.family().versionnumber(self.language())
+
+    def live_version(self):
+        """Return the 'real' version number found on [[Special:Version]]
+
+        Return value is a tuple (int, int, str) of the major and minor
+        version numbers and any other text contained in the version.
+
+        """
+        global htmldata
+        if not hasattr(self, "_mw_version"):
+            versionpage = self.getUrl(self.get_address("Special:Version"))
+            htmldata = BeautifulSoup(versionpage, convertEntities="html")
+            versionstring = htmldata.findAll(text="MediaWiki"
+                                             )[1].parent.nextSibling
+            m = re.match(r"^: ([0-9]+)\.([0-9]+)(.*)$", str(versionstring))
+            if m:
+                self._mw_version = (int(m.group(1)), int(m.group(2)),
+                                    m.group(3))
+            else:
+                self._mw_version = self.family().version(self.language()).split(".")
+        return self._mw_version
+
+    def checkCharset(self, charset):
+        """Warn if charset returned by wiki doesn't match family file."""
+        if not hasattr(self,'charset'):
+            self.charset = charset
+        assert self.charset.lower() == charset.lower(), \
+               "charset for %s changed from %s to %s" \
+               % (repr(self), self.charset, charset)
+        if self.encoding().lower() != charset.lower():
+            raise ValueError(
+"code2encodings has wrong charset for %s. It should be %s, but is %s"
+                             % (repr(self), charset, self.encoding()))
+
+    def shared_image_repository(self):
+        """Return a tuple of image repositories used by this site."""
+        return self.family().shared_image_repository(self.language())
+
+    def __cmp__(self, other):
+        """Perform equality and inequality tests on Site objects."""
+        if not isinstance(other, Site):
+            return 1
+        if self.family() == other.family():
+            return cmp(self.language(), other.language())
+        return cmp(self.family().name, other.family().name)
+
+    def category_on_one_line(self):
+        """Return True if this site wants all category links on one line."""
+        return self.language() in self.family().category_on_one_line
+
+    def interwiki_putfirst(self):
+        """Return list of language codes for ordering of interwiki links."""
+        return self.family().interwiki_putfirst.get(self.language(), None)
+
+    def interwiki_putfirst_doubled(self, list_of_links):
+        # TODO: is this even needed? No family in the framework has this
+        # dictionary defined!
+        if self.family().interwiki_putfirst_doubled.has_key(self.language()):
+            if len(list_of_links) >= self.family().interwiki_putfirst_doubled[self.language()][0]:
+                list_of_links2 = []
+                for lang in list_of_links:
+                    list_of_links2.append(lang.language())
+                list = []
+                for lang in self.family().interwiki_putfirst_doubled[self.language()][1]:
+                    try:
+                        list.append(list_of_links[list_of_links2.index(lang)])
+                    except ValueError:
+                        pass
+                return list
+            else:
+                return False
+        else:
+            return False
+
+    def getSite(self, code):
+        """Return Site object for language 'code' in this Family."""
+        return getSite(code = code, fam = self.family(), user=self.user)
+
+    def namespace(self, num, all = False):
+        """Return string containing local name of namespace 'num'.
+
+        If optional argument 'all' is true, return a tuple of all recognized
+        values for this namespace.
+
+        """
+        return self.family().namespace(self.language(), num, all = all)
+
+    def normalizeNamespace(self, value):
+        """Return canonical name for namespace 'value' in this Site's language.
+
+        'Value' should be a string or unicode.
+        If no match, return 'value' unmodified.
+
+        """
+        if not self.nocapitalize and value[0].islower():
+            value = value[0].upper() + value[1:]
+        return self.family().normalizeNamespace(self.language(), value)
+
+    def namespaces(self):
+        """Return list of canonical namespace names for this Site."""
+
+        # n.b.: this does not return namespace numbers; to determine which
+        # numeric namespaces the framework recognizes for this Site (which
+        # may or may not actually exist on the wiki), use
+        # self.family().namespaces.keys()
+
+        if _namespaceCache.has_key(self):
+            return _namespaceCache[self]
+        else:
+            nslist = []
+            for n in self.family().namespaces:
+                try:
+                    ns = self.family().namespace(self.language(), n)
+                except KeyError:
+                    # No default namespace defined
+                    continue
+                if ns is not None:
+                    nslist.append(self.family().namespace(self.language(), n))
+            _namespaceCache[self] = nslist
+            return nslist
+
+    def validLanguageLinks(self):
+        """Return list of language codes that can be used in interwiki links."""
+        return self._validlanguages
+
+    def disambcategory(self):
+        """Return Category in which disambig pages are listed."""
+        import catlib
+        try:
+            return catlib.Category(self,
+                    self.namespace(14)+':'+self.family().disambcatname[self.language()])
+        except KeyError:
+            raise NoPage
+
+    def getToken(self, getalways = True, getagain = False, sysop = False):
+        index = self._userIndex(sysop)
+        if getagain or (getalways and self._token[index] is None):
+            output(u'Getting a token.')
+            self._load(sysop = sysop)
+        if self._token[index] is not None:
+            return self._token[index]
+        else:
+            return False
+
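For orientation, here is a minimal usage sketch of the generator and convenience methods defined above. It is not part of the commit: it assumes a configured user-config.py and that the pywikibot.Site() factory returns an object exposing these methods.

    # -*- coding: utf-8 -*-
    # Hypothetical driver script; everything except the method names shown
    # above (withoutinterwiki, hostname, version) is an assumption.
    import pywikibot

    site = pywikibot.Site()     # default family/code from the configuration
    print repr(site), site.hostname(), site.version()

    # Sample a handful of pages lacking language links; the generator is
    # lazy, so the loop can stop after a few results.
    for i, page in enumerate(site.withoutinterwiki(number=10)):
        print page
        if i >= 4:
            break

Because these methods screen-scrape the Special pages, the wiki must be reachable and the regular expressions above must match its skin (the allpages code explicitly expects MonoBook).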
Modified: branches/rewrite/pywikibot/tests/api_tests.py
===================================================================
--- branches/rewrite/pywikibot/tests/api_tests.py	2008-02-27 20:05:28 UTC (rev 5087)
+++ branches/rewrite/pywikibot/tests/api_tests.py	2008-02-27 20:08:48 UTC (rev 5088)
@@ -25,7 +25,7 @@
         self.assert_(all(len(item) == 2 for item in req.items()))
-class TestListGenerator(unittest.TestCase):
+class TestPageGenerator(unittest.TestCase):
 
     def setUp(self):
         self.gen = api.PageGenerator(site=mysite, generator="links",
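To exercise the renamed test case, a sketch along these lines should work, assuming the tests package is importable and that api_tests sets up mysite on import (the invocation itself is not part of this commit):

    # Hypothetical test runner for the renamed TestPageGenerator case.
    import unittest
    from pywikibot.tests import api_tests

    suite = unittest.TestLoader().loadTestsFromTestCase(api_tests.TestPageGenerator)
    unittest.TextTestRunner(verbosity=2).run(suite)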