Revision: 4400
Author: russblau
Date: 2007-10-02 21:57:44 +0000 (Tue, 02 Oct 2007)
Log Message:
-----------
Major docstring cleanup (in progress); see PEP 8 and PEP 257. In the process, some minor
code changes, including (1) removed unused NoSuchEntity exception; (2) moved
ignore_bot_templates to config.py; (3) changed defaults for several methods to
throttle=True; (4) removed redundant getFileLinks method (ImagePage.usingPages does the
same thing better); (5) fixed a remaining regex bug in replaceCategoryInPlace(); (6) split
Page.previousVersion() method into separate .previousRevision() and .getOldVersion()
methods. Made conforming changes in other files.
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/nowcommons.py
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-10-02 12:46:59 UTC (rev 4399)
+++ trunk/pywikipedia/config.py 2007-10-02 21:57:44 UTC (rev 4400)
@@ -78,6 +78,9 @@
sysopnames[familyName] = {}
disambiguation_comment[familyName] = {}
+# Set to True to override the {{bots}} exclusion protocol (at your own risk!)
+ignore_bot_templates = False
+
############## USER INTERFACE SETTINGS ##############
# The encoding that's used in the user's console, i.e. how strings are encoded
Modified: trunk/pywikipedia/nowcommons.py
===================================================================
--- trunk/pywikipedia/nowcommons.py 2007-10-02 12:46:59 UTC (rev 4399)
+++ trunk/pywikipedia/nowcommons.py 2007-10-02 21:57:44 UTC (rev 4400)
@@ -134,17 +134,23 @@
if not filenameOnCommons:
wikipedia.output(u'NowCommons template not found.')
continue
- commonsImagePage = wikipedia.ImagePage(commons, 'Image:%s' %
filenameOnCommons)
+ commonsImagePage = wikipedia.ImagePage(commons,
+ 'Image:%s' % filenameOnCommons)
if len(localImagePage.getFileVersionHistory()) > 1:
- wikipedia.output(u'This image has a version history. Please
manually delete it after making sure that the old versions aren\'t worth
keeping.')
+ wikipedia.output(u"""\
+This image has a version history. Please delete it manually after making sure
+that the old versions aren't worth keeping.""")
continue
if localImagePage.titleWithoutNamespace() !=
commonsImagePage.titleWithoutNamespace():
- usingPages = localImagePage.usingPages()
+ usingPages = list(localImagePage.usingPages())
if usingPages and usingPages != [localImagePage]:
- wikipedia.output('%s is still used in %i pages. Please change
them manually.' % (localImagePage.title(), len(localImagePage.usingPages())))
+ wikipedia.output(
+ '%s is still used in %i pages. Please change them manually.'
+ % (localImagePage.title(), len(usingPages)))
continue
else:
- wikipedia.output('No page is using %s anymore.' %
localImagePage.title())
+ wikipedia.output('No page is using %s anymore.'
+ % localImagePage.title())
commonsText = commonsImagePage.get()
if md5 == commonsImagePage.getFileMd5Sum():
wikipedia.output(u'The image is identical to the one on
Commons.')
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2007-10-02 12:46:59 UTC (rev 4399)
+++ trunk/pywikipedia/pagegenerators.py 2007-10-02 21:57:44 UTC (rev 4400)
@@ -127,8 +127,8 @@
for page in site.newpages(number=number, get_redirect=get_redirect, repeat=repeat):
yield page[0]
-def FileLinksGenerator(referredPage):
- for page in referredPage.getFileLinks():
+def FileLinksGenerator(referredImagePage):
+ for page in referredImagePage.usingPages():
yield page
def ImagesPageGenerator(pageWithImages):
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-10-02 12:46:59 UTC (rev 4399)
+++ trunk/pywikipedia/wikipedia.py 2007-10-02 21:57:44 UTC (rev 4400)
@@ -6,72 +6,30 @@
late August 2004)
Classes:
-Page: A MediaWiki page
- __init__ : Page(Site, Title) - the page with title Title on wikimedia
site Site
- title : The name of the page, in a form suitable for an interwiki
link
- urlname : The name of the page, in a form suitable for a URL
- titleWithoutNamespace : The name of the page, with the namespace part removed
- section : The section of the page (the part of the name after
'#')
- sectionFreeTitle : The name without the section part
- aslink : The name of the page in the form [[Title]] or [[lang:Title]]
- site : The wiki this page is in
- encoding : The encoding of the page
- isAutoTitle : If the title is a well known, auto-translatable title
- autoFormat : Returns (dictName, value), where value can be a year, date,
etc.,
- and dictName is 'YearBC', 'December', etc.
- isCategory : True if the page is a category, false otherwise
- isImage : True if the page is an image, false otherwise
+ Page(site, title): A page on a MediaWiki site
+ ImagePage(site, title): An image descriptor Page
+ Site(lang, fam): A MediaWiki site
+ Throttle: Limits reading and writing rates
- get (*) : The text of the page
- exists (*) : True if the page actually exists, false otherwise
- isRedirectPage (*) : True if the page is a redirect, false otherwise
- isEmpty (*) : True if the page has 4 characters or less content, not
- counting interwiki and category links
- botMayEdit (*) : True if bot is allowed to edit page
- interwiki (*) : The interwiki links from the page (list of Pages)
- categories (*) : The categories the page is in (list of Pages)
- linkedPages (*) : The normal pages linked from the page (list of Pages)
- imagelinks (*) : The pictures on the page (list of ImagePages)
- templates (*) : All templates referenced on the page (list of strings)
- getRedirectTarget (*) : The page the page redirects to
- isDisambig (*) : True if the page is a disambiguation page
- getReferences : List of pages linking to the page
- namespace : The namespace in which the page is
- permalink (*) : The url of the permalink of the current version
- move : Move the page to another title
- put(newtext) : Saves the page
- put_async(newtext) : Queues the page to be saved asynchronously
- delete : Deletes the page (requires being logged in)
-
- (*) : This loads the page if it has not been loaded before; permalink might
- even reload it if it has been loaded before
-
-Site: a MediaWiki site
- messages : There are new messages on the site
- forceLogin() : Does not continue until the user has logged in to
- the site
- getUrl() : Retrieve an URL from the site
- mediawiki_message(key): Retrieve the text of the MediaWiki message with
- the key "key"
- has_mediawiki_message(key) : True if this site defines a MediaWiki message
- with the key "key"
- Special pages:
- Dynamic pages:
- allpages(): Special:Allpages
- newpages(): Special:Newpages
- longpages(): Special:Longpages
- shortpages(): Special:Shortpages
- categories(): Special:Categories
-
- Cached pages:
- deadendpages(): Special:Deadendpages
- ancientpages(): Special:Ancientpages
- lonelypages(): Special:Lonelypages
- uncategorizedcategories(): Special:Uncategorizedcategories
- uncategorizedpages(): Special:Uncategorizedpages
- uncategorizedimages(): Special:Uncategorizedimages
- unusedcategories(): Special:Unusuedcategories
-
+Exceptions:
+ Error: Base class for all exceptions in this module
+ NoUsername: Username is not in user-config.py
+ NoPage: Page does not exist on the wiki
+ IsRedirectPage: Page is a redirect page
+ IsNotRedirectPage: Page is not a redirect page
+ LockedPage: Page is locked
+ LockedNoPage: Page does not exist, and creating it is not
+ possible because of a lock (subclass of NoPage and
+ LockedPage)
+ SectionError: The section specified in the Page title does not exist
+ PageNotSaved: Saving the page has failed
+ EditConflict: PageNotSaved due to edit conflict while uploading
+ SpamfilterError: PageNotSaved due to MediaWiki spam filter
+ ServerError: Got unexpected response from wiki server
+ BadTitle: Server responded with BadTitle.
+ UserBlocked: Client's username or IP has been blocked
+ PageNotFound: Page not found in list
+
Other functions:
getall(): Load pages via Special:Export
setAction(text): Use 'text' instead of "Wikipedia python library" in
@@ -140,7 +98,6 @@
except NameError:
from sets import Set as set
-
# Check Unicode support (is this a wide or narrow python build?)
# See
http://www.python.org/doc/peps/pep-0261/
try:
@@ -149,16 +106,6 @@
except ValueError:
WIDEBUILD = False
-
-# Local settings
-
-# If ignore_bot_templates is True, the bot will always ignore {{bots}}
-# and {{nobots}} templates - botMayEdit() will always return True.
-# In the default (False) state, it will honor these directives and
-# refuse to save pages that forbid it from editing.
-ignore_bot_templates = False
-
-
# Local exceptions
class Error(Exception):
@@ -182,9 +129,6 @@
class LockedNoPage(NoPage, LockedPage):
"""Page does not exist, and creating it is not possible because of a
lock."""
-class NoSuchEntity(ValueError):
- """No entity exist for this character"""
-
class SectionError(Error):
"""The section specified by # does not exist"""
@@ -206,13 +150,13 @@
class BadTitle(Error):
"""Server responded with BadTitle."""
-# UserBlocked exceptions should in general not be catched. If the bot has been
-# blocked, the bot operator has possibly done a mistake and should take care of
-# the issue before continuing.
+# UserBlocked exceptions should in general not be caught. If the bot has
+# been blocked, the bot operator should address the reason for the block
+# before continuing.
class UserBlocked(Error):
"""Your username or IP has been blocked"""
-class PageNotFound(Exception):
+class PageNotFound(Error):
"""Page not found in list"""
SaxError = xml.sax._exceptions.SAXParseException
@@ -220,19 +164,89 @@
# Pre-compile re expressions
reNamespace = re.compile("^(.+?) *: *(.*)$")
-# The most important thing in this whole module: The Page class
+
class Page(object):
- """A page on the wiki."""
- def __init__(self, site, title, insite = None, defaultNamespace = 0):
- """
- Constructor. Normally called with two arguments:
- Parameters:
- 1) The wikimedia site on which the page resides
- 2) The title of the page as a unicode string
+ """Page: A MediaWiki page
- The argument insite can be specified to help decode
- the name; it is the wikimedia site where this link was found.
- """
+ Constructor has two required parameters:
+ 1) The wikimedia Site on which the page resides
+ 2) The title of the page as a unicode string
+
+ Optional parameters:
+ insite - the wikimedia Site where this link was found (to help decode
+ interwiki links)
+ defaultNamespace - A namespace to use if the link does not contain one
+
+ Methods available:
+
+ title : The name of the page, including namespace and
+ section if any
+ urlname : Title, in a form suitable for a URL
+ namespace : The namespace in which the page is found
+ titleWithoutNamespace : Title, with the namespace part removed
+ section : The section of the page (the part of the title
+ after '#', if any)
+ sectionFreeTitle : Title, without the section part
+ aslink : Title in the form [[Title]] or [[lang:Title]]
+ site : The wiki this page is in
+ encoding : The encoding of the page
+ isAutoTitle : Title can be translated using the autoFormat method
+ autoFormat : Auto-format certain dates and other standard
+ format page titles
+ isCategory : True if the page is a category
+ isDisambig (*) : True if the page is a disambiguation page
+ isImage : True if the page is an image
+ isRedirectPage (*) : True if the page is a redirect, false otherwise
+ getRedirectTarget (*) : The page the page redirects to
+ isTalkPage : True if the page is in any "talk" namespace
+ toggleTalkPage : Return the talk page (if this is one, return the
+ non-talk page)
+ get (*) : The text of the page
+ latestRevision (*) : The page's current revision id
+ userName : Last user to edit page
+ isIpEdit : True if last editor was unregistered
+ editTime : Timestamp of the last revision to the page
+ previousRevision (*) : The revision id of the previous version
+ permalink (*) : The url of the permalink of the current version
+ getOldVersion(id) (*) : The text of a previous version of the page
+ getVersionHistory : Load the version history information from wiki
+ getVersionHistoryTable: Create a wiki table from the history data
+ fullVersionHistory : Return all past versions including wikitext
+ contributingUsers : Return set of users who have edited page
+ exists (*) : True if the page actually exists, false otherwise
+ isEmpty (*) : True if the page has 4 characters or less content,
+ not counting interwiki and category links
+ interwiki (*) : The interwiki links from the page (list of Pages)
+ categories (*) : The categories the page is in (list of Pages)
+ linkedPages (*) : The normal pages linked from the page (list of
+ Pages)
+ imagelinks (*) : The pictures on the page (list of ImagePages)
+ templates (*) : All templates referenced on the page (list of
+ strings)
+ templatesWithParams(*): All templates on the page, with list of parameters
+ templatePages (*) : Page objects for all templates used on this page
+ isDisambig (*) : True if the page is a disambiguation page
+ getReferences : List of pages linking to the page
+ canBeEdited (*) : True if page is unprotected or user has edit
+ privileges
+ botMayEdit (*) : True if bot is allowed to edit page
+ put(newtext) : Saves the page
+ put_async(newtext) : Queues the page to be saved asynchronously
+ move : Move the page to another title
+ delete : Deletes the page (requires being logged in)
+ protect : Protect or unprotect a page (requires sysop status)
+ removeImage : Remove all instances of an image from this page
+ replaceImage : Replace all instances of an image with another
+ loadDeletedRevisions : Load all deleted versions of this page
+ getDeletedRevision : Return a particular deleted revision
+ markDeletedRevision : Mark a version to be undeleted, or not
+ undelete : Undelete past version(s) of the page
+
+ (*) : This loads the page if it has not been loaded before; permalink might
+ even reload it if it has been loaded before
+
+ """
+ def __init__(self, site, title, insite=None, defaultNamespace=0):
try:
# if _editrestriction is True, it means that the page has been found
# to have an edit restriction, but we do not know yet whether the
@@ -362,134 +376,136 @@
raise
def site(self):
- """The site of the page this Page refers to,
- without :"""
+ """Return the Site object for the wiki on which this Page
resides."""
return self._site
def encoding(self):
- """
- Returns the character encoding used on this page's wiki.
- """
+ """Return the character encoding used on this Page's wiki
Site."""
return self._site.encoding()
- def urlname(self):
- """The name of the page this Page refers to, in a form suitable
- for the URL of the page."""
- title = self.title(underscore = True)
- encodedTitle = title.encode(self.site().encoding())
- return urllib.quote(encodedTitle)
+ def title(self, underscore = False, savetitle = False):
+ """Return the title of this Page, as a Unicode string.
- def title(self, underscore = False, savetitle = False):
- """The name of this Page, as a Unicode string"""
+ If underscore is True, replace all ' ' characters with '_'.
+ If savetitle is True, try to quote all non-ASCII characters.
+ """
title = self._title
if savetitle: # Ensure there's no wiki syntax in the title
if title.find("''") > -1:
try:
title = urllib.quote(title).replace('%20',' ')
except KeyError:
- # We can't encode everything; to be on the safe side, we encode
nothing
+ # We can't encode everything; to be on the safe side,
+ # we encode nothing
pass
if underscore:
title = title.replace(' ', '_')
return title
- def titleWithoutNamespace(self, underscore = False):
- """
- Returns the name of the page without the namespace and without section.
- """
+ def titleWithoutNamespace(self, underscore=False):
+ """Return title of Page without namespace and without
section."""
if self.namespace() == 0:
- return self.title(underscore = underscore)
+ return self.sectionFreeTitle(underscore=underscore)
else:
- return self.sectionFreeTitle(underscore = underscore).split(':',
1)[1]
+ return self.sectionFreeTitle(underscore=underscore).split(':', 1)[1]
def section(self, underscore = False):
- """The name of the section this Page refers to. Sections are
- denominated by a # in the title(). If no section is referenced,
- None is returned."""
+ """Return the name of the section this Page refers to.
+
+ The section is the part of the title following a '#' character, if any.
+ If no section is present, return None.
+ """
return self._section
- # ln = self.title(underscore = underscore)
- # ln = re.sub('&#', '&hash;', ln)
- # if not '#' in ln:
- # return None
- # else:
- # hn = ln[ln.find('#') + 1:]
- # hn = re.sub('&hash;', '&#', hn)
- # return hn
- def sectionFreeTitle(self, underscore = False):
- sectionName = self.section(underscore = underscore)
- title = self.title(underscore = underscore)
+ def sectionFreeTitle(self, underscore=False):
+ """Return the title of this Page, without the section (if
any)."""
+ sectionName = self.section(underscore=underscore)
+ title = self.title(underscore=underscore)
if sectionName:
return title[:-len(sectionName)-1]
else:
return title
+ def urlname(self):
+ """Return the Page title encoded for use in an
URL."""
+ title = self.title(underscore = True)
+ encodedTitle = title.encode(self.site().encoding())
+ return urllib.quote(encodedTitle)
+
def __str__(self):
- """A console representation of the pagelink"""
+ """Return a console representation of the
pagelink."""
return self.aslink().encode(config.console_encoding, 'replace')
def __repr__(self):
- """A more complete string representation"""
+ """Return a more complete string
representation."""
return "%s{%s}" % (self.__class__.__name__, str(self))
- def aslink(self, forceInterwiki = False, textlink=False):
- """
- A string representation in the form of a link. The link will
- be an interwiki link if needed.
+ def aslink(self, forceInterwiki=False, textlink=False):
+ """Return a string representation in the form of a wikilink.
- If you set forceInterwiki to True, the link will have the format
- of an interwiki link even if it points to the home wiki.
+ If forceInterwiki is True, return an interwiki link even if it
+ points to the home wiki. If False, return an interwiki link only if
+ needed.
- If you set textlink to True, the link will always appear in text
- form (that is, links to the Category: and Image: namespaces will
- be preceded by a : character).
-
- Note that the family is never included.
+ If textlink is True, always return a link in text form (that
+ is, links to the Category: and Image: namespaces will be preceded by
+ a : character). (Not needed if forceInterwiki is True.)
+
"""
if forceInterwiki or self.site() != getSite():
if self.site().family != getSite().family:
- return '[[%s:%s:%s]]' % (self.site().family.name,
self.site().lang, self.title(savetitle=True))
+ return u'[[%s:%s:%s]]' % (self.site().family.name,
self.site().lang, self.title(savetitle=True))
else:
- return '[[%s:%s]]' % (self.site().lang,
self.title(savetitle=True))
- elif textlink and self.namespace() in (6, 14): # Image: or Category:
- return '[[:%s]]' % self.title()
+ return u'[[%s:%s]]' % (self.site().lang,
self.title(savetitle=True))
+ elif textlink and (self.isImage() or self.isCategory()):
+ return u'[[:%s]]' % self.title()
else:
- return '[[%s]]' % self.title()
+ return u'[[%s]]' % self.title()
- def isAutoTitle(self):
- """If the title is a well known, auto-translatable title
+ def autoFormat(self):
+ """Return (dictName, value) if title is in date.autoFormat
dictionary.
+
+ Value can be a year, date, etc., and dictName is 'YearBC',
+ 'Year_December', or another dictionary name. Please note that two
+ entries may have exactly the same autoFormat, but be in two
+ different namespaces, as some sites have categories with the
+ same names. Regular titles return (None, None).
+
"""
- return self.autoFormat()[0] is not None
-
- def autoFormat(self):
- """Returns (dictName, value), where value can be a year, date,
etc.,
- and dictName is 'YearBC', 'Year_December', or another
dictionary name.
- Please note that two entries may have exactly the same autoFormat,
- but be in two different namespaces, as some sites have categories with the
same names.
- Regular titles return (None,None)."""
if not hasattr(self, '_autoFormat'):
import date
- _autoFormat = date.getAutoFormat(self.site().language(),
self.titleWithoutNamespace())
+ _autoFormat = date.getAutoFormat(self.site().language(),
+ self.titleWithoutNamespace())
return _autoFormat
+ def isAutoTitle(self):
+ """Return True if title of this Page is in the autoFormat
dictionary."""
+ return self.autoFormat()[0] is not None
- def get(self, force = False, get_redirect=False, throttle = True, sysop = False,
nofollow_redirects=False, change_edit_time = True):
- """The wiki-text of the page. This will retrieve the page if it
has not
- been retrieved yet. This can raise the following exceptions that
- should be caught by the calling code:
+ def get(self, force=False, get_redirect=False, throttle=True,
+ sysop=False, nofollow_redirects=False, change_edit_time=True):
+ """Return the wiki-text of the page.
+ This will retrieve the page from the server if it has not been
+ retrieved yet, or if force is True. This can raise the following
+ exceptions that should be caught by the calling code:
+
NoPage: The page does not exist
-
IsRedirectPage: The page is a redirect. The argument of the
exception is the title of the page it redirects to.
-
SectionError: The subject does not exist on a page with a # link
- Set get_redirect to True to follow redirects rather than raise an exception.
- Set force to True to force a reload of all page attributes, including
errors.
- Set nofollow_redirects to True to not follow redirects but obey all other
exceptions.
- Set change_version_date to False if you have already loaded the page before
and
- do not check this version for changes before saving
+ If get_redirect is True, return the redirect text and save the
+ target of the redirect, do not raise an exception.
+ If force is True, reload all page attributes, including
+ errors.
+ If nofollow_redirects is True, ignore redirects entirely (do not
+ raise an exception for redirects but do not mark the page as a
+ redirect or save the redirect target page).
+ If change_edit_time is False, do not check this version for changes
+ before saving. This should be used only if the page has been loaded
+ previously.
+
"""
# NOTE: The following few NoPage exceptions could already be thrown at
# the Page() constructor. They are raised here instead for convenience,
@@ -497,7 +513,7 @@
# get(), but not for such raised by the constructor.
# \ufffd represents a badly encoded character, the other characters are
# disallowed by MediaWiki.
- for illegalChar in ['#', '<', '>', '[',
']', '|', '{', '}', '\n', u'\ufffd']:
+ for illegalChar in u'#<>[]|{}\n\ufffd':
if illegalChar in self.sectionFreeTitle():
if verbose:
output(u'Illegal character in %s!' % self.aslink())
@@ -505,11 +521,12 @@
if self.namespace() == -1:
raise NoPage('%s is in the Special namespace!' % self.aslink())
if self.site().isInterwikiLink(self.title()):
- raise NoPage('%s is not a local page on %s!' % (self.aslink(),
self.site()))
+ raise NoPage('%s is not a local page on %s!'
+ % (self.aslink(), self.site()))
if force:
# When forcing, we retry the page no matter what. Old exceptions
# and contents do not apply any more.
- for attr in
['_redirarg','_getexception','_contents']:
+ for attr in ['_redirarg', '_getexception',
'_contents']:
if hasattr(self, attr):
delattr(self,attr)
else:
@@ -526,7 +543,7 @@
# Make sure we did try to get the contents once
if not hasattr(self, '_contents'):
try:
- self._contents, self._isWatched, self.editRestriction =
self.getEditPage(get_redirect = get_redirect, throttle = throttle, sysop = sysop,
nofollow_redirects=nofollow_redirects)
+ self._contents, self._isWatched, self.editRestriction =
self._getEditPage(get_redirect = get_redirect, throttle = throttle, sysop = sysop,
nofollow_redirects=nofollow_redirects)
hn = self.section()
if hn:
m = re.search("=+ *%s *=+" % hn, self._contents)
@@ -548,15 +565,19 @@
raise
return self._contents
- def getEditPage(self, get_redirect=False, throttle = True, sysop = False, oldid =
None, nofollow_redirects = False, change_edit_time = True):
- """
- Get the contents of the Page via the edit page.
+ def _getEditPage(self, get_redirect=False, throttle=True, sysop=False,
+ oldid=None, nofollow_redirects=False,
+ change_edit_time=True):
+ """Get the contents of the Page via the edit page.
+
Do not use this directly, use get() instead.
Arguments:
+ oldid - Retrieve an old revision (by id), not the current one
get_redirect - Get the contents, even if it is a redirect page
- This routine returns a unicode string containing the wiki text.
+ This method returns a 3-tuple containing the raw wiki text as a
+ unicode string, the watchlist status, and any edit restrictions.
"""
isWatched = False
editRestriction = None
@@ -657,14 +678,15 @@
if matchWatching:
isWatched = True
# Now process the contents of the textarea
- m = self.site().redirectRegex().match(text[i1:i2])
if self._editTime == "0":
if verbose:
output(u"DBG> page may be locked?!")
editRestriction = 'sysop'
+ m = self.site().redirectRegex().match(text[i1:i2])
if m:
+ # page text matches the redirect pattern
if self.section():
- redirtarget = "%s#%s"%(m.group(1),self.section())
+ redirtarget = "%s#%s" % (m.group(1), self.section())
else:
redirtarget = m.group(1)
if get_redirect:
@@ -686,26 +708,45 @@
return x, isWatched, editRestriction
+ def getOldVersion(self, oldid, force=False, get_redirect=False,
+ throttle=True, sysop=False, nofollow_redirects=False,
+ change_edit_time=True):
+ """Return text of an old revision of this page; same options as
get()."""
+ # TODO: should probably check for bad pagename, NoPage, and other
+ # exceptions that would prevent retrieving text, as get() does
+ return self._getEditPage(
+ get_redirect=get_redirect, throttle=throttle,
+ sysop=sysop, oldid=oldid,
+ nofollow_redirects=nofollow_redirects,
+ change_edit_time=change_edit_time
+ )[0]
+
def permalink(self):
- """
- Get the permalink page for this page
- """
- return "%s://%s%s&oldid=%i"%(self.site().protocol(),
self.site().hostname(), self.site().get_address(self.title()), self.latestRevision())
+ """Return the permalink URL for current revision of this
page."""
+ return "%s://%s%s&oldid=%i" % (self.site().protocol(),
+ self.site().hostname(),
+ self.site().get_address(self.title()),
+ self.latestRevision())
def latestRevision(self):
- """
- Get the latest revision for this page
- """
+ """Return the latest revision id for this page."""
if not self._permalink:
- # When we get the page with getall, the permalink is received automatically
+ # When we get the page with getall, the permalink is received
+ # automatically
getall(self.site(),[self],force=True)
return int(self._permalink)
+ def previousRevision(self):
+ """Return the revision id for the previous revision of this
Page."""
+ vh = self.getVersionHistory(revCount=2)
+ return vh[1][0]
+
def exists(self):
- """
- True if the page exists, even if it's a redirect.
+ """Return True if page exists on the wiki, even if it's a
redirect.
- If the title includes a section, False if this section isn't found.
+ If the title includes a section, return False if this section isn't
+ found.
+
"""
try:
self.get()
@@ -718,7 +759,7 @@
return True
def isRedirectPage(self):
- """True if the page is a redirect page, False if not or not
existing"""
+ """Return True if this is a redirect, False if not or not
existing."""
try:
self.get()
except NoPage:
@@ -730,11 +771,12 @@
return False
def isEmpty(self):
+ """Return True if the page text has less than 4 characters.
+
+ Character count ignores language links and category links.
+ Can raise the same exceptions as get().
+
"""
- True if the page has less than 4 characters, except for
- language links and category links, False otherwise.
- Can raise the same exceptions as get()
- """
txt = self.get()
txt = removeLanguageLinks(txt)
txt = removeCategoryLinks(txt, site = self.site())
@@ -744,21 +786,25 @@
return False
def isTalkPage(self):
+ """Return True if this page is in any talk
namespace."""
ns = self.namespace()
return ns >= 0 and ns % 2 == 1
def botMayEdit(self):
- """
- True if page doesn't contain {{bots}} or {{nobots}} or
- contains them and active bot is allowed or not allowed
- to edit said page
+ """Return True if this page allows bots to edit it.
+
+ This will be True if the page doesn't contain {{bots}} or
+ {{nobots}}, or it contains them and the active bot is allowed to
+ edit this page. (This method is only useful on those sites that
+ recognize the bot-exclusion protocol; on other sites, it will always
+ return True.)
- The framework enforces this restriction by default. It is possible to
- override this by setting wikipedia.ignore_bot_templates=True or using
- page.put(force=True).
+        The framework enforces this restriction by default. It is possible
+        to override this by setting ignore_bot_templates=True in
+        user-config.py, or using page.put(force=True).
+
"""
- global ignore_bot_templates
- if ignore_bot_templates: #Check the "master ignore switch"
+ if config.ignore_bot_templates: #Check the "master ignore switch"
return True
try:
@@ -798,41 +844,53 @@
return True
def userName(self):
+ """Return name or IP address of last user to edit page.
+
+ Returns None unless page was retrieved with getAll().
+
+ """
return self._userName
def isIpEdit(self):
+ """Return True if last editor was unregistered.
+
+ Returns None unless page was retrieved with getAll().
+
+ """
return self._ipedit
def editTime(self):
+ """Return timestamp (in MediaWiki format) of last revision to
page.
+
+ Returns None if last edit time is unknown.
+
+ """
return self._editTime
def namespace(self):
- """Gives the number of the namespace of the page. Does not work
for
- all namespaces in all languages, only when defined in family.py.
- If not defined, it will return 0 (the main namespace)"""
+ """Return the number of the namespace of the page.
+
+ Only recognizes those namespaces defined in family.py.
+ If not defined, it will return 0 (the main namespace).
+
+ """
return self._namespace
- # t=self.sectionFreeTitle()
- # p=t.split(':')
- # if p[1:]==[]:
- # return 0
- # for namespaceNumber in self.site().family.namespaces.iterkeys():
- # if p[0]==self.site().namespace(namespaceNumber):
- # return namespaceNumber
- # return 0
def isCategory(self):
- """
- True if the page is a Category, false otherwise.
- """
+ """Return True if the page is a Category, False
otherwise."""
return self.namespace() == 14
def isImage(self):
- """
- True if the page is an image description page, false otherwise.
- """
+ """Return True if this is an image description page, False
otherwise."""
return self.namespace() == 6
def isDisambig(self):
+ """Return True if this is a disambiguation page, False otherwise.
+
+ Relies on the presence of specific templates, identified in the Family
+ file, to identify disambiguation pages.
+
+ """
if not hasattr(self, '_isDisambig'):
locdis = self.site().family.disambig( self._site.lang )
@@ -853,10 +911,9 @@
def getReferences(self,
follow_redirects=True, withTemplateInclusion=True,
onlyTemplateInclusion=False, redirectsOnly=False):
- """
- Yield all pages that link to the page. If you need a full list of
- referring pages, use this:
+ """Yield all pages that link to the page.
+ If you need a full list of referring pages, use this:
pages = [page for page in s.getReferences()]
Parameters:
@@ -867,6 +924,7 @@
* onlyTemplateInclusion - if True, only returns pages where self is
used as a template.
* redirectsOnly - if True, only returns redirects to self.
+
"""
# Temporary bug-fix while researching more robust solution:
if config.special_page_limit > 999:
@@ -913,12 +971,10 @@
def _parse_reflist(self, reflist,
follow_redirects=True, withTemplateInclusion=True,
onlyTemplateInclusion=False, redirectsOnly=False):
- """
- For internal use only
+ """For internal use only
Parse a "Special:Whatlinkshere" list of references and yield Page
- objects that meet the criteria
- (used by getReferences)
+ objects that meet the criteria (used by getReferences)
"""
for link in reflist("li", recursive=False):
title = link.a.string
@@ -953,92 +1009,23 @@
onlyTemplateInclusion, redirectsOnly):
yield p
-
- def getFileLinks(self):
- """
- Yield all pages that link to the page. If you need a full list of
- referring pages, use this:
-
- pages = [page for page in s.getReferences()]
-
- """
- site = self.site()
- #path = site.references_address(self.urlname())
- path = site.get_address(self.urlname())
-
- delay = 1
-
- # NOTE: this code relies on the way MediaWiki 1.6 formats the
- # "Whatlinkshere" special page; if future versions change the
- # format, they may break this code.
- if self.site().versionnumber() >= 5:
- startmarker = u"<!-- start content -->"
- endmarker = u"<!-- end content -->"
- else:
- startmarker = u"<body "
- endmarker = "printfooter"
- listitempattern = re.compile(r"<li><a
href=.*>(?P<title>.*)</a></li>")
- # to tell the previous and next link apart, we rely on the closing ) at the end
of the "previous" label.
- more = True
-
- while more:
- more = False #Kill after one loop because MediaWiki will only display up to
the first 500 File links.
- fileLinks = set() # use a set to avoid duplications
- output(u'Getting references to %s' % self.aslink())
- while True:
- txt = site.getUrl(path)
- # trim irrelevant portions of page
- try:
- start = txt.index(startmarker) + len(startmarker)
- end = txt.index(endmarker)
- except ValueError:
- output(u"Invalid page received from server.... Retrying in %i
minutes." % delay)
- time.sleep(delay * 60.)
- delay *= 2
- if delay > 30:
- delay = 30
- continue
- txt = txt[start:end]
- break
- try:
- start = txt.index(u"<ul>")
- end = txt.rindex(u"</ul>")
- except ValueError:
- # No incoming links found on page
- continue
- txt = txt[start:end+5]
-
- txtlines = txt.split(u"\n")
- for num, line in enumerate(txtlines):
- if line == u"</ul>":
- # end of list of references to redirect page
- continue
- if line == u"</li>":
- continue
- lmatch = listitempattern.search(line)
- if lmatch:
- fileLinks.add(lmatch.group("title"))
- if lmatch is None:
- output(u"DBG> Unparsed line:")
- output(u"(%i) %s" % (num, line))
- fileLinks = list(fileLinks)
- fileLinks.sort()
- for fileLink in fileLinks:
- # create Page objects
- yield Page(site, fileLink)
-
def put_async(self, newtext,
comment=None, watchArticle=None, minorEdit=True, force=False,
callback=None):
- """Asynchronous version of put (takes the same arguments), which
- places pages on a queue to be saved by a daemon thread.
- All arguments are the same as for .put(), except --
- callback: a callable object that will be called after the page put
- operation; this object must take two arguments:
- (1) a Page object, and (2) an exception instance, which
- will be None if the page was saved successfully.
- The callback is intended to be used by bots that need to keep track
- of which saves were successful.
+ """Put page on queue to be saved to wiki asynchronously.
+
+ Asynchronous version of put (takes the same arguments), which places
+ pages on a queue to be saved by a daemon thread. All arguments are
+ the same as for .put(), except --
+
+ callback: a callable object that will be called after the page put
+ operation; this object must take two arguments:
+ (1) a Page object, and (2) an exception instance, which
+ will be None if the page was saved successfully.
+
+ The callback is intended to be used by bots that need to keep track
+ of which saves were successful.
+
"""
try:
page_put_queue.mutex.acquire()
@@ -1053,11 +1040,16 @@
def put(self, newtext, comment=None, watchArticle=None, minorEdit=True,
force=False):
- """Replace the new page with the contents of the first argument.
- The second argument is a string that is to be used as the
- summary for the modification
+ """Save the page with the contents of the first argument as the
text.
- If watchArticle is None, leaves the watchlist status unchanged.
+ Optional parameters:
+ comment: a unicode string that is to be used as the summary for
+ the modification.
+ watchArticle: a bool, add or remove this Page to/from bot user's
+ watchlist (if None, leave watchlist status unchanged)
+ minorEdit: mark this edit as minor if True
+ force: ignore botMayEdit() setting
+
"""
# Fetch a page to get an edit token. If we already have
# fetched a page, this will do nothing, because get() is cached.
@@ -1119,17 +1111,15 @@
# of Bordeaux
if self.site().lang == 'eo':
newtext = doubleXForEsperanto(newtext)
- return self.putPage(newtext, comment, watchArticle, minorEdit, newPage,
self.site().getToken(sysop = sysop), sysop = sysop)
+ return self._putPage(newtext, comment, watchArticle, minorEdit, newPage,
self.site().getToken(sysop = sysop), sysop = sysop)
- def putPage(self, text, comment=None, watchArticle=False, minorEdit=True,
+ def _putPage(self, text, comment=None, watchArticle=False, minorEdit=True,
newPage=False, token=None, gettoken=False, sysop=False):
- """
- Upload 'text' as new contents for this Page by filling out the edit
- page.
+ """Upload 'text' as new content of Page by filling out the
edit form.
Don't use this directly, use put() instead.
+
"""
-
newTokenRetrieved = False
if self.site().versionnumber() >= 4:
if gettoken or not token:
@@ -1267,7 +1257,7 @@
if not sysop:
self.site().forceLogin(sysop = True)
output(u'Page is locked, retrying using sysop
account.')
- return self.putPage(text, comment, watchArticle,
+ return self._putPage(text, comment, watchArticle,
minorEdit, newPage, token=None,
gettoken=True, sysop=True)
except NoUsername:
@@ -1275,7 +1265,7 @@
elif not newTokenRetrieved and "<textarea" in data:
# We might have been using an outdated token
output(u"Changing page has failed. Retrying.")
- return self.putPage(text = text, comment = comment,
+ return self._putPage(text = text, comment = comment,
watchArticle = watchArticle, minorEdit = minorEdit, newPage =
newPage,
token = None, gettoken = True, sysop = sysop)
else:
@@ -1290,11 +1280,12 @@
return response.status, response.reason, data
def canBeEdited(self):
+ """Return bool indicating whether this page can be edited.
+
+ This returns True if and only if:
+ * page is unprotected, and bot has an account for this site, or
+ * page is protected, and bot has a sysop account for this site.
"""
- Returns True iff:
- * the page is unprotected, and we have an account for this site, or
- * the page is protected, and we have a sysop account for this site.
- """
if self.editRestriction:
userdict = config.sysopnames
else:
@@ -1308,10 +1299,12 @@
return False
def toggleTalkPage(self):
- """
+ """Return the other member of the article-talk page pair for this
Page.
+
If self is a talk page, returns the associated content page; otherwise,
- returns the associated talk page. Returns None if self is a special
- page.
+ returns the associated talk page.
+ Returns None if self is a special page.
+
"""
ns = self.namespace()
if ns < 0: # Special page
@@ -1325,36 +1318,45 @@
return Page(self.site(), self.site().namespace(ns + 1) + ':' +
self.titleWithoutNamespace())
def interwiki(self):
- """A list of interwiki links in the page. This will retrieve
- the page text to do its work, so it can raise the same exceptions
- that are raised by the get() method.
+ """Return a list of interwiki links in the page text.
- The return value is a list of Page objects for each of the
- interwiki links in the page text.
+ This will retrieve the page to do its work, so it can raise
+ the same exceptions that are raised by the get() method.
+
+ The return value is a list of Page objects for each of the
+ interwiki links in the page text.
+
"""
result = []
- ll = getLanguageLinks(self.get(), insite = self.site(), pageLink =
self.aslink())
+ ll = getLanguageLinks(self.get(), insite=self.site(),
+ pageLink=self.aslink())
for newSite, newPage in ll.iteritems():
- for pagenametext in
self.site().family.pagenamecodes(self.site().language()):
- newTitle = newPage.title().replace("{{" + pagenametext +
"}}", self.title())
+ for pagenametext in self.site().family.pagenamecodes(
+ self.site().language()):
+ newTitle = newPage.title().replace(
+ "{{" + pagenametext + "}}",
self.title())
try:
- result.append(self.__class__(newSite, newTitle, insite = self.site()))
+ result.append(
+ self.__class__(newSite, newTitle, insite=self.site()))
except UnicodeError:
- output(u"ERROR: link from %s to [[%s:%s]] is invalid
encoding?!" % (self.aslink(), newSite, newTitle))
- except NoSuchEntity:
- output(u"ERROR: link from %s to [[%s:%s]] contains invalid
character?!" % (self.aslink(), newSite, newTitle))
+ output(
+ u"ERROR: link from %s to [[%s:%s]] is in an invalid encoding?!"
+ % (self.aslink(), newSite, newTitle))
except ValueError:
- output(u"ERROR: link from %s to [[%s:%s]] contains invalid unicode
reference?!" % (self.aslink(), newSite, newTitle))
+ output(
+ u"ERROR: link from %s to [[%s:%s]] contains invalid unicode reference?!"
+ % (self.aslink(), newSite, newTitle))
return result
def categories(self, nofollow_redirects=False):
- """
- A list of categories that the article is in. This will retrieve
- the page text to do its work, so it can raise the same exceptions
- that are raised by the get() method.
+ """Return a list of categories that the article is in.
+ This will retrieve the page text to do its work, so it can raise
+ the same exceptions that are raised by the get() method.
+
The return value is a list of Category objects, one for each of the
category links in the page text.
+
"""
try:
category_links_to_return =
getCategoryLinks(self.get(nofollow_redirects=nofollow_redirects), self.site())
@@ -1363,8 +1365,7 @@
return category_links_to_return
def __cmp__(self, other):
- """Pseudo method to be able to use equality and inequality tests
on
- Page objects"""
+ """Test for equality and inequality of Page
objects"""
if not isinstance(other, Page):
# especially, return -1 if other is None
return -1
@@ -1375,19 +1376,20 @@
return cmp(owntitle, othertitle)
def __hash__(self):
- """Pseudo method that makes it possible to store Page objects as
- keys in hash-tables. This relies on the fact that the string
- representation of an instance can not change after the construction.
- """
+ # Pseudo method that makes it possible to store Page objects as keys
+ # in hash-tables. This relies on the fact that the string
+ # representation of an instance can not change after the construction.
return hash(str(self))
def linkedPages(self):
- """Gives the normal (not-interwiki, non-category) pages the page
- links to, as a list of Page objects
+ """Return a list of Pages that this Page links to.
+
+ Excludes interwiki and category links.
"""
result = []
try:
- thistxt = removeLanguageLinks(self.get(get_redirect=True), self.site())
+ thistxt = removeLanguageLinks(self.get(get_redirect=True),
+ self.site())
except NoPage:
raise
#return []
@@ -1416,13 +1418,15 @@
result.append(page)
return result
- def imagelinks(self, followRedirects = False, loose = False):
+ def imagelinks(self, followRedirects=False, loose=False):
+ """Return a list of ImagePage objects for images displayed on this
Page.
+
+ Includes images in galleries.
+ If loose is True, this will find anything that looks like it
+ could be an image. This is useful for finding, say, images that are
+ passed as parameters to templates.
+
"""
- Gives the images the page shows, as a list of ImagePage objects.
- This includes images in galleries.
- If loose is set to true, this will find anything that looks like it could be an
image.
- This is useful for finding, say, images that are passed as parameters to
templates.
- """
results = []
# Find normal images
for page in self.linkedPages():
@@ -1446,18 +1450,21 @@
return list(set(results))
def templates(self):
- """
- Gives a list of template names used on a page, as a list of strings.
+ """Return a list of strings containing template names used on this
Page.
+
Template parameters are ignored.
+
"""
return [template for (template, param) in self.templatesWithParams()]
def templatesWithParams(self):
+ """Return a list of templates used on this Page.
+
+ Return value is a list of tuples. There is one tuple for each use of
+ a template in the page, with the template name as the first entry
+ and a list of parameters as the second entry.
+
"""
- Gives a list of tuples. There is one tuple for each use of a template
- in the page, with the template name as the first entry and a list
- of parameters as the second entry.
- """
try:
thistxt = self.get()
except (IsRedirectPage, NoPage):
@@ -1481,17 +1488,19 @@
return result
def templatePages(self):
+ """Return a list of Page objects for templates used on the page.
+
+ Template parameters are ignored.
"""
- Gives a list of Page objects containing the templates used on the page. Template
parameters are ignored.
- """
- return [Page(self.site(), template, self.site(), 10) for template in
self.templates()]
+ return [Page(self.site(), template, self.site(), 10)
+ for template in self.templates()]
def getRedirectTarget(self):
- """
- If the page is a redirect page, gives the page it redirects to.
- Otherwise it will raise an IsNotRedirectPage exception.
+ """Return a Page object for the target this Page redirects to.
- This function can raise a NoPage exception.
+ If this page is not a redirect page, will raise an IsNotRedirectPage
+ exception. This method also can raise a NoPage exception.
+
"""
try:
self.get()
@@ -1499,22 +1508,23 @@
raise
except IsRedirectPage, arg:
if '|' in arg:
- warnings.warn("%s has a | character, this makes no sense",
Warning)
+ warnings.warn("%s has a | character, this makes no sense",
+ Warning)
return Page(self.site(), arg[0])
else:
raise IsNotRedirectPage(self)
- def getPreviousVersion(self):
- vh = self.getVersionHistory(revCount=2)
- oldid = vh[1][0]
- return self.getEditPage(oldid=oldid)[0]
+ def getVersionHistory(self, forceReload=False, reverseOrder=False,
+ getAll=False, revCount=500):
+ """Load the version history page and return history information.
- def getVersionHistory(self, forceReload = False, reverseOrder = False, getAll =
False, revCount = 500):
+ Return value is a list of tuples, where each tuple represents one
+ edit and is built of revision id, edit date/time, user name, and
+ edit summary. Starts with the most current revision, unless
+ reverseOrder is True. Defaults to getting the first revCount edits,
+ unless getAll is True.
+
"""
- Loads the version history page and returns a list of tuples, where each
- tuple represents one edit and is built of edit date/time, user name, and edit
- summary. Defaults to getting the first revCount edits.
- """
site = self.site()
# regular expression matching one edit in the version history.
@@ -1682,10 +1692,9 @@
return self._versionhistory[0:revCount]
return self._versionhistory
- def getVersionHistoryTable(self, forceReload = False, reverseOrder = False, getAll =
False, revCount = 500):
- """
- Returns the version history as a wiki table.
- """
+ def getVersionHistoryTable(self, forceReload=False, reverseOrder=False,
+ getAll=False, revCount=500):
+ """Return the version history as a wiki table."""
result = '{| border="1"\n'
result += '! oldid || date/time || username || edit summary\n'
for oldid, time, username, summary in self.getVersionHistory(forceReload =
forceReload, reverseOrder = reverseOrder, getAll = getAll, revCount = revCount):
@@ -1696,9 +1705,12 @@
def fullVersionHistory(self):
"""
- Returns all previous versions. Gives a list of tuples consisting of
- edit date/time, user name and content
+ Return all previous versions including wikitext.
+
+ Gives a list of tuples consisting of edit date/time, user name and
+ content
"""
+ # TODO: probably should return revision id, as well.
address = self.site().export_address()
predata = {
'action': 'submit',
@@ -1717,27 +1729,29 @@
data = data.encode(self.site().encoding())
get_throttle.setDelay(time.time() - now)
output = []
+ # TODO: parse XML using an actual XML parser instead of regex!
r =
re.compile("\<revision\>.*?\<timestamp\>(.*?)\<\/timestamp\>.*?\<(?:ip|username)\>(.*?)\</(?:ip|username)\>.*?\<text.*?\>(.*?)\<\/text\>",re.DOTALL)
#r =
re.compile("\<revision\>.*?\<timestamp\>(.*?)\<\/timestamp\>.*?\<(?:ip|username)\>(.*?)\<",re.DOTALL)
- return [(match.group(1), unescape(match.group(2)), unescape(match.group(3))) for
match in r.finditer(data)]
+ return [ (match.group(1),
+ unescape(match.group(2)),
+ unescape(match.group(3)))
+ for match in r.finditer(data) ]
def contributingUsers(self):
- """
- Returns a set of all user names (including anonymous IPs) of those who
- edited the page.
- """
+ """Return a set of usernames (or IPs) of users who edited this
page."""
edits = self.getVersionHistory()
- users = set()
- for edit in edits:
- users.add(edit[2])
+ users = set([edit[2] for edit in edits])
return users
- def move(self, newtitle, reason = None, movetalkpage = True, sysop = False, throttle
= False):
+ def move(self, newtitle, reason=None, movetalkpage=True, sysop=False,
+ throttle=True):
+ """Move this page to new title given by
newtitle."""
if throttle:
put_throttle()
if reason == None:
- reason = "Pagemove by bot"
- if self.namespace() // 2 == 1:
+ reason = input(u'Please enter a reason for the move:')
+ reason = reason.encode(self.site().encoding())
+ if self.isTalkPage():
movetalkpage = False
host = self.site().hostname()
address = self.site().move_address()
@@ -1767,7 +1781,9 @@
output(u'Page %s moved to %s' % (self.title(), newtitle))
return True
elif self.site().mediawiki_message('articleexists') in data:
- output(u'Page moved failed: Target page [[%s]] already exists.' %
newtitle)
+ output(u'Page move failed: Target page [[%s]] already exists.'
+ % newtitle)
+ return False
else:
output(u'Page move failed for unknown reason.')
try:
@@ -1782,10 +1798,13 @@
output(data)
return False
- def delete(self, reason = None, prompt = True, throttle = False):
- """Deletes the page from the wiki. Requires administrator status.
If
- reason is None, asks for a reason. If prompt is True, asks the user
- if he wants to delete the page.
+ def delete(self, reason=None, prompt=True, throttle=True):
+ """Deletes the page from the wiki.
+
+ Requires administrator status. If reason is None, asks for a
+ reason. If prompt is True, asks the user if he wants to delete the
+ page.
+
"""
if throttle:
put_throttle()
@@ -1841,9 +1860,12 @@
return False
def loadDeletedRevisions(self):
- """Loads up Special/Undelete for the page and stores all
revisions'
- timestamps, dates, editors and comments.
- Returns list of timestamps (which are used to refer to revisions later on).
+ """Retrieve all deleted revisions for this Page from
Special/Undelete.
+
+ Stores all revisions' timestamps, dates, editors and comments.
+ Returns list of timestamps (which can be used to retrieve revisions
+ later on).
+
"""
#TODO: Handle image file revisions too.
output(u'Loading list of deleted revisions for [[%s]]...' %
self.title())
@@ -1868,8 +1890,12 @@
return self._deletedRevs.keys()
def getDeletedRevision(self, timestamp, retrieveText=False):
- """Returns a deleted revision [date, editor, comment, text,
restoration marker].
- text will be None, unless retrieveText is True (or has been retrieved
earlier).
+ """Return a particular deleted revision by timestamp.
+
+ Return value is a list of [date, editor, comment, text, restoration
+ marker]. text will be None, unless retrieveText is True (or has been
+ retrieved earlier).
+
"""
if self._deletedRevs == None:
self.loadDeletedRevisions()
@@ -1889,8 +1915,10 @@
return self._deletedRevs[timestamp]
def markDeletedRevision(self, timestamp, undelete=True):
- """Marks revision (identified by timestamp) for undeletion
(default)
- or to remain as deleted (if undelete=False).
+ """Mark the revision identified by timestamp for undeletion.
+
+ If undelete is False, mark the revision to remain deleted.
+
"""
if self._deletedRevs == None:
self.loadDeletedRevisions()
@@ -1900,20 +1928,23 @@
self._deletedRevs[timestamp][4] = undelete
self._deletedRevsModified = True
- def undelete(self, comment='', throttle=False):
+ def undelete(self, comment='', throttle=True):
"""Undeletes page based on the undeletion markers set by previous
calls.
- If no calls have been made since loadDeletedRevisions(), everything will be
restored.
- Simplest case:
- wikipedia.Page(...).undelete('This will restore all revisions')
+ If no calls have been made since loadDeletedRevisions(), everything
+ will be restored.
- More complex:
- pg = wikipedia.Page(...)
- revs = pg.loadDeletedRevsions()
- for rev in revs:
- if ... #decide whether to undelete a revision
- pg.markDeletedRevision(rev) #mark for undeletion
- pg.undelete('This will restore only selected revisions.')
+ Simplest case:
+ wikipedia.Page(...).undelete('This will restore all revisions')
+
+ More complex:
+ pg = wikipedia.Page(...)
+ revs = pg.loadDeletedRevisions()
+ for rev in revs:
+ if ... #decide whether to undelete a revision
+ pg.markDeletedRevision(rev) #mark for undeletion
+ pg.undelete('This will restore only selected revisions.')
+
"""
if throttle:
put_throttle()
@@ -1939,13 +1970,17 @@
#TODO: Check for errors below (have we succeeded? etc):
return self.site().postForm(address,formdata,sysop=True)
- def protect(self, edit = 'sysop', move = 'sysop', unprotect = False,
reason = None, prompt = True, throttle = False):
- """(Un)protects a wiki page. Requires administrator status. If
reason is None,
- asks for a reason. If prompt is True, asks the user if he wants to protect the
page.
- Valid values for edit and move are:
+ def protect(self, edit='sysop', move='sysop', unprotect=False,
+ reason=None, prompt=True, throttle=True):
+ """(Un)protect a wiki page. Requires administrator status.
+
+ If reason is None, asks for a reason. If prompt is True, asks the
+ user if he wants to protect the page. Valid values for edit and move
+ are:
* '' (equivalent to 'none')
* 'autoconfirmed'
* 'sysop'
+
"""
address = self.site().protect_address(self.urlname())
if unprotect:
@@ -1994,23 +2029,30 @@
output(data)
return False
- def removeImage(self, image, put = False, summary = None, safe = True):
+ def removeImage(self, image, put=False, summary=None, safe=True):
+ """Remove all occurrences of an image from this
Page."""
+ # TODO: this should be grouped with other functions that operate on
+ # wiki-text rather than the Page object
return self.replaceImage(image, None, put, summary, safe)
- def replaceImage(self, image, replacement = None, put = False, summary = None, safe =
True):
+ def replaceImage(self, image, replacement=None, put=False, summary=None,
+ safe=True):
"""Replace all occurences of an image by another image.
- Giving None as argument for replacement will delink
- instead of replace.
- The argument image must be without namespace and all
- spaces replaced by underscores.
+ Giving None as argument for replacement will delink instead of
+ replace.
- If put is false, the new text will be returned.
+ The argument image must be without namespace and all spaces replaced
+ by underscores.
- If put is true, the edits will be saved to the wiki
- and True will be returned on succes, and otherwise
- False. Edit errors propagate."""
+ If put is False, the new text will be returned. If put is True, the
edits will be saved to the wiki and True will be returned on success,
+ and otherwise False. Edit errors propagate.
+ """
+ # TODO: this should be grouped with other functions that operate on
+ # wiki-text rather than the Page object
+
# Copyright (c) Orgullomoore, Bryan
# TODO: document and simplify the code
@@ -2024,7 +2066,9 @@
Creates a pattern that matches the string case-insensitively.
"""
s = re.escape(s)
- return ur'(?:%s)' % u''.join([u'[%s%s]' % (c.upper(),
c.lower()) for c in s])
+ return ur'(?:%s)' % u''.join([u'[%s%s]'
+ % (c.upper(), c.lower())
+ for c in s])
def capitalizationPattern(s):
"""
@@ -2088,18 +2132,20 @@
else:
return new_text
+
class ImagePage(Page):
# a Page in the Image namespace
def __init__(self, site, title = None, insite = None):
+ # TODO: raise an exception if title is not in Image: namespace
Page.__init__(self, site, title, insite)
self._imagePageHtml = None
def getImagePageHtml(self):
"""
- Downloads the image page, and returns the HTML, as a unicode string.
+ Download the image page, and return the HTML, as a unicode string.
Caches the HTML code, so that if you run this method twice on the
- same ImagePage object, the page only will be downloaded once.
+ same ImagePage object, the page will only be downloaded once.
"""
if not self._imagePageHtml:
path = self.site().get_address(self.urlname())
@@ -2168,13 +2214,15 @@
return u'{| border="1"\n! date/time || username || resolution ||
size || edit summary\n|----\n' + u'\n|----\n'.join(lines) + '\n|}'
def usingPages(self):
- result = []
- titleList = re.search('(?s)<h2
id="filelinks">.+?</ul>', self.getImagePageHtml()).group()
- lineR = re.compile('<li><a href=".+?"
title=".+?">(?P<title>.+?)</a></li>')
+ """Yield Pages on which this ImagePage is
displayed."""
+ titleList = re.search('(?s)<h2
id="filelinks">.+?</ul>',
+ self.getImagePageHtml()).group()
+ lineR = re.compile(
+ '<li><a href=".+?"
title=".+?">(?P<title>.+?)</a></li>')
for match in lineR.finditer(titleList):
- result.append(Page(self.site(), match.group('title')))
- return result
+ yield Page(self.site(), match.group('title'))
+
class GetAll(object):
def __init__(self, site, pages, throttle, force):
self.site = site
@@ -2295,7 +2343,6 @@
output(u'Expected one of: %s' %
u','.join([page2.aslink(forceInterwiki=True) for page2 in self.pages]))
raise PageNotFound
-
def headerDone(self, header):
# Verify our family data
lang = self.site.lang
@@ -2364,9 +2411,9 @@
output(u'Getting %d pages from %s...' % (len(pages), site))
return GetAll(site, pages, throttle, force).run()
+
# Library functions
-
def unescape(s):
"""Replace escaped HTML-special characters by their
originals"""
if '&' not in s:
@@ -2519,9 +2566,12 @@
f.close()
def __call__(self, requestsize=1):
- """This is called from getEditPage without arguments. It will make
sure
- that if there are no 'ignores' left, there are at least delay seconds
- since the last time it was called before it returns."""
+ """
+ Block the calling program if the throttle time has not expired.
+
+ Parameter requestsize is the number of Pages to be read/written;
+ multiply delay time by an appropriate factor.
+ """
self.lock.acquire()
try:
waittime = self.waittime()
@@ -2896,8 +2946,8 @@
text = replaceExcept(text, categoryR, '', ['nowiki',
'comment', 'math', 'pre'], marker = marker)
return normalWhitespace(text)
-def replaceCategoryInPlace(oldtext, oldcat, newcat, site = None):
- """Replaces the category oldcat with the category newcat and then
returns
+def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
+ """Replace the category oldcat with the category newcat and then
return
the modified Wiki source.
"""
#Note that this doesn't work yet and it has some very strange side-effects.
@@ -2909,21 +2959,13 @@
title = oldcat.titleWithoutNamespace()
if not title:
return
- # title might not be formatted correctly on the wiki
+ # title might contain regex special characters
+ title = re.escape(title)
+ # title might not be capitalized correctly on the wiki
if title[0].isalpha() and not site.nocapitalize:
title = "[%s%s]" % (title[0].upper(), title[0].lower()) + title[1:]
- # title might also contain regex special characters
- title = title.replace(" ", "[ _]+")\
- .replace("(", r"\(")\
- .replace(")", r"\)")\
- .replace(".", r"\.")\
- .replace("^", r"\^")\
- .replace("$", r"\$")\
- .replace("*", r"\*")\
- .replace("+", r"\+")\
- .replace("?", r"\?")
- # note: | [ ] { } not escaped here because they are not legal in
- # MW page titles
+ # spaces and underscores in page titles are interchangeable, and collapsible
+ title = title.replace(" ", "[ _]+")
categoryR = re.compile(r'\[\[\s*(%s)\s*:\s*%s\s*((?:\|[^]]+)?\]\])'
% (catNamespace, title))
if newcat is None:
@@ -3192,6 +3234,35 @@
return myfamily.Family()
class Site(object):
+ """A MediaWiki site.
+
+ messages : There are new messages on the site
+ forceLogin() : Does not continue until the user has logged in to
+ the site
+ getUrl() : Retrieve an URL from the site
+ mediawiki_message(key): Retrieve the text of the MediaWiki message with
+ the key "key"
+ has_mediawiki_message(key) : True if this site defines a MediaWiki message
+ with the key "key"
+
+ Special pages:
+ Dynamic pages:
+ allpages(): Special:Allpages
+ newpages(): Special:Newpages
+ longpages(): Special:Longpages
+ shortpages(): Special:Shortpages
+ categories(): Special:Categories
+
+ Cached pages:
+ deadendpages(): Special:Deadendpages
+ ancientpages(): Special:Ancientpages
+ lonelypages(): Special:Lonelypages
+ uncategorizedcategories(): Special:Uncategorizedcategories
+ uncategorizedpages(): Special:Uncategorizedpages
+ uncategorizedimages(): Special:Uncategorizedimages
+ unusedcategories(): Special:Unusedcategories
+
+ """
def __init__(self, code, fam=None, user=None, persistent_http = None):
"""Constructor takes four arguments: