Revision: 6134
Author: russblau
Date: 2008-12-08 21:41:15 +0000 (Mon, 08 Dec 2008)
Log Message:
-----------
Interwiki link scraping
Modified Paths:
--------------
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
branches/rewrite/pywikibot/tests/page_tests.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/data/api.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -184,7 +184,7 @@
self.site.throttle(write=write)
uri = self.site.scriptpath() + "/api.php"
try:
- if write or action == "login":
+ if write or action in ("login", "expandtemplates", "parse"):
# add other actions that require POST requests above
rawdata = http.request(self.site, uri, method="POST",
headers={'Content-Type':
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/page.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -368,6 +368,15 @@
text = property(_textgetter, _textsetter, _cleartext,
"The edited wikitext (unicode) of this Page")
+ def expand_text(self):
+ """Return the page text with all templates expanded."""
+ req = pywikibot.data.api.Request(action="expandtemplates",
+ text=self.text,
+ title=self.title(withSection=False),
+ site=self.site())
+ result = req.submit()
+ return result["expandtemplates"]["*"]
+
def userName(self):
"""Return name or IP address of last user to edit page."""
return self._revisions[self.latestRevision()].user
@@ -686,8 +695,8 @@
"""Iterate Pages that this Page links to.
Only returns pages from "normal" internal links. Image and category
- links are omitted unless prefixed with ":"; embedded templates are
- omitted (but links within them are returned); all interwiki and
+ links are omitted unless prefixed with ":". Embedded templates are
+ omitted (but links within them are returned). All interwiki and
external links are omitted.
@return: a generator that yields Page objects.
@@ -695,20 +704,43 @@
"""
return self.site().pagelinks(self)
- def interwiki(self):
- """Iterate interwiki links in the page text.
+ def interwiki(self, expand=True):
+ """Iterate interwiki links in the page text, excluding language links.
- @return: a generator that yields Link objects.
+ @param expand: if True (default), include interwiki links found in
+ templates transcluded onto this page; if False, only iterate
+ interwiki links found in this page's own wikitext
+ @return: a generator that yields Link objects
"""
- return self.site().pageinterwiki(self)
+ # This function does not exist in the API, so it has to be
+ # implemented by screen-scraping
+ Rlink = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+ if expand:
+ text = self.expand_text()
+ else:
+ text = self.text
+ for linkmatch in Rlink.finditer(
+ pywikibot.textlib.removeDisabledParts(text)):
+ linktitle = linkmatch.group("title")
+ link = Link(linktitle, self.site())
+ # only yield links that are to a different site and that
+ # are not language links
+ try:
+ if link.site != self.site():
+ if linktitle.lstrip().startswith(":"):
+ # initial ":" indicates not a language link
+ yield link
+ elif link.site.family != self.site().family:
+ # link to a different family is not a language link
+ yield link
+ except pywikibot.Error:
+ # ignore any links with invalid contents
+ continue
def langlinks(self):
"""Iterate all interlanguage links on this page.
- Note that the links yielded by this method will be a subset of
- the results of self.interwiki().
-
@return: a generator that yields Link objects.
"""
@@ -1729,7 +1761,25 @@
def __str__(self):
return self.astext()
+ def __cmp__(self, other):
+ """Test for equality and inequality of Link objects.
+ Link objects are "equal" if and only if they are on the same site
+ and have the same normalized title, including section if any.
+
+ Link objects are sortable by site, then namespace, then title.
+
+ """
+ if not isinstance(other, Link):
+ # especially, return -1 if other is None
+ return -1
+ if not self.site == other.site:
+ return cmp(self.site, other.site)
+ if self.namespace != other.namespace:
+ return cmp(self.namespace, other.namespace)
+ return cmp(self.title, other.title)
+
+
# Utility functions for parsing page titles
def html2unicode(text, ignore = []):
@@ -1794,7 +1844,7 @@
unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
except KeyError:
pass
- if unicodeCodepoint and unicodeCodepoint not in ignore and (WIDEBUILD or unicodeCodepoint < 65534):
+ if unicodeCodepoint and unicodeCodepoint not in ignore:
result += unichr(unicodeCodepoint)
else:
# Leave the entity unchanged
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/site.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -1357,7 +1357,7 @@
api.update_page(page, pagedata)
def pageinterwiki(self, page):
- # TODO
+ # No such function in the API (this method isn't called anywhere)
raise NotImplementedError
def pagelanglinks(self, page):
Modified: branches/rewrite/pywikibot/tests/page_tests.py
===================================================================
--- branches/rewrite/pywikibot/tests/page_tests.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/tests/page_tests.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -227,9 +227,12 @@
def testLinks(self):
for p in mainpage.linkedPages():
self.assertTrue(isinstance(p, pywikibot.Page))
-## Not implemented:
-## for p in mainpage.interwiki():
-## self.assertTrue(isinstance(p, pywikibot.Link))
+ iw = list(mainpage.interwiki(expand=True))
+ for p in iw:
+ self.assertTrue(isinstance(p, pywikibot.Link))
+ for p2 in mainpage.interwiki(expand=False):
+ self.assertTrue(isinstance(p2, pywikibot.Link))
+ self.assertTrue(p2 in iw)
for p in mainpage.langlinks():
self.assertTrue(isinstance(p, pywikibot.Link))
for p in mainpage.imagelinks():
Revision: 6133
Author: russblau
Date: 2008-12-05 22:08:29 +0000 (Fri, 05 Dec 2008)
Log Message:
-----------
code cleanup
Modified Paths:
--------------
branches/rewrite/pywikibot/catlib.py
branches/rewrite/pywikibot/page.py
Modified: branches/rewrite/pywikibot/catlib.py
===================================================================
--- branches/rewrite/pywikibot/catlib.py 2008-12-05 16:07:22 UTC (rev 6132)
+++ branches/rewrite/pywikibot/catlib.py 2008-12-05 22:08:29 UTC (rev 6133)
@@ -13,11 +13,10 @@
#
__version__ = '$Id: $'
-import pywikibot
-import pywikibot.textlib
from pywikibot import Category
+
def change_category(article, oldCat, newCat, comment=None, sortKey=None,
inPlace=True):
return article.change_category(oldCat, newCat, comment, sortKey, inPlace)
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-05 16:07:22 UTC (rev 6132)
+++ branches/rewrite/pywikibot/page.py 2008-12-05 22:08:29 UTC (rev 6133)
@@ -35,8 +35,10 @@
to the Site object.
"""
- def __init__(self, source, title=u"", ns=0, insite=None,
- defaultNamespace=None):
+
+ @deprecate_arg("insite", None)
+ @deprecate_arg("defaultNamespace", None)
+ def __init__(self, source, title=u"", ns=0):
"""Instantiate a Page object.
Three calling formats are supported:
@@ -65,16 +67,8 @@
@param ns: namespace number; required if source is a Site, ignored
otherwise
@type ns: int
- @param insite: DEPRECATED (use Link instead)
- @param defaultNamespace: DEPRECATED (use Link instead)
"""
- if insite is not None:
- logger.debug(
- "The 'insite' option in Page constructor is deprecated.")
- if defaultNamespace is not None:
- logger.debug(
- "The 'defaultNamespace' option in Page constructor is deprecated.")
if isinstance(source, pywikibot.site.BaseSite):
self._site = source
if ns not in source.namespaces():
@@ -144,6 +138,7 @@
return self._ns
@deprecate_arg("decode", None)
+ @deprecate_arg("savetitle", "asUrl")
def title(self, underscore=False, savetitle=False, withNamespace=True,
withSection=True, asUrl=False, asLink=False,
allowInterwiki=True, forceInterwiki=False, textlink=False,
@@ -151,8 +146,6 @@
"""Return the title of this Page, as a Unicode string.
@param underscore: if true, replace all ' ' characters with '_'
- @param savetitle: if true, try to quote all non-ASCII characters.
- (DEPRECATED: use asURL instead)
@param withNamespace: if false, omit the namespace prefix
@param withSection: if false, omit the section
@param asUrl: if true, quote title as if in an URL
@@ -174,10 +167,7 @@
title = title.split(u'#', 1)[0]
if underscore or asUrl:
title = title.replace(u' ', u'_')
- if savetitle:
- logger.debug(
- u"Page.title(savetitle=...) is deprecated.")
- if savetitle or asUrl:
+ if asUrl:
encodedTitle = title.encode(self.site().encoding())
title = urllib.quote(encodedTitle)
if asLink:
@@ -724,20 +714,14 @@
"""
return self.site().pagelanglinks(self)
+ @deprecate_arg("followRedirects", None)
+ @deprecate_arg("loose", None)
def imagelinks(self, followRedirects=None, loose=None):
"""Iterate ImagePage objects for images displayed on this Page.
- @param followRedirects: DEPRECATED and ignored
- @param loose: DEPRECATED and ignored
@return: a generator that yields ImagePage objects.
"""
- if followRedirects is not None:
- logger.debug(
- u"Page.imagelinks(followRedirects) option is deprecated.")
- if loose is not None:
- logger.debug(
- u"Page.imagelinks(loose) option is deprecated.")
return self.site().pageimages(self)
def templates(self):
@@ -780,19 +764,14 @@
positional))
return result
- def categories(self, nofollow_redirects=None, withSortKey=False):
+ @deprecate_arg("nofollow_redirects", None)
+ def categories(self, withSortKey=False):
"""Iterate categories that the article is in.
- @param nofollow_redirects: DEPRECATED and ignored
@param withSortKey: if True, include the sort key in each Category.
@return: a generator that yields Category objects.
"""
- # follow_redirects makes no sense here because category membership
- # doesn't follow redirects
- if nofollow_redirects is not None:
- logger.debug(
- u"Page.categories(nofollow_redirects) option is deprecated.")
return self.site().pagecategories(self, withSortKey=withSortKey)
def extlinks(self):
@@ -873,24 +852,21 @@
users = set([edit[2] for edit in edits])
return users
+ @deprecate_arg("throttle", None)
def move(self, newtitle, reason=None, movetalkpage=True, sysop=False,
- throttle=None, deleteAndMove=False, safe=True):
+ deleteAndMove=False, safe=True):
"""Move this page to a new title.
@param newtitle: The new page title.
@param reason: The edit summary for the move.
@param movetalkpage: If true, move this page's talk page (if it exists)
@param sysop: Try to move using sysop account, if available
- @param throttle: DEPRECATED
@param deleteAndMove: if move succeeds, delete the old page
(usually requires sysop privileges, depending on wiki settings)
@param safe: If false, attempt to delete existing page at newtitle
(if there is one) and then move this page to that title
"""
- if throttle is not None:
- logger.debug(
- u"Page.move: throttle option is deprecated.")
if reason is None:
logger.info(u'Moving %s to [[%s]].'
% (self.title(asLink=True), newtitle))
@@ -901,6 +877,7 @@
movetalk=movetalkpage,
noredirect=deleteAndMove)
+ @deprecate_arg("throttle", None)
def delete(self, reason=None, prompt=True, throttle=None, mark=False):
"""Deletes the page from the wiki. Requires administrator status.
@@ -910,9 +887,6 @@
speedy-deletion request on the page instead.
"""
- if throttle is not None:
- logger.debug(
- u"Page.delete: throttle option is deprecated.")
if reason is None:
logger.info(u'Deleting %s.' % (self.title(asLink=True)))
reason = pywikibot.input(u'Please enter a reason for the deletion:')
@@ -967,7 +941,8 @@
self._deletedRevs[timestamp][4] = undelete
self._deletedRevsModified = True
- def undelete(self, comment=None, throttle=None):
+ @deprecate_arg("throttle", None)
+ def undelete(self, comment=None):
"""Undelete revisions based on the markers set by previous calls.
If no calls have been made since loadDeletedRevisions(), everything
@@ -985,12 +960,8 @@
pg.undelete('This will restore only selected revisions.')
@param comment: The undeletion edit summary.
- @param throttle: DEPRECATED
"""
- if throttle is not None:
- logger.debug(
- u"Page.undelete: throttle option is deprecated.")
if comment is None:
logger.info(u'Preparing to undelete %s.'
% (self.title(asLink=True)))
@@ -998,8 +969,9 @@
u'Please enter a reason for the undeletion:')
return self.site().undelete(self, comment)
+ @deprecate_arg("throttle", None)
def protect(self, edit='sysop', move='sysop', unprotect=False,
- reason=None, prompt=True, throttle=None):
+ reason=None, prompt=True):
"""(Un)protect a wiki page. Requires administrator status.
Valid protection levels (in MediaWiki 1.12) are '' (equivalent to
@@ -1011,12 +983,8 @@
all protection levels to '')
@param reason: Edit summary.
@param prompt: If true, ask user for confirmation.
- @param throttle: DEPRECATED
"""
- if throttle is not None:
- logger.debug(
- u"Page.protect: throttle option is deprecated.")
if reason is None:
if unprotect:
un = u'un'
@@ -1294,38 +1262,30 @@
class Category(Page):
"""A page in the Category: namespace"""
- def __init__(self, source, title=u"", insite=None, sortKey=None):
- """All parameters are the same as for Page() constructor, except:
+ @deprecate_arg("sortKey", None)
+ def __init__(self, source, title=u"", insite=None):
+ """All parameters are the same as for Page() constructor.
- @param sortKey: DEPRECATED (use .aslink() method instead)
-
"""
- if sortKey is not None:
- logger.debug(
- "The 'sortKey' option in Category constructor is deprecated.")
Page.__init__(self, source, title, 14)
if self.namespace() != 14:
raise ValueError(u"'%s' is not in the category namespace!"
% title)
- def aslink(self, sortKey=u'', forceInterwiki=None, textlink=None,
- noInterwiki=None):
+ @deprecate_arg("forceInterwiki", None)
+ @deprecate_arg("textlink", None)
+ @deprecate_arg("noInterwiki", None)
+ def aslink(self, sortKey=u''):
"""Return a link to place a page in this Category.
Use this only to generate a "true" category link, not for interwikis
or text links to category pages.
- Parameters are deprecated and preserved for backwards-compatibility,
- except:
-
@param sortKey: The sort key for the article to be placed in this
Category; if omitted, default sort key is used.
@type sortKey: (optional) unicode
"""
- if forceInterwiki is not None \
- or textlink is not None or noInterwiki is not None:
- logger.debug("All arguments to Category.aslink() are deprecated.")
if sortKey:
titleWithSortKey = '%s|%s' % (self.title(withSection=False),
self.sortKey)
Revision: 6125
Author: russblau
Date: 2008-12-03 20:00:42 +0000 (Wed, 03 Dec 2008)
Log Message:
-----------
fix nagging problem with links to redirect pages
Modified Paths:
--------------
branches/rewrite/pywikibot/site.py
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-03 18:09:30 UTC (rev 6124)
+++ branches/rewrite/pywikibot/site.py 2008-12-03 20:00:42 UTC (rev 6125)
@@ -1080,8 +1080,24 @@
if filterRedirects is not None:
blgen.request["gblfilterredir"] = filterRedirects and "redirects"\
or "nonredirects"
- if followRedirects: #FIXME This doesn't work correctly
- blgen.request["gblredirect"] = ""
+ if followRedirects:
+ # bug: see http://bugzilla.wikimedia.org/show_bug.cgi?id=16218
+ # links identified by MediaWiki as redirects may not really be,
+ # so we have to check each "redirect" page and see if it
+ # really redirects to this page
+ blgen.request["gblfilterredir"] = "nonredirects"
+ redirgen = api.PageGenerator("backlinks", gbltitle=bltitle,
+ site=self, gblfilterredir="redirects")
+ if "gblnamespace" in blgen.request:
+ redirgen.request["gblnamespace"] = blgen.request["gblnamespace"]
+ genlist = [blgen]
+ for redir in redirgen:
+ if redir.getRedirectTarget() == page:
+ genlist.append(
+ self.pagebacklinks(
+ redir, True, None, namespaces))
+ import itertools
+ return itertools.chain(*genlist)
return blgen
def page_embeddedin(self, page, filterRedirects=None, namespaces=None):