Revision: 6134
Author: russblau
Date: 2008-12-08 21:41:15 +0000 (Mon, 08 Dec 2008)
Log Message:
-----------
Interwiki link scraping
Modified Paths:
--------------
branches/rewrite/pywikibot/data/api.py
branches/rewrite/pywikibot/page.py
branches/rewrite/pywikibot/site.py
branches/rewrite/pywikibot/tests/page_tests.py
Modified: branches/rewrite/pywikibot/data/api.py
===================================================================
--- branches/rewrite/pywikibot/data/api.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/data/api.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -184,7 +184,7 @@
self.site.throttle(write=write)
uri = self.site.scriptpath() + "/api.php"
try:
- if write or action == "login":
+ if write or action in ("login", "expandtemplates", "parse"):
# add other actions that require POST requests above
rawdata = http.request(self.site, uri, method="POST",
headers={'Content-Type':
Modified: branches/rewrite/pywikibot/page.py
===================================================================
--- branches/rewrite/pywikibot/page.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/page.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -368,6 +368,15 @@
text = property(_textgetter, _textsetter, _cleartext,
"The edited wikitext (unicode) of this Page")
+ def expand_text(self):
+ """Return the page text with all templates expanded."""
+ req = pywikibot.data.api.Request(action="expandtemplates",
+ text=self.text,
+ title=self.title(withSection=False),
+ site=self.site())
+ result = req.submit()
+ return result["expandtemplates"]["*"]
+
def userName(self):
"""Return name or IP address of last user to edit page."""
return self._revisions[self.latestRevision()].user
@@ -686,8 +695,8 @@
"""Iterate Pages that this Page links to.
Only returns pages from "normal" internal links. Image and category
- links are omitted unless prefixed with ":"; embedded templates are
- omitted (but links within them are returned); all interwiki and
+ links are omitted unless prefixed with ":". Embedded templates are
+ omitted (but links within them are returned). All interwiki and
external links are omitted.
@return: a generator that yields Page objects.
@@ -695,20 +704,43 @@
"""
return self.site().pagelinks(self)
- def interwiki(self):
- """Iterate interwiki links in the page text.
+ def interwiki(self, expand=True):
+ """Iterate interwiki links in the page text, excluding language links.
- @return: a generator that yields Link objects.
+ @param expand: if True (default), include interwiki links found in
+ templates transcluded onto this page; if False, only iterate
+ interwiki links found in this page's own wikitext
+ @return: a generator that yields Link objects
"""
- return self.site().pageinterwiki(self)
+ # This function does not exist in the API, so it has to be
+ # implemented by screen-scraping
+ Rlink = re.compile(r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]')
+ if expand:
+ text = self.expand_text()
+ else:
+ text = self.text
+ for linkmatch in Rlink.finditer(
+ pywikibot.textlib.removeDisabledParts(text)):
+ linktitle = linkmatch.group("title")
+ link = Link(linktitle, self.site())
+ # only yield links that are to a different site and that
+ # are not language links
+ try:
+ if link.site != self.site():
+ if linktitle.lstrip().startswith(":"):
+ # initial ":" indicates not a language link
+ yield link
+ elif link.site.family != self.site().family:
+ # link to a different family is not a language link
+ yield link
+ except pywikibot.Error:
+ # ignore any links with invalid contents
+ continue
def langlinks(self):
"""Iterate all interlanguage links on this page.
- Note that the links yielded by this method will be a subset of
- the results of self.interwiki().
-
@return: a generator that yields Link objects.
"""
@@ -1729,7 +1761,25 @@
def __str__(self):
return self.astext()
+ def __cmp__(self, other):
+ """Test for equality and inequality of Link objects.
+ Link objects are "equal" if and only if they are on the same site
+ and have the same normalized title, including section if any.
+
+ Link objects are sortable by site, then namespace, then title.
+
+ """
+ if not isinstance(other, Link):
+ # especially, return -1 if other is None
+ return -1
+ if not self.site == other.site:
+ return cmp(self.site, other.site)
+ if self.namespace != other.namespace:
+ return cmp(self.namespace, other.namespace)
+ return cmp(self.title, other.title)
+
+
# Utility functions for parsing page titles
def html2unicode(text, ignore = []):
@@ -1794,7 +1844,7 @@
unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint]
except KeyError:
pass
- if unicodeCodepoint and unicodeCodepoint not in ignore and (WIDEBUILD or unicodeCodepoint < 65534):
+ if unicodeCodepoint and unicodeCodepoint not in ignore:
result += unichr(unicodeCodepoint)
else:
# Leave the entity unchanged
Modified: branches/rewrite/pywikibot/site.py
===================================================================
--- branches/rewrite/pywikibot/site.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/site.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -1357,7 +1357,7 @@
api.update_page(page, pagedata)
def pageinterwiki(self, page):
- # TODO
+ # No such function in the API (this method isn't called anywhere)
raise NotImplementedError
def pagelanglinks(self, page):
Modified: branches/rewrite/pywikibot/tests/page_tests.py
===================================================================
--- branches/rewrite/pywikibot/tests/page_tests.py 2008-12-05 22:08:29 UTC (rev 6133)
+++ branches/rewrite/pywikibot/tests/page_tests.py 2008-12-08 21:41:15 UTC (rev 6134)
@@ -227,9 +227,12 @@
def testLinks(self):
for p in mainpage.linkedPages():
self.assertTrue(isinstance(p, pywikibot.Page))
-## Not implemented:
-## for p in mainpage.interwiki():
-## self.assertTrue(isinstance(p, pywikibot.Link))
+ iw = list(mainpage.interwiki(expand=True))
+ for p in iw:
+ self.assertTrue(isinstance(p, pywikibot.Link))
+ for p2 in mainpage.interwiki(expand=False):
+ self.assertTrue(isinstance(p2, pywikibot.Link))
+ self.assertTrue(p2 in iw)
for p in mainpage.langlinks():
self.assertTrue(isinstance(p, pywikibot.Link))
for p in mainpage.imagelinks():