Revision: 6134 Author: russblau Date: 2008-12-08 21:41:15 +0000 (Mon, 08 Dec 2008)
Log Message: ----------- Interwiki link scraping
Modified Paths: -------------- branches/rewrite/pywikibot/data/api.py branches/rewrite/pywikibot/page.py branches/rewrite/pywikibot/site.py branches/rewrite/pywikibot/tests/page_tests.py
Modified: branches/rewrite/pywikibot/data/api.py =================================================================== --- branches/rewrite/pywikibot/data/api.py 2008-12-05 22:08:29 UTC (rev 6133) +++ branches/rewrite/pywikibot/data/api.py 2008-12-08 21:41:15 UTC (rev 6134) @@ -184,7 +184,7 @@ self.site.throttle(write=write) uri = self.site.scriptpath() + "/api.php" try: - if write or action == "login": + if write or action in ("login", "expandtemplates", "parse"): # add other actions that require POST requests above rawdata = http.request(self.site, uri, method="POST", headers={'Content-Type':
Modified: branches/rewrite/pywikibot/page.py =================================================================== --- branches/rewrite/pywikibot/page.py 2008-12-05 22:08:29 UTC (rev 6133) +++ branches/rewrite/pywikibot/page.py 2008-12-08 21:41:15 UTC (rev 6134) @@ -368,6 +368,15 @@ text = property(_textgetter, _textsetter, _cleartext, "The edited wikitext (unicode) of this Page")
+ def expand_text(self): + """Return the page text with all templates expanded.""" + req = pywikibot.data.api.Request(action="expandtemplates", + text=self.text, + title=self.title(withSection=False), + site=self.site()) + result = req.submit() + return result["expandtemplates"]["*"] + def userName(self): """Return name or IP address of last user to edit page.""" return self._revisions[self.latestRevision()].user @@ -686,8 +695,8 @@ """Iterate Pages that this Page links to.
Only returns pages from "normal" internal links. Image and category - links are omitted unless prefixed with ":"; embedded templates are - omitted (but links within them are returned); all interwiki and + links are omitted unless prefixed with ":". Embedded templates are + omitted (but links within them are returned). All interwiki and external links are omitted.
@return: a generator that yields Page objects. @@ -695,20 +704,43 @@ """ return self.site().pagelinks(self)
- def interwiki(self): - """Iterate interwiki links in the page text. + def interwiki(self, expand=True): + """Iterate interwiki links in the page text, excluding language links.
- @return: a generator that yields Link objects. + @param expand: if True (default), include interwiki links found in + templates transcluded onto this page; if False, only iterate + interwiki links found in this page's own wikitext + @return: a generator that yields Link objects
""" - return self.site().pageinterwiki(self) + # This function does not exist in the API, so it has to be + # implemented by screen-scraping + Rlink = re.compile(r'\[\[(?P<title>[^\]\|\[\#<>\{\}]*)(\|.*?)?\]\]') + if expand: + text = self.expand_text() + else: + text = self.text + for linkmatch in Rlink.finditer( + pywikibot.textlib.removeDisabledParts(text)): + linktitle = linkmatch.group("title") + link = Link(linktitle, self.site()) + # only yield links that are to a different site and that + # are not language links + try: + if link.site != self.site(): + if linktitle.lstrip().startswith(":"): + # initial ":" indicates not a language link + yield link + elif link.site.family != self.site().family: + # link to a different family is not a language link + yield link + except pywikibot.Error: + # ignore any links with invalid contents + continue
def langlinks(self): """Iterate all interlanguage links on this page.
- Note that the links yielded by this method will be a subset of - the results of self.interwiki(). - @return: a generator that yields Link objects.
""" @@ -1729,7 +1761,25 @@ def __str__(self): return self.astext()
+ def __cmp__(self, other): + """Test for equality and inequality of Link objects.
+ Link objects are "equal" if and only if they are on the same site + and have the same normalized title, including section if any. + + Link objects are sortable by site, then namespace, then title. + + """ + if not isinstance(other, Link): + # especially, return -1 if other is None + return -1 + if not self.site == other.site: + return cmp(self.site, other.site) + if self.namespace != other.namespace: + return cmp(self.namespace, other.namespace) + return cmp(self.title, other.title) + + # Utility functions for parsing page titles
def html2unicode(text, ignore = []): @@ -1794,7 +1844,7 @@ unicodeCodepoint=convertIllegalHtmlEntities[unicodeCodepoint] except KeyError: pass - if unicodeCodepoint and unicodeCodepoint not in ignore and (WIDEBUILD or unicodeCodepoint < 65534): + if unicodeCodepoint and unicodeCodepoint not in ignore: result += unichr(unicodeCodepoint) else: # Leave the entity unchanged
Modified: branches/rewrite/pywikibot/site.py =================================================================== --- branches/rewrite/pywikibot/site.py 2008-12-05 22:08:29 UTC (rev 6133) +++ branches/rewrite/pywikibot/site.py 2008-12-08 21:41:15 UTC (rev 6134) @@ -1357,7 +1357,7 @@ api.update_page(page, pagedata)
def pageinterwiki(self, page): - # TODO + # No such function in the API (this method isn't called anywhere) raise NotImplementedError
def pagelanglinks(self, page):
Modified: branches/rewrite/pywikibot/tests/page_tests.py =================================================================== --- branches/rewrite/pywikibot/tests/page_tests.py 2008-12-05 22:08:29 UTC (rev 6133) +++ branches/rewrite/pywikibot/tests/page_tests.py 2008-12-08 21:41:15 UTC (rev 6134) @@ -227,9 +227,12 @@ def testLinks(self): for p in mainpage.linkedPages(): self.assertTrue(isinstance(p, pywikibot.Page)) -## Not implemented: -## for p in mainpage.interwiki(): -## self.assertTrue(isinstance(p, pywikibot.Link)) + iw = list(mainpage.interwiki(expand=True)) + for p in iw: + self.assertTrue(isinstance(p, pywikibot.Link)) + for p2 in mainpage.interwiki(expand=False): + self.assertTrue(isinstance(p2, pywikibot.Link)) + self.assertTrue(p2 in iw) for p in mainpage.langlinks(): self.assertTrue(isinstance(p, pywikibot.Link)) for p in mainpage.imagelinks():