Revision: 7132 Author: alexsh Date: 2009-08-08 04:25:21 +0000 (Sat, 08 Aug 2009)
Log Message: ----------- Site().newpages(): Add API options
Modified Paths: -------------- trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2009-08-08 01:00:08 UTC (rev 7131) +++ trunk/pywikipedia/wikipedia.py 2009-08-08 04:25:21 UTC (rev 7132) @@ -5309,30 +5309,62 @@ # option to fetch older rather than newer pages # TODO: extract and return edit comment. seen = set() + try: + d = self.apipath() + del d + except NotImplementedError: + config.use_api = False + while True: - path = self.newpages_address(n=number, namespace=namespace) - # The throttling is important here, so always enabled. - get_throttle() - html = self.getUrl(path) + if config.use_api and self.versionnumber() >= 10: + params = { + 'action': 'query', + 'list': 'recentchanges', + 'rctype': 'new', + 'rcnamespace': namespace, + 'rclimit': int(number), + 'rcprop': 'title|timestamp|sizes|user|comment', + 'rcshow': '!bot|!redirect', + #'': '', + } + try: + data = query.GetData(params, self)['query']['recentchanges'] + + for np in data: + date = np['timestamp'] + title = np['title'] + length = np['newlen'] + username = np['user'] + loggedIn = u'' + comment = np['comment'] + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, date, length, loggedIn, username, comment + else: + path = self.newpages_address(n=number, namespace=namespace) + # The throttling is important here, so always enabled. + get_throttle() + html = self.getUrl(path)
- entryR = re.compile( + entryR = re.compile( '<li[^>]*>(?P<date>.+?) \S*?<a href=".+?"' ' title="(?P<title>.+?)">.+?</a>.+?[([](?P<length>[\d,.]+)[^)]]*[)]]' ' .?<a href=".+?" title=".+?:(?P<username>.+?)">' - ) - for m in entryR.finditer(html): - date = m.group('date') - title = m.group('title') - title = title.replace('&quot;', '"') - length = int(re.sub("[,.]", "", m.group('length'))) - loggedIn = u'' - username = m.group('username') - comment = u'' + ) + for m in entryR.finditer(html): + date = m.group('date') + title = m.group('title') + title = title.replace('&quot;', '"') + length = int(re.sub("[,.]", "", m.group('length'))) + loggedIn = u'' + username = m.group('username') + comment = u''
- if title not in seen: - seen.add(title) - page = Page(self, title) - yield page, date, length, loggedIn, username, comment + if title not in seen: + seen.add(title) + page = Page(self, title) + yield page, date, length, loggedIn, username, comment if not repeat: break