Revision: 7720
Author: alexsh
Date: 2009-11-30 18:48:56 +0000 (Mon, 30 Nov 2009)
Log Message:
-----------
Page().getReferences(): add API mode, use query:backlinks and query:embeddedin. Move ordinary to getReferencesOld().
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2009-11-30 14:51:16 UTC (rev 7719)
+++ trunk/pywikipedia/wikipedia.py 2009-11-30 18:48:56 UTC (rev 7720)
@@ -1252,14 +1252,12 @@
self._isDisambig = len(disambigInPage) > 0
return self._isDisambig
- def getReferences(self,
- follow_redirects=True, withTemplateInclusion=True,
- onlyTemplateInclusion=False, redirectsOnly=False):
- """Yield all pages that link to the page.
+ def getReferences(self, follow_redirects=True, withTemplateInclusion=True,
+ onlyTemplateInclusion=False, redirectsOnly=False, internal = False):
+ """Yield all pages that link to the page by API
If you need a full list of referring pages, use this:
pages = [page for page in s.getReferences()]
-
Parameters:
* follow_redirects - if True, also returns pages that link to a
redirect pointing to the page.
@@ -1270,6 +1268,98 @@
* redirectsOnly - if True, only returns redirects to self.
"""
+ try:
+ if config.use_api and self.site().versionnumber() > 9:
+ d = self.site().apipath()
+ del d
+ else:
+ raise NotImplementedError
+ except NotImplementedError:
+ for s in self.getReferencesOld(follow_redirects, withTemplateInclusion, onlyTemplateInclusion, redirectsOnly):
+ yield s
+ return
+
+ params = {
+ 'action': 'query',
+ 'list': [],
+ }
+ if not onlyTemplateInclusion:
+ params['list'].append('backlinks')
+ params['bltitle'] = self.title()
+ params['bllimit'] = config.special_page_limit
+ params['blfilterredir'] = 'all'
+ if follow_redirects:
+ params['blredirect'] = 1
+ if redirectsOnly:
+ params['blfilterredir'] = 'redirects'
+ if not self.site().isAllowed('apihighlimits') and config.special_page_limit > 500:
+ params['bllimit'] = 500
+
+ if withTemplateInclusion or onlyTemplateInclusion:
+ params['list'].append('embeddedin')
+ params['eititle'] = self.title()
+ params['eilimit'] = config.special_page_limit
+ params['eifilterredir'] = 'all'
+ if follow_redirects:
+ params['eiredirect'] = 1
+ if redirectsOnly:
+ params['eifilterredir'] = 'redirects'
+ if not self.site().isAllowed('apihighlimits') and config.special_page_limit > 500:
+ params['eilimit'] = 5000
+
+ allDone = False
+
+ while not allDone:
+ if not internal:
+ output(u'Getting references to %s via API...' % self.aslink())
+
+ datas = query.GetData(params, self.site())
+ data = datas['query'].values()
+ if len(data) == 2:
+ data = data[0] + data[1]
+ else:
+ data = data[0]
+
+ refPages = set()
+ for blp in data:
+ pg = Page(self.site(), blp['title'], defaultNamespace = blp['ns'])
+ if pg in refPages:
+ continue
+
+ yield pg
+ refPages.add(pg)
+ if follow_redirects and 'redirect' in blp and 'redirlinks' in blp:
+ for p in blp['redirlinks']:
+ plk = Page(self.site(), p['title'], defaultNamespace = p['ns'])
+ if plk in refPages:
+ continue
+
+ yield plk
+ refPages.add(plk)
+ if follow_redirects and 'redirect' in p:
+ for zms in plk.getReferences(follow_redirects, withTemplateInclusion,
+ onlyTemplateInclusion, redirectsOnly, internal=True):
+ yield zms
+ else:
+ continue
+ else:
+ continue
+
+ if 'query-continue' in datas:
+ if 'backlinks' in datas['query-continue']:
+ params['blcontinue'] = datas['query-continue']['backlinks']['blcontinue']
+
+ if 'embeddedin' in datas['query-continue']:
+ params['eicontinue'] = datas['query-continue']['embeddedin']['eicontinue']
+ else:
+ allDone = True
+
+
+ def getReferencesOld(self,
+ follow_redirects=True, withTemplateInclusion=True,
+ onlyTemplateInclusion=False, redirectsOnly=False):
+ """Yield all pages that link to the page.
+ """
# Temporary bug-fix while researching more robust solution:
if config.special_page_limit > 999:
config.special_page_limit = 999
@@ -5613,7 +5703,15 @@
if retry is None:
retry = config.retry_on_fail
- headers = {'User-agent': useragent,}
+ headers = {
+ 'User-agent': useragent,
+ #'Accept-Language': config.mylang,
+ #'Accept-Charset': config.textfile_encoding,
+ #'Keep-Alive': '115',
+ #'Connection': 'keep-alive',
+ #'Cache-Control': 'max-age=0',
+ #'': '',
+ }
if not no_hostname and self.cookies(sysop = sysop):
headers['Cookie'] = self.cookies(sysop = sysop)