jenkins-bot has submitted this change and it was merged.
Change subject: weblinkchecker.py : get archived URL
......................................................................
weblinkchecker.py : get archived URL
* use API for querying Internet Archive: [[:mw:Archived Pages]]
* add query for Web Citation: bug 58815
Change-Id: I46c1737aea471691cd90f9ec21e3592ce0c69fde
---
A pywikibot/weblib.py
M weblinkchecker.py
2 files changed, 65 insertions(+), 32 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/weblib.py b/pywikibot/weblib.py
new file mode 100644
index 0000000..fc30327
--- /dev/null
+++ b/pywikibot/weblib.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+"""
+Functions for manipulating external links
+or querying third-party sites.
+
+"""
+#
+# (C) Pywikibot team, 2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import wikipedia as pywikibot
+import config
+from pywikibot.comms import http
+
+def getInternetArchiveURL(site, url, timestamp=None):
+    """Return archived URL by Internet Archive."""
+    # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+    import json
+    query = u'http://archive.org/wayback/available?'
+    query += u'url='
+    query += url
+    if timestamp is not None:
+        query += u'&timestamp='
+        query += timestamp
+    if pywikibot.verbose:
+        pywikibot.output(u"Requesting query from Internet Archive: %s" % query)
+    jsontext = http.request(uri=query, site=site, retry=False, no_hostname=True)
+    if "closest" in jsontext:
+        data = json.loads(jsontext)
+        return data['archived_snapshots']['closest']['url']
+    else:
+        return None
+
+
+def getWebCitationURL(site, url, timestamp=None):
+    """Return archived URL by Web Citation."""
+    # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+    from BeautifulSoup import BeautifulStoneSoup
+    query = u'http://www.webcitation.org/query?'
+    query += u'returnxml=true'
+    query += u'&url='
+    query += url
+    if timestamp is not None:
+        query += u'&date='
+        query += timestamp
+    if pywikibot.verbose:
+        pywikibot.output(u"Requesting query from Web Citation: %s" % query)
+    xmltext = http.request(uri=query, site=site, retry=False, no_hostname=True)
+    if "success" in xmltext:
+        data = BeautifulStoneSoup(xmltext)
+        return data.find('webcite_url').string
+    else:
+        return None
+
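For context, getInternetArchiveURL() keys off the JSON document returned by
the Wayback availability endpoint; the "closest" check above is a plain
substring test on the raw response before it is parsed. A minimal sketch of
that parsing step (the snapshot URL and timestamp below are illustrative,
not taken from this change):

    >>> import json
    >>> jsontext = ('{"archived_snapshots": {"closest": {'
    ...             '"available": true, "status": "200", '
    ...             '"timestamp": "20130101000000", '
    ...             '"url": "http://web.archive.org/web/20130101000000/'
    ...             'http://example.com/"}}}')
    >>> json.loads(jsontext)['archived_snapshots']['closest']['url']
    u'http://web.archive.org/web/20130101000000/http://example.com/'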
diff --git a/weblinkchecker.py b/weblinkchecker.py
index 1eaa96b..f4237f8 100644
--- a/weblinkchecker.py
+++ b/weblinkchecker.py
@@ -112,6 +112,7 @@
 from pywikibot import i18n
 import config
 import pagegenerators
+import pywikibot.weblib
 
 docuReplacements = {
     '&params;': pagegenerators.parameterHelp
@@ -175,31 +176,6 @@
             yield m.group('url')
         else:
             yield m.group('urlb')
-
-
-class InternetArchiveConsulter:
-    def __init__(self, url):
-        self.url = url
-
-    def getArchiveURL(self):
-        pywikibot.output(u'Consulting the Internet Archive for %s' % self.url)
-        archiveURL = 'http://web.archive.org/web/*/%s' % self.url
-        try:
-            f = urllib2.urlopen(archiveURL)
-        except urllib2.HTTPError:
-            # The Internet Archive yields a 403 error when the site was not
-            # archived due to robots.txt restrictions.
-            return
-        except UnicodeEncodeError:
-            return
-        data = f.read()
-        if f.headers.get('content-encoding', None) == 'gzip':
-            # Since 2008, the Internet Archive returns pages in GZIPed
-            # compression format. Unfortunatelly urllib2 doesn't handle
-            # the decompression for us, so we have to do it ourselves.
-            data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-        if "Search Results for " in data:
-            return archiveURL
 
 
 class LinkChecker(object):
@@ -509,10 +485,10 @@
 
     def __init__(self, reportThread):
         self.reportThread = reportThread
-        site = pywikibot.getSite()
+        self.site = pywikibot.getSite()
         self.semaphore = threading.Semaphore()
         self.datfilename = pywikibot.config.datafilepath(
-            'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.lang))
+            'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.lang))
         # Count the number of logged links, so that we can insert captions
         # from time to time
         self.logCount = 0
@@ -528,7 +504,6 @@
         """
         Logs an error report to a text file in the deadlinks subdirectory.
         """
-        site = pywikibot.getSite()
         if archiveURL:
             errorReport = u'* %s ([%s archive])\n' % (url, archiveURL)
         else:
@@ -541,8 +516,8 @@
         pywikibot.output(u"** Logging link for deletion.")
         txtfilename = pywikibot.config.datafilepath('deadlinks',
                                                     'results-%s-%s.txt'
-                                                    % (site.family.name,
-                                                       site.lang))
+                                                    % (self.site.family.name,
+                                                       self.site.lang))
         txtfile = codecs.open(txtfilename, 'a', 'utf-8')
         self.logCount += 1
         if self.logCount % 30 == 0:
@@ -573,8 +548,9 @@
             # We'll list it in a file so that it can be removed manually.
             if timeSinceFirstFound > 60 * 60 * 24 * day:
                 # search for archived page
-                iac = InternetArchiveConsulter(url)
-                archiveURL = iac.getArchiveURL()
+                archiveURL = pywikibot.weblib.getInternetArchiveURL(self.site, url)
+                if archiveURL is None:
+                    archiveURL = pywikibot.weblib.getWebCitationURL(self.site, url)
                 self.log(url, error, page, archiveURL)
             else:
                 self.historyDict[url] = [(page.title(), now, error)]
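A minimal usage sketch of the new helpers (assuming the compat layout this
change targets, where wikipedia.py provides getSite() and output(); the dead
URL below is illustrative). It mirrors the fallback order wired into
weblinkchecker above: try the Internet Archive first, then WebCite:

    import wikipedia
    import pywikibot.weblib

    site = wikipedia.getSite()
    url = u'http://example.com/dead/page'
    # Prefer a Wayback Machine snapshot; fall back to WebCite.
    # Both helpers also accept an optional timestamp string to
    # request the snapshot closest to a given date.
    archiveURL = pywikibot.weblib.getInternetArchiveURL(site, url)
    if archiveURL is None:
        archiveURL = pywikibot.weblib.getWebCitationURL(site, url)
    if archiveURL:
        wikipedia.output(archiveURL)
    else:
        wikipedia.output(u'No archived version found.')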
pywikibot-commits@lists.wikimedia.org