jenkins-bot has submitted this change and it was merged.
Change subject: weblinkchecker.py : XML and archived URL
......................................................................
weblinkchecker.py : XML and archived URL
Same as the following changes in compat:
* I7ba4f460897316ae1f5cbcca0080f8c3262d9abf : read XML dump
* I46c1737aea471691cd90f9ec21e3592ce0c69fde : Internet Archive and Web Citation
Bug: 55039
Bug: 58815
Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43
---
A pywikibot/weblib.py
M scripts/weblinkchecker.py
2 files changed, 123 insertions(+), 29 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/weblib.py b/pywikibot/weblib.py
new file mode 100644
index 0000000..d068925
--- /dev/null
+++ b/pywikibot/weblib.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Functions for manipulating external links
+or querying third-party sites.
+
+"""
+#
+# (C) Pywikibot team, 2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot.comms import http
+
+
+def getInternetArchiveURL(url, timestamp=None):
+ """Return archived URL by Internet Archive."""
+    # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+ import json
+    query = u'http://archive.org/wayback/available?'
+ query += u'url='
+ query += url
+    if timestamp is not None:
+        query += u'&timestamp='
+ query += timestamp
+ jsontext = http.request(uri=query, site=None)
+ if "closest" in jsontext:
+ data = json.loads(jsontext)
+ return data['archived_snapshots']['closest']['url']
+ else:
+ return None
+
+
+def getWebCitationURL(url, timestamp=None):
+ """Return archived URL by Web Citation."""
+    # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+ import xml.etree.ElementTree as ET
+    query = u'http://www.webcitation.org/query?'
+ query += u'returnxml=true'
+ query += u'&url='
+ query += url
+    if timestamp is not None:
+ query += u'&date='
+ query += timestamp
+ xmltext = http.request(uri=query, site=None)
+ if "success" in xmltext:
+ data = ET.fromstring(xmltext)
+ return data.find('.//webcite_url').text
+ else:
+ return None
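
(For reference, a minimal sketch of how the two new helpers are meant to be
used together. The dead link below is a placeholder, and the fallback order,
Internet Archive first and then WebCitation, simply mirrors what
weblinkchecker.py does further down in this change. Both helpers also accept
an optional timestamp string that is passed straight through to the
respective service.)

    # Illustrative only: the URL is a placeholder, not a real dead link.
    import pywikibot
    from pywikibot import weblib

    dead_link = 'http://www.example.org/no/longer/there'
    archiveURL = weblib.getInternetArchiveURL(dead_link)
    if archiveURL is None:
        archiveURL = weblib.getWebCitationURL(dead_link)
    if archiveURL is not None:
        pywikibot.output(archiveURL)
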
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index fe138c7..40f283a 100644
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -36,6 +36,11 @@
-namespace Only process templates in the namespace with the given number or
name. This parameter may be used multiple times.
+-xml              Scan a local XML dump for pages containing external links,
+                  instead of retrieving pages through pagegenerators.py, for
+                  performance and to reduce server load. The dump file name
+                  may also be given as "-xml:filename".
+
+-xmlstart Page to start with when using an XML dump
+
-ignore HTTP return codes to ignore. Can be provided several times :
-ignore:401 -ignore:500
@@ -112,6 +117,8 @@
from pywikibot import i18n
from pywikibot import config
from pywikibot import pagegenerators
+from pywikibot import xmlreader
+from pywikibot import weblib
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
@@ -177,29 +184,45 @@
yield m.group('urlb')
-class InternetArchiveConsulter:
- def __init__(self, url):
- self.url = url
+class XmlDumpPageGenerator:
+    """XML dump generator that yields pages containing a web link."""
- def getArchiveURL(self):
- pywikibot.output(u'Consulting the Internet Archive for %s' % self.url)
- archiveURL = 'http://web.archive.org/web/*/%s' % self.url
+ def __init__(self, xmlFilename, xmlStart, namespaces):
+ self.xmlStart = xmlStart
+ self.namespaces = namespaces
+ self.skipping = bool(xmlStart)
+ self.site = pywikibot.getSite()
+
+ dump = xmlreader.XmlDump(xmlFilename)
+ self.parser = dump.parse()
+
+ def __iter__(self):
+ return self
+
+ def next(self):
try:
- f = urllib2.urlopen(archiveURL)
- except urllib2.HTTPError:
- # The Internet Archive yields a 403 error when the site was not
- # archived due to robots.txt restrictions.
- return
- except UnicodeEncodeError:
- return
- data = f.read()
- if f.headers.get('content-encoding', None) == 'gzip':
- # Since 2008, the Internet Archive returns pages in GZIPed
- # compression format. Unfortunatelly urllib2 doesn't handle
- # the decompression for us, so we have to do it ourselves.
- data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
- if "Search Results for " in data:
- return archiveURL
+ for entry in self.parser:
+ if self.skipping:
+ if entry.title != self.xmlStart:
+ continue
+ self.skipping = False
+ page = pywikibot.Page(self.site, entry.title)
+ if not self.namespaces == []:
+ if page.namespace() not in self.namespaces:
+ continue
+ found = False
+ for url in weblinksIn(entry.text):
+ found = True
+ if found:
+ return page
+ except KeyboardInterrupt:
+ try:
+ if not self.skipping:
+ pywikibot.output(
+                        u'To resume, use "-xmlstart:%s" on the command line.'
+ % entry.title)
+ except NameError:
+ pass
class LinkChecker(object):
@@ -509,10 +532,10 @@
def __init__(self, reportThread):
self.reportThread = reportThread
- site = pywikibot.getSite()
+ self.site = pywikibot.getSite()
self.semaphore = threading.Semaphore()
self.datfilename = pywikibot.config.datafilepath(
-            'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.code))
+            'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.code))
# Count the number of logged links, so that we can insert captions
# from time to time
self.logCount = 0
@@ -528,7 +551,6 @@
"""
Logs an error report to a text file in the deadlinks subdirectory.
"""
- site = pywikibot.getSite()
if archiveURL:
errorReport = u'* %s ([%s archive])\n' % (url, archiveURL)
else:
@@ -541,8 +563,8 @@
pywikibot.output(u"** Logging link for deletion.")
txtfilename = pywikibot.config.datafilepath('deadlinks',
'results-%s-%s.txt'
- % (site.family.name,
- site.lang))
+ % (self.site.family.name,
+ self.site.lang))
txtfile = codecs.open(txtfilename, 'a', 'utf-8')
self.logCount += 1
if self.logCount % 30 == 0:
@@ -573,8 +595,9 @@
# We'll list it in a file so that it can be removed manually.
if timeSinceFirstFound > 60 * 60 * 24 * day:
# search for archived page
- iac = InternetArchiveConsulter(url)
- archiveURL = iac.getArchiveURL()
+            archiveURL = weblib.getInternetArchiveURL(url)
+            if archiveURL is None:
+                archiveURL = weblib.getWebCitationURL(url)
self.log(url, error, page, archiveURL)
else:
self.historyDict[url] = [(page.title(), now, error)]
@@ -781,6 +804,7 @@
def main():
gen = None
singlePageTitle = []
+ xmlFilename = None
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -807,6 +831,17 @@
HTTPignore.append(int(arg[8:]))
elif arg.startswith('-day:'):
day = int(arg[5:])
+ elif arg.startswith('-xmlstart'):
+ if len(arg) == 9:
+ xmlStart = pywikibot.input(
+ u'Please enter the dumped article to start with:')
+ else:
+ xmlStart = arg[10:]
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = i18n.input('pywikibot-enter-xml-filename')
+ else:
+ xmlFilename = arg[5:]
else:
if not genFactory.handleArg(arg):
singlePageTitle.append(arg)
@@ -816,6 +851,13 @@
page = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
gen = iter([page])
+ if xmlFilename:
+ try:
+ xmlStart
+ except NameError:
+ xmlStart = None
+ gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+
if not gen:
gen = genFactory.getCombinedGenerator()
if gen:
@@ -824,7 +866,7 @@
# fetch at least 240 pages simultaneously from the wiki, but more if
# a high thread number is set.
pageNumber = max(240, config.max_external_links * 2)
- gen = pagegenerators.PreloadingGenerator(gen, pageNumber=pageNumber)
+ gen = pagegenerators.PreloadingGenerator(gen, step=pageNumber)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
bot = WeblinkCheckerRobot(gen, HTTPignore)
try:
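
(Taken together, the new options roughly amount to the following inside
main(). This is a sketch only: 'dump.xml' and 'Some page' are placeholder
values, and the remaining names come from the diff above, so the snippet is
only meaningful inside weblinkchecker.py. The equivalent invocation would be
something like: weblinkchecker.py -xml:dump.xml -xmlstart:"Some page". When
-xml is not given, the existing pagegenerators-based path is used unchanged.)

    # Sketch of the new -xml code path; 'dump.xml' and 'Some page' are placeholders.
    xmlFilename = 'dump.xml'
    xmlStart = 'Some page'   # or None to start at the beginning of the dump
    namespaces = []          # an empty list means: check every namespace
    gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
    gen = pagegenerators.PreloadingGenerator(gen, step=240)
    gen = pagegenerators.RedirectFilterPageGenerator(gen)
    bot = WeblinkCheckerRobot(gen, [])   # second argument: HTTP codes to ignore
    bot.run()
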
--
To view, visit https://gerrit.wikimedia.org/r/104015
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot