jenkins-bot has submitted this change and it was merged.
Change subject: weblinkchecker.py : XML and archived URL
......................................................................
weblinkchecker.py : XML and archived URL
Same as the following changes in compat:
* I7ba4f460897316ae1f5cbcca0080f8c3262d9abf : read XML dump
* I46c1737aea471691cd90f9ec21e3592ce0c69fde : Internet Archive and Web Citation
Bug: 55039
Bug: 58815
Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43
---
A pywikibot/weblib.py
M scripts/weblinkchecker.py
2 files changed, 123 insertions(+), 29 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/weblib.py b/pywikibot/weblib.py
new file mode 100644
index 0000000..d068925
--- /dev/null
+++ b/pywikibot/weblib.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Functions for manipulating external links
+or querying third-party sites.
+
+"""
+#
+# (C) Pywikibot team, 2013
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import pywikibot
+from pywikibot.comms import http
+
+
+def getInternetArchiveURL(url, timestamp=None):
+ """Return archived URL by Internet Archive."""
+    # See [[:mw:Archived Pages]] and http://archive.org/help/wayback_api.php
+ import json
+    query = u'http://archive.org/wayback/available?'
+ query += u'url='
+ query += url
+    if timestamp is not None:
+        query += u'&timestamp='
+ query += timestamp
+ jsontext = http.request(uri=query, site=None)
+ if "closest" in jsontext:
+ data = json.loads(jsontext)
+ return data['archived_snapshots']['closest']['url']
+ else:
+ return None
+
+
+def getWebCitationURL(url, timestamp=None):
+ """Return archived URL by Web Citation."""
+    # See http://www.webcitation.org/doc/WebCiteBestPracticesGuide.pdf
+ import xml.etree.ElementTree as ET
+    query = u'http://www.webcitation.org/query?'
+ query += u'returnxml=true'
+ query += u'&url='
+ query += url
+    if timestamp is not None:
+ query += u'&date='
+ query += timestamp
+ xmltext = http.request(uri=query, site=None)
+ if "success" in xmltext:
+ data = ET.fromstring(xmltext)
+ return data.find('.//webcite_url').text
+ else:
+ return None
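
(For reference, a minimal sketch of how the two new helpers are meant to be
used together. The dead link below is a placeholder, and the fallback order,
Internet Archive first and then WebCitation, simply mirrors what
weblinkchecker.py does further down in this change. Both helpers also accept
an optional timestamp string that is passed straight through to the
respective service.)

    # Illustrative only: the URL is a placeholder, not a real dead link.
    import pywikibot
    from pywikibot import weblib

    dead_link = 'http://www.example.org/no/longer/there'
    archiveURL = weblib.getInternetArchiveURL(dead_link)
    if archiveURL is None:
        archiveURL = weblib.getWebCitationURL(dead_link)
    if archiveURL is not None:
        pywikibot.output(archiveURL)
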
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index fe138c7..40f283a 100644
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -36,6 +36,11 @@
-namespace Only process templates in the namespace with the given number or
name. This parameter may be used multiple times.
+-xml              Scan a local XML dump for pages containing external links,
+                  instead of retrieving pages through pagegenerators.py, for
+                  performance and to reduce server load. The dump file name
+                  may also be given as "-xml:filename".
+
+-xmlstart Page to start with when using an XML dump
+
-ignore HTTP return codes to ignore. Can be provided several times :
-ignore:401 -ignore:500
@@ -112,6 +117,8 @@
from pywikibot import i18n
from pywikibot import config
from pywikibot import pagegenerators
+from pywikibot import xmlreader
+from pywikibot import weblib
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
@@ -177,29 +184,45 @@
yield m.group('urlb')
-class InternetArchiveConsulter:
- def __init__(self, url):
- self.url = url
+class XmlDumpPageGenerator:
+    """XML dump generator that yields pages containing a web link."""
- def getArchiveURL(self):
- pywikibot.output(u'Consulting the Internet Archive for %s' % self.url)
- archiveURL = 'http://web.archive.org/web/*/%s' % self.url
+ def __init__(self, xmlFilename, xmlStart, namespaces):
+ self.xmlStart = xmlStart
+ self.namespaces = namespaces
+ self.skipping = bool(xmlStart)
+ self.site = pywikibot.getSite()
+
+ dump = xmlreader.XmlDump(xmlFilename)
+ self.parser = dump.parse()
+
+ def __iter__(self):
+ return self
+
+ def next(self):
try:
- f = urllib2.urlopen(archiveURL)
- except urllib2.HTTPError:
- # The Internet Archive yields a 403 error when the site was not
- # archived due to robots.txt restrictions.
- return
- except UnicodeEncodeError:
- return
- data = f.read()
- if f.headers.get('content-encoding', None) == 'gzip':
- # Since 2008, the Internet Archive returns pages in GZIPed
- # compression format. Unfortunatelly urllib2 doesn't handle
- # the decompression for us, so we have to do it ourselves.
- data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
- if "Search Results for " in data:
- return archiveURL
+ for entry in self.parser:
+ if self.skipping:
+ if entry.title != self.xmlStart:
+ continue
+ self.skipping = False
+ page = pywikibot.Page(self.site, entry.title)
+ if not self.namespaces == []:
+ if page.namespace() not in self.namespaces:
+ continue
+ found = False
+ for url in weblinksIn(entry.text):
+ found = True
+ if found:
+ return page
+ except KeyboardInterrupt:
+ try:
+ if not self.skipping:
+ pywikibot.output(
+                        u'To resume, use "-xmlstart:%s" on the command line.'
+ % entry.title)
+ except NameError:
+ pass
class LinkChecker(object):
@@ -509,10 +532,10 @@
def __init__(self, reportThread):
self.reportThread = reportThread
- site = pywikibot.getSite()
+ self.site = pywikibot.getSite()
self.semaphore = threading.Semaphore()
self.datfilename = pywikibot.config.datafilepath(
-            'deadlinks', 'deadlinks-%s-%s.dat' % (site.family.name, site.code))
+            'deadlinks', 'deadlinks-%s-%s.dat' % (self.site.family.name, self.site.code))
# Count the number of logged links, so that we can insert captions
# from time to time
self.logCount = 0
@@ -528,7 +551,6 @@
"""
Logs an error report to a text file in the deadlinks subdirectory.
"""
- site = pywikibot.getSite()
if archiveURL:
errorReport = u'* %s ([%s archive])\n' % (url, archiveURL)
else:
@@ -541,8 +563,8 @@
pywikibot.output(u"** Logging link for deletion.")
txtfilename = pywikibot.config.datafilepath('deadlinks',
'results-%s-%s.txt'
- % (site.family.name,
- site.lang))
+ % (self.site.family.name,
+ self.site.lang))
txtfile = codecs.open(txtfilename, 'a', 'utf-8')
self.logCount += 1
if self.logCount % 30 == 0:
@@ -573,8 +595,9 @@
# We'll list it in a file so that it can be removed manually.
if timeSinceFirstFound > 60 * 60 * 24 * day:
# search for archived page
- iac = InternetArchiveConsulter(url)
- archiveURL = iac.getArchiveURL()
+            archiveURL = weblib.getInternetArchiveURL(url)
+            if archiveURL is None:
+                archiveURL = weblib.getWebCitationURL(url)
self.log(url, error, page, archiveURL)
else:
self.historyDict[url] = [(page.title(), now, error)]
@@ -781,6 +804,7 @@
def main():
gen = None
singlePageTitle = []
+ xmlFilename = None
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -807,6 +831,17 @@
HTTPignore.append(int(arg[8:]))
elif arg.startswith('-day:'):
day = int(arg[5:])
+ elif arg.startswith('-xmlstart'):
+ if len(arg) == 9:
+ xmlStart = pywikibot.input(
+ u'Please enter the dumped article to start with:')
+ else:
+ xmlStart = arg[10:]
+ elif arg.startswith('-xml'):
+ if len(arg) == 4:
+ xmlFilename = i18n.input('pywikibot-enter-xml-filename')
+ else:
+ xmlFilename = arg[5:]
else:
if not genFactory.handleArg(arg):
singlePageTitle.append(arg)
@@ -816,6 +851,13 @@
page = pywikibot.Page(pywikibot.getSite(), singlePageTitle)
gen = iter([page])
+ if xmlFilename:
+ try:
+ xmlStart
+ except NameError:
+ xmlStart = None
+ gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
+
if not gen:
gen = genFactory.getCombinedGenerator()
if gen:
@@ -824,7 +866,7 @@
# fetch at least 240 pages simultaneously from the wiki, but more if
# a high thread number is set.
pageNumber = max(240, config.max_external_links * 2)
- gen = pagegenerators.PreloadingGenerator(gen, pageNumber=pageNumber)
+ gen = pagegenerators.PreloadingGenerator(gen, step=pageNumber)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
bot = WeblinkCheckerRobot(gen, HTTPignore)
try:
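
(Taken together, the new options roughly amount to the following inside
main(). This is a sketch only: 'dump.xml' and 'Some page' are placeholder
values, and the remaining names come from the diff above, so the snippet is
only meaningful inside weblinkchecker.py. The equivalent invocation would be
something like: weblinkchecker.py -xml:dump.xml -xmlstart:"Some page". When
-xml is not given, the existing pagegenerators-based path is used unchanged.)

    # Sketch of the new -xml code path; 'dump.xml' and 'Some page' are placeholders.
    xmlFilename = 'dump.xml'
    xmlStart = 'Some page'   # or None to start at the beginning of the dump
    namespaces = []          # an empty list means: check every namespace
    gen = XmlDumpPageGenerator(xmlFilename, xmlStart, namespaces)
    gen = pagegenerators.PreloadingGenerator(gen, step=240)
    gen = pagegenerators.RedirectFilterPageGenerator(gen)
    bot = WeblinkCheckerRobot(gen, [])   # second argument: HTTP codes to ignore
    bot.run()
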
--
To view, visit https://gerrit.wikimedia.org/r/104015
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7279da01b0527c974ea53dc1f234a9268dbc8d43
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Beta16 <l.rabinelli(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot