[Pywikipedia-l] SVN: [5424] trunk/pywikipedia/weblinkchecker.py

Thu May 22 18:09:09 UTC 2008

Revision: 5424
Author:   nicdumz
Date:     2008-05-22 18:09:09 +0000 (Thu, 22 May 2008)

Log Message:
-----------
#1968997 "weblinkchecker should ignore 401 unauthorized"
Adding a -ignore parameter to ignore some HTTP error codes

Modified Paths:
--------------
    trunk/pywikipedia/weblinkchecker.py

Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================

--- trunk/pywikipedia/weblinkchecker.py	2008-05-22 17:45:05 UTC (rev 5423)
+++ trunk/pywikipedia/weblinkchecker.py	2008-05-22 18:09:09 UTC (rev 5424)
@@ -36,6 +36,9 @@
 -namespace   Only process templates in the namespace with the given number or
              name. This parameter may be used multiple times.
 
+-ignore      HTTP return codes to ignore. Can be provided several times:
+                -ignore:401 -ignore:500
+
 Furthermore, the following command line parameters are supported:
 
 -talk        Overrides the report_dead_links_on_talk config variable, enabling
@@ -272,7 +275,7 @@
     Warning: Also returns false if your Internet connection isn't working
     correctly! (This will give a Socket Error)
     '''
-    def __init__(self, url, redirectChain = [], serverEncoding = None):
+    def __init__(self, url, redirectChain = [], serverEncoding = None, HTTPignore = []):
         """
         redirectChain is a list of redirects which were resolved by
         resolveRedirect(). This is needed to detect redirect loops.
@@ -293,6 +296,7 @@
         }
         self.redirectChain = redirectChain + [url]
         self.changeUrl(url)
+        self.HTTPignore = HTTPignore
 
     def getConnection(self):
         if self.scheme == 'http':
@@ -438,7 +442,7 @@
                     # which leads to a cyclic list of redirects.
                     # We simply start from the beginning, but this time,
                     # we don't use HEAD, but GET requests.
-                    redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding)
+                    redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding, HTTPignore = self.HTTPignore)
                     return redirChecker.check(useHEAD = False)
                 else:
                     urlList = ['[%s]' % url for url in self.redirectChain + [self.url]]
@@ -449,13 +453,13 @@
                     # which leads to a long (or infinite) list of redirects.
                     # We simply start from the beginning, but this time,
                     # we don't use HEAD, but GET requests.
-                    redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding)
+                    redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding, HTTPignore = self.HTTPignore)
                     return redirChecker.check(useHEAD = False)
                 else:
                     urlList = ['[%s]' % url for url in self.redirectChain + [self.url]]
                     return False, u'Long Chain of Redirects: %s' % ' -> '.join(urlList)
             else:
-                redirChecker = LinkChecker(self.url, self.redirectChain, self.serverEncoding)
+                redirChecker = LinkChecker(self.url, self.redirectChain, self.serverEncoding, HTTPignore = self.HTTPignore)
                 return redirChecker.check(useHEAD = useHEAD)
         else:
             try:
@@ -473,24 +477,27 @@
             # read the server's encoding, in case we need it later
             self.readEncodingFromResponse(response)
             # site down if the server status is between 400 and 499
-            siteDown = response.status in range(400, 500)
-            return not siteDown, '%s %s' % (response.status, response.reason)
+            alive = response.status not in range(400, 500)  # only 4xx means dead
+            if response.status in self.HTTPignore:  # codes given via -ignore
+                alive = True  # an ignored code must never mark the link as dead
+            return alive, '%s %s' % (response.status, response.reason)
 
 class LinkCheckThread(threading.Thread):
     '''
     A thread responsible for checking one URL. After checking the page, it
     will die.
     '''
-    def __init__(self, page, url, history):
+    def __init__(self, page, url, history, HTTPignore):
         threading.Thread.__init__(self)
         self.page = page
         self.url = url
         self.history = history
         # identification for debugging purposes
         self.setName((u'%s - %s' % (page.title(), url)).encode('utf-8', 'replace'))
+        self.HTTPignore = HTTPignore
         
     def run(self):
-        linkChecker = LinkChecker(self.url)
+        linkChecker = LinkChecker(self.url, HTTPignore = self.HTTPignore)
         try:
             ok, message = linkChecker.check()
         except:
@@ -696,7 +703,7 @@
     Robot which will use several LinkCheckThreads at once to search for dead
     weblinks on pages provided by the given generator.
     '''
-    def __init__(self, generator):
+    def __init__(self, generator, HTTPignore = []):
         self.generator = generator
         if config.report_dead_links_on_talk:
             #wikipedia.output("Starting talk page thread")
@@ -707,6 +714,7 @@
         else:
             reportThread = None
         self.history = History(reportThread)
+        self.HTTPignore = HTTPignore
 
     def run(self):
         for page in self.generator:
@@ -729,7 +737,7 @@
                 while threading.activeCount() >= config.max_external_links:
                     # wait 100 ms
                     time.sleep(0.1)
-                thread = LinkCheckThread(page, url, self.history)
+                thread = LinkCheckThread(page, url, self.history, self.HTTPignore)
                 # thread dies when program terminates
                 thread.setDaemon(True)
                 thread.start()
@@ -760,6 +768,7 @@
     # Which namespaces should be processed?
     # default to [] which means all namespaces will be processed
     namespaces = []
+    HTTPignore = []
     # This factory is responsible for processing command line arguments
     # that are also used by other scripts and that determine on which pages
     # to work on.
@@ -777,6 +786,8 @@
                 namespaces.append(arg[11:])
         elif arg == '-repeat':
             gen = RepeatPageGenerator()
+        elif arg.startswith('-ignore:'):
+            HTTPignore.append(int(arg[8:]))
         else:
             generator = genFactory.handleArg(arg)
             if generator:
@@ -797,7 +808,7 @@
         pageNumber = max(240, config.max_external_links * 2)
         gen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)
         gen = pagegenerators.RedirectFilterPageGenerator(gen)
-        bot = WeblinkCheckerRobot(gen)
+        bot = WeblinkCheckerRobot(gen, HTTPignore)
         try:
             bot.run()
         finally: