[Pywikipedia-l] SVN: [5424] trunk/pywikipedia/weblinkchecker.py
nicdumz at svn.wikimedia.org
Thu May 22 18:09:09 UTC 2008
Revision: 5424
Author: nicdumz
Date: 2008-05-22 18:09:09 +0000 (Thu, 22 May 2008)
Log Message:
-----------
#1968997 "weblinkchecker should ignore 401 unauthorized"
Adding a -ignore parameter to ignore some HTTP error codes
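For context, a minimal self-contained sketch (not part of this revision) of how a flag such as -ignore:401 is parsed into a list of status codes and then consulted when deciding whether a link counts as dead; the helper name is_link_alive is hypothetical and only illustrates the intended behaviour:

    import sys

    # Collect the status codes passed via one or more -ignore:NNN arguments.
    HTTPignore = []
    for arg in sys.argv[1:]:
        if arg.startswith('-ignore:'):
            HTTPignore.append(int(arg[len('-ignore:'):]))

    def is_link_alive(status, HTTPignore):
        # Codes listed via -ignore are never reported as dead links;
        # otherwise any 4xx response marks the link as dead.
        if status in HTTPignore:
            return True
        return status not in range(400, 500)

Invoking the script with, for example, -ignore:401 -ignore:500 would skip both codes.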
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2008-05-22 17:45:05 UTC (rev 5423)
+++ trunk/pywikipedia/weblinkchecker.py 2008-05-22 18:09:09 UTC (rev 5424)
@@ -36,6 +36,9 @@
-namespace Only process templates in the namespace with the given number or
name. This parameter may be used multiple times.
+-ignore HTTP return codes to ignore. Can be provided several times:
+ -ignore:401 -ignore:500
+
Furthermore, the following command line parameters are supported:
-talk Overrides the report_dead_links_on_talk config variable, enabling
@@ -272,7 +275,7 @@
Warning: Also returns false if your Internet connection isn't working
correctly! (This will give a Socket Error)
'''
- def __init__(self, url, redirectChain = [], serverEncoding = None):
+ def __init__(self, url, redirectChain = [], serverEncoding = None, HTTPignore = []):
"""
redirectChain is a list of redirects which were resolved by
resolveRedirect(). This is needed to detect redirect loops.
@@ -293,6 +296,7 @@
}
self.redirectChain = redirectChain + [url]
self.changeUrl(url)
+ self.HTTPignore = HTTPignore
def getConnection(self):
if self.scheme == 'http':
@@ -438,7 +442,7 @@
# which leads to a cyclic list of redirects.
# We simply start from the beginning, but this time,
# we don't use HEAD, but GET requests.
- redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding)
+ redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding, HTTPignore = self.HTTPignore)
return redirChecker.check(useHEAD = False)
else:
urlList = ['[%s]' % url for url in self.redirectChain + [self.url]]
@@ -449,13 +453,13 @@
# which leads to a long (or infinite) list of redirects.
# We simply start from the beginning, but this time,
# we don't use HEAD, but GET requests.
- redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding)
+ redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding, HTTPignore = self.HTTPignore)
return redirChecker.check(useHEAD = False)
else:
urlList = ['[%s]' % url for url in self.redirectChain + [self.url]]
return False, u'Long Chain of Redirects: %s' % ' -> '.join(urlList)
else:
- redirChecker = LinkChecker(self.url, self.redirectChain, self.serverEncoding)
+ redirChecker = LinkChecker(self.url, self.redirectChain, self.serverEncoding, HTTPignore = self.HTTPignore)
return redirChecker.check(useHEAD = useHEAD)
else:
try:
@@ -473,24 +477,27 @@
# read the server's encoding, in case we need it later
self.readEncodingFromResponse(response)
# site down if the server status is between 400 and 499
- siteDown = response.status in range(400, 500)
- return not siteDown, '%s %s' % (response.status, response.reason)
+ alive = response.status not in range(400, 500)
+ if response.status in self.HTTPignore:
+ alive = True
+ return alive, '%s %s' % (response.status, response.reason)
class LinkCheckThread(threading.Thread):
'''
A thread responsible for checking one URL. After checking the page, it
will die.
'''
- def __init__(self, page, url, history):
+ def __init__(self, page, url, history, HTTPignore):
threading.Thread.__init__(self)
self.page = page
self.url = url
self.history = history
# identification for debugging purposes
self.setName((u'%s - %s' % (page.title(), url)).encode('utf-8', 'replace'))
+ self.HTTPignore = HTTPignore
def run(self):
- linkChecker = LinkChecker(self.url)
+ linkChecker = LinkChecker(self.url, HTTPignore = self.HTTPignore)
try:
ok, message = linkChecker.check()
except:
@@ -696,7 +703,7 @@
Robot which will use several LinkCheckThreads at once to search for dead
weblinks on pages provided by the given generator.
'''
- def __init__(self, generator):
+ def __init__(self, generator, HTTPignore = []):
self.generator = generator
if config.report_dead_links_on_talk:
#wikipedia.output("Starting talk page thread")
@@ -707,6 +714,7 @@
else:
reportThread = None
self.history = History(reportThread)
+ self.HTTPignore = HTTPignore
def run(self):
for page in self.generator:
@@ -729,7 +737,7 @@
while threading.activeCount() >= config.max_external_links:
# wait 100 ms
time.sleep(0.1)
- thread = LinkCheckThread(page, url, self.history)
+ thread = LinkCheckThread(page, url, self.history, self.HTTPignore)
# thread dies when program terminates
thread.setDaemon(True)
thread.start()
@@ -760,6 +768,7 @@
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
+ HTTPignore = []
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
@@ -777,6 +786,8 @@
namespaces.append(arg[11:])
elif arg == '-repeat':
gen = RepeatPageGenerator()
+ elif arg.startswith('-ignore:'):
+ HTTPignore.append(int(arg[8:]))
else:
generator = genFactory.handleArg(arg)
if generator:
@@ -797,7 +808,7 @@
pageNumber = max(240, config.max_external_links * 2)
gen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)
gen = pagegenerators.RedirectFilterPageGenerator(gen)
- bot = WeblinkCheckerRobot(gen)
+ bot = WeblinkCheckerRobot(gen, HTTPignore)
try:
bot.run()
finally:
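With this revision applied, the new keyword argument could also be exercised directly, roughly as below; the URL is a placeholder and the snippet assumes the pywikipedia framework (wikipedia.py, config.py) is importable:

    from weblinkchecker import LinkChecker

    # 401 and 500 responses from this (placeholder) URL are accepted
    # rather than reported as dead links.
    checker = LinkChecker('http://www.example.org/protected',
                          HTTPignore = [401, 500])
    ok, message = checker.check()
    print ok, message

On the command line, the equivalent is passing -ignore:401 -ignore:500 to weblinkchecker.py alongside the usual page-selection arguments.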