Revision: 3946
Author: wikipedian
Date: 2007-08-02 00:48:21 +0000 (Thu, 02 Aug 2007)
Log Message:
-----------
improved error messages:
* prevent flooding talk pages by putting [ ] around links when reporting
redirect chains/loops
* don't give useless error codes on socket errors
* give a better message on BadStatusLine errors
fixed encoding crash when consulting the Internet Archive
Modified Paths:
--------------
trunk/pywikipedia/weblinkchecker.py
Modified: trunk/pywikipedia/weblinkchecker.py
===================================================================
--- trunk/pywikipedia/weblinkchecker.py 2007-08-02 00:41:50 UTC (rev 3945)
+++ trunk/pywikipedia/weblinkchecker.py 2007-08-02 00:48:21 UTC (rev 3946)
@@ -153,6 +153,8 @@
# The Internet Archive yields a 403 error when the site was not
# archived due to robots.txt restrictions.
return None
+ except UnicodeEncodeError:
+ return None
text = f.read()
if text.find("Search Results for ") != -1:
return archiveURL
@@ -201,7 +203,6 @@
def getEncodingUsedByServer(self):
if not self.serverEncoding:
try:
- print conn.__dict__
wikipedia.output(u'Contacting server %s to find out its default encoding...' % self.conn)
conn = self.getConnection()
conn.request('HEAD', '/', None, self.header)
@@ -312,11 +313,14 @@
try:
wasRedirected = self.resolveRedirect(useHEAD = useHEAD)
except UnicodeError, arg:
- return False, u'Encoding Error: %s' % arg
+ return False, u'Encoding Error: %s (%s)' % (arg.__class__.__name__, unicode(arg))
except httplib.error, arg:
- return False, u'HTTP Error: %s' % arg
+ return False, u'HTTP Error: %s (%s)' % (arg.__class__.__name__, arg.line)
except socket.error, arg:
- return False, u'Socket Error: %s' % arg
+ # TODO: decode arg[1]. On Linux, it's encoded in UTF-8.
+ # How is it encoded in Windows? Or can we somehow just
+ # get the English message?
+ return False, u'Socket Error: %s' % arg[1]
#except UnicodeEncodeError, arg:
# return False, u'Non-ASCII Characters in URL: %s' % arg
if wasRedirected:
@@ -329,7 +333,8 @@
redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding)
return redirChecker.check(useHEAD = False)
else:
- return False, u'HTTP Redirect Loop: %s' % ' -> '.join(self.redirectChain + [self.url])
+ urlList = ['[%s]' % url for url in self.redirectChain + [self.url]]
+ return False, u'HTTP Redirect Loop: %s' % ' -> '.join(urlList)
elif len(self.redirectChain) >= 19:
if useHEAD:
# Some servers don't seem to handle HEAD requests properly,
@@ -339,7 +344,8 @@
redirChecker = LinkChecker(self.redirectChain[0], serverEncoding = self.serverEncoding)
return redirChecker.check(useHEAD = False)
else:
- return False, u'Long Chain of Redirects: %s' % ' -> '.join(self.redirectChain + [self.url])
+ urlList = ['[%s]' % url for url in self.redirectChain + [self.url]]
+ return False, u'Long Chain of Redirects: %s' % ' -> '.join(urlList)
else:
redirChecker = LinkChecker(self.url, self.redirectChain, self.serverEncoding)
return redirChecker.check(useHEAD = useHEAD)
@@ -347,13 +353,13 @@
try:
conn = self.getConnection()
except httplib.error, arg:
- return False, u'HTTP Error: %s' % arg
+ return False, u'HTTP Error: %s (%s)' % (arg.__class__.__name__, arg.line)
try:
conn.request('GET', '%s%s' % (self.path, self.query), None, self.header)
except socket.error, arg:
- return False, u'Socket Error: %s' % arg
- except UnicodeEncodeError, arg:
- return False, u'Non-ASCII Characters in URL: %s' % arg
+ return False, u'Socket Error: %s' % arg[1]
+ #except UnicodeEncodeError, arg:
+ # return False, u'Non-ASCII Characters in URL: %s' % arg
try:
response = conn.getresponse()
except Exception, arg: