Revision: 4089
Author: cosoleto
Date: 2007-08-22 20:35:48 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Add the last-modified date and the content length of each URL to copyright.py's
results. The script can also add a Google cache link when an HTTP error status
code (400 or above) is returned. The check_in_source() code has been rewritten
in an object-oriented style and equipped with simple logic to decode UTF-8 data.
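
For illustration, a minimal sketch of how the new WebPage logic reads the
headers behind the date and length annotations, and where the cache-link
fallback kicks in (Python 2 idiom matching the codebase; the URL and user
agent below are placeholders):

    import urllib2

    req = urllib2.Request('http://www.example.org/', None,
                          {'User-Agent': 'copyright.py sketch'})
    try:
        data = urllib2.urlopen(req)
        # Parsed time tuple (or None) and raw header string, as read
        # in WebPage.__init__()
        lastmodified = data.info().getdate('Last-Modified')
        length = data.info().getheader('Content-Length')
        print lastmodified, length
    except urllib2.HTTPError, err:
        if err.code >= 400:
            # copyright.py falls back to a Google cache link here
            print 'HTTP error %d, would append a cache link' % err.code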
Modified Paths:
--------------
trunk/pywikipedia/config.py
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2007-08-22 18:03:28 UTC (rev 4088)
+++ trunk/pywikipedia/config.py 2007-08-22 20:35:48 UTC (rev 4089)
@@ -323,6 +323,12 @@
# Number of attempts on connection error.
copyright_connection_tries = 10
+# Append the last-modified date of the URL to the script's results
+copyright_show_date = True
+
+# Append the content length of the URL to the script's results
+copyright_show_length = True
+
############## FURTHER SETTINGS ##############
# The bot can make some additional changes to each page it edits, e.g. fix
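
These two switches are consumed by add_in_urllist() in copyright.py. A
condensed sketch of that logic (simplified from the diff below; the sample
'date' tuple and 'length' string stand in for values read from the HTTP
headers, and 'config' is the pywikipedia config module patched above):

    import config

    date = (2007, 8, 22, 0, 0, 0, 0, 0, 0)   # sample parsed Last-Modified tuple
    length = '2048'                           # sample Content-Length header

    comment = []
    if config.copyright_show_date and date:
        # day/month/year; copyright.py skips pages modified today
        comment.append('%s/%s/%s' % (date[2], date[1], date[0]))
    if config.copyright_show_length and length:
        # size in KB (copyright.py switches to MB above 1024 KB)
        comment.append('%d KB' % (int(length) / 1024))
    print '; '.join(comment)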
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-08-22 18:03:28 UTC (rev 4088)
+++ trunk/pywikipedia/copyright.py 2007-08-22 20:35:48 UTC (rev 4089)
@@ -456,8 +456,11 @@
search_words = search_words[:search_words.rindex(" ")]
results = get_results(search_words)
group_url = ''
- for url, engine in results:
- group_url += '\n*%s - %s' % (engine, url)
+ for url, engine, comment in results:
+ if comment:
+ group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment))
+ else:
+ group_url += '\n*%s - %s' % (engine, url)
if results:
group_url_list = group_url.splitlines()
group_url_list.sort()
@@ -482,51 +485,177 @@
source_seen = set()
positive_source_seen = set()
-def check_in_source(url):
+class NoWebPage(Exception):
+ """Web page does not exist (404)"""
+
+class URL_exclusion(Exception):
+ """URL in exclusion list"""
+
+class WebPage(object):
"""
- Sources may be different from search engine database and include mentions of
- Wikipedia. This function avoid also errors in search results that can occurs
- either with Google and Yahoo! service.
"""
- import urllib2
- global excl_list, source_seen, positive_source_seen
- if url in positive_source_seen:
- return True
+ def __init__(self, url):
+ """
+ """
- if url in source_seen:
- return False
+ if check_list(url, excl_list):
+ raise URL_exclusion
- if check_list(url, excl_list):
- return False
+ self._url = url
- # very experimental code
- if not url[-4:] in [".pdf", ".doc", ".ppt"]:
try:
- resp = urllib2.urlopen(url)
- text = resp.read()
- #resp.close()
- except urllib2.HTTPError:
+ self._urldata = urllib2.urlopen(urllib2.Request(self._url, None, { 'User-Agent': wikipedia.useragent }))
+ #except httplib.BadStatusLine, line:
+ # print 'URL: %s\nBad status line: %s' % (url, line)
+ except urllib2.HTTPError, err:
+ print "HTTP error: %d / %s (%s)" % (err.code, err.msg, url)
+ #if err.code == 404:
+ if err.code >= 400:
+ raise NoWebPage
+ return None
+ #except urllib2.URLError:
+ except Exception, err:
+ print "ERROR: %s" % (err)
+
+ self._lastmodified = self._urldata.info().getdate('Last-Modified')
+ self._length = self._urldata.info().getheader('Content-Length')
+ self._content_type = self._urldata.info().getheader('Content-Type')
+
+ def length(self):
+ if hasattr(self, '_length'):
+ if self._length:
+ return int(self._length)
+ if hasattr(self, '_contents'):
+ return len(self._contents)
+
+ # print "No length for " + self._url
+
+ return None
+
+ def lastmodified(self):
+ if hasattr(self, '_lastmodified'):
+ return self._lastmodified
+ return None
+
+ def get(self, force = False):
+ """
+ """
+
+ # Exclude URLs with a listed file extension.
+ if self._url[-4:] in [".pdf", ".doc", ".ppt"]:
+ raise URL_exclusion
+
+ # Download the contents only once; subsequent calls return None.
+ if not hasattr(self, '_contents'):
+ self._contents = self._urldata.read()
+ return self._contents
+ return None
+
+ def check_in_source(self):
+ """
+ Sources may differ from the search engine's database and may include
+ mentions of Wikipedia. This function also avoids errors in search
+ results that can occur with both the Google and Yahoo! services.
+ """
+ global excl_list, source_seen, positive_source_seen
+
+ if not hasattr(self, '_urldata'):
return False
- if reWikipediaC.search(text):
- # if 'wikipedia' in text.lower():
- excl_list += [url]
- #write_log(url + '\n', "copyright/sites_with_'wikipedia'.txt")
- positive_source_seen.add(url)
+ if self._url in positive_source_seen:
return True
- else:
- #write_log(url + '\n', "copyright/sites_without_'wikipedia'.txt")
- source_seen.add(url)
- return False
+ if self._url in source_seen:
+ return False
+
+ text = self.get()
+
+ # Convert the character encoding if the 'Content-Type' header's
+ # charset attribute is set to UTF-8.
+
+ if text:
+ if 'utf-8' in self._content_type.lower():
+ text = text.decode("utf-8", 'replace')
+ else:
+ # <META> declaration with "http-equiv" set to "Content-Type" in HTML document.
+ if 'text/html' in self._content_type and (re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>", text) or re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>", text)):
+ text = text.decode("utf-8", 'replace')
+
+ m = reWikipediaC.search(text)
+ if m:
+ excl_list += [self._url]
+ write_log("%s (%s)\n" % (self._url, m.group()),
"copyright/sites_with_'wikipedia'.txt")
+ positive_source_seen.add(self._url)
+ return True
+ else:
+ write_log(self._url + '\n', "copyright/sites_without_'wikipedia'.txt")
+ source_seen.add(self._url)
+ return False
+
def add_in_urllist(url, add_item, engine):
+
+ if (engine == 'google' and config.copyright_check_in_source_google) or \
+ (engine == 'yahoo' and config.copyright_check_in_source_yahoo):
+ check_in_source = True
+ else:
+ check_in_source = False
+
+ if check_in_source or config.copyright_show_date or config.copyright_show_length:
+ s = None
+ cache = False
+
+ # list to store date, length, cache URL
+ comment = list()
+
+ try:
+ s = WebPage(add_item)
+ except URL_exclusion:
+ pass
+ except NoWebPage:
+ cache = True
+
+ if s:
+ # Before adding the URL to the result list, perform the in-source check.
+ if check_in_source:
+ if s.check_in_source():
+ return
+
+ if config.copyright_show_date:
+ date = s.lastmodified()
+ if date:
+ if date[:3] != time.localtime()[:3]:
+ comment.append("%s/%s/%s" % (date[2], date[1],
date[0]))
+
+ unit = 'bytes'
+
+ if config.copyright_show_length:
+ length = s.length()
+ if length:
+ # convert to kilobytes
+ length /= 1024
+ unit = 'KB'
+ if length > 1024:
+ # convert to megabytes
+ length /= 1024
+ unit = 'MB'
+ if length > 0:
+ comment.append("%d %s" % (length, unit))
+
+ if cache:
+ if engine == 'google':
+ comment.append('[http://www.google.com/search?sourceid=navclient&q=… google cache]' % add_item[7:])
+ elif engine == 'yahoo':
+ cache = False
+ elif engine == 'msn':
+ cache = False
+
for i in range(len(url)):
if add_item in url[i]:
if engine not in url[i][1]:
- url[i] = (add_item, url[i][1] + ', ' + engine)
+ url[i] = (add_item, url[i][1] + ', ' + engine, comment)
return
- url.append((add_item, engine))
+ url.append((add_item, engine, comment))
return
def get_results(query, numresults = 10):
@@ -543,9 +672,6 @@
data = google.doGoogleSearch('-Wikipedia "' + query + '"')
search_request_retry = 0
for entry in data.results:
- if config.copyright_check_in_source_google:
- if check_in_source(entry.URL):
- continue
add_in_urllist(url, entry.URL, 'google')
except KeyboardInterrupt:
raise
@@ -565,9 +691,6 @@
while search_request_retry:
try:
for entry in data.parse_results():
- if config.copyright_check_in_source_yahoo:
- if check_in_source(entry.Url):
- continue
add_in_urllist(url, entry.Url, 'yahoo')
search_request_retry = 0
except Exception, err:
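
As an aside, the new charset handling in check_in_source() can be read in
isolation. A self-contained sketch (the function name and arguments are
illustrative; the regular expressions are taken from the diff above):

    import re

    def maybe_decode_utf8(content_type, text):
        # Trust an explicit UTF-8 charset in the HTTP header first.
        if 'utf-8' in content_type.lower():
            return text.decode('utf-8', 'replace')
        # Otherwise look for a <META http-equiv> or <?xml?> declaration.
        if 'text/html' in content_type and \
           (re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>", text) or
            re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>", text)):
            return text.decode('utf-8', 'replace')
        return text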