Revision: 4089
Author:   cosoleto
Date:     2007-08-22 20:35:48 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
Add the last-modified date and the length of each URL to copyright.py's result. The script can also add a Google cache link if a 400 HTTP status code is found. The check_in_source() code has been rewritten in an object-oriented style and equipped with simple logic to decode UTF-8 data.
Modified Paths:
--------------
    trunk/pywikipedia/config.py
    trunk/pywikipedia/copyright.py
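To make the log message concrete, here is a small illustrative sketch (not part of the commit; URLs and values are invented) of how the new per-URL comment shows up in the report. Each result tuple now carries a third element, a list of strings such as a last-modified date, a size and, after an HTTP error, a cache link; the list is joined with "; " and appended in parentheses, exactly as the first copyright.py hunk below does:

# Hypothetical (url, engine, comment) tuples, shaped like those built by add_in_urllist().
results = [
    ('http://example.org/page.html', 'google', ['22/8/2007', '14 KB']),
    ('http://example.net/copy.html', 'google, yahoo', []),
]

group_url = ''
for url, engine, comment in results:
    if comment:
        # e.g. "*google - http://example.org/page.html (22/8/2007; 14 KB)"
        group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment))
    else:
        group_url += '\n*%s - %s' % (engine, url)
print(group_url)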
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py        2007-08-22 18:03:28 UTC (rev 4088)
+++ trunk/pywikipedia/config.py        2007-08-22 20:35:48 UTC (rev 4089)
@@ -323,6 +323,12 @@
 # Number of attempts on connection error.
 copyright_connection_tries = 10
+# Append last modified date of URL to script result
+copyright_show_date = True
+
+# Append length of URL to script result
+copyright_show_length = True
+
 ############## FURTHER SETTINGS ##############
 
 # The bot can make some additional changes to each page it edits, e.g. fix
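As with every other option in config.py, the two new settings can be overridden in user-config.py by users who do not want the extra detail; a minimal, hypothetical example:

# user-config.py (excerpt): keep the last-modified date, drop the length
copyright_show_date = True
copyright_show_length = False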
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py        2007-08-22 18:03:28 UTC (rev 4088)
+++ trunk/pywikipedia/copyright.py        2007-08-22 20:35:48 UTC (rev 4089)
@@ -456,8 +456,11 @@
             search_words = search_words[:search_words.rindex(" ")]
         results = get_results(search_words)
         group_url = ''
-        for url, engine in results:
-            group_url += '\n*%s - %s' % (engine, url)
+        for url, engine, comment in results:
+            if comment:
+                group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment))
+            else:
+                group_url += '\n*%s - %s' % (engine, url)
         if results:
             group_url_list = group_url.splitlines()
             group_url_list.sort()
@@ -482,51 +485,177 @@
 source_seen = set()
 positive_source_seen = set()
-def check_in_source(url):
+class NoWebPage(Exception):
+    """Web page does not exist (404)"""
+
+class URL_exclusion(Exception):
+    """URL in exclusion list"""
+
+class WebPage(object):
     """
-    Sources may be different from search engine database and include mentions of
-    Wikipedia. This function avoid also errors in search results that can occurs
-    either with Google and Yahoo! service.
     """
-    import urllib2
-    global excl_list, source_seen, positive_source_seen
 
-    if url in positive_source_seen:
-        return True
+    def __init__(self, url):
+        """
+        """
 
-    if url in source_seen:
-        return False
+        if check_list(url, excl_list):
+            raise URL_exclusion
 
-    if check_list(url, excl_list):
-        return False
+        self._url = url
-    # very experimental code
-    if not url[-4:] in [".pdf", ".doc", ".ppt"]:
         try:
-            resp = urllib2.urlopen(url)
-            text = resp.read()
-            #resp.close()
-        except urllib2.HTTPError:
+            self._urldata = urllib2.urlopen(urllib2.Request(self._url, None, { 'User-Agent': wikipedia.useragent }))
+        #except httplib.BadStatusLine, line:
+        #    print 'URL: %s\nBad status line: %s' % (url, line)
+        except urllib2.HTTPError, err:
+            print "HTTP error: %d / %s (%s)" % (err.code, err.msg, url)
+            #if err.code == 404:
+            if err.code >= 400:
+                raise NoWebPage
+            return None
+        #except urllib2.URLError:
+        except Exception, err:
+            print "ERROR: %s" % (err)
+
+        self._lastmodified = self._urldata.info().getdate('Last-Modified')
+        self._length = self._urldata.info().getheader('Content-Length')
+        self._content_type = self._urldata.info().getheader('Content-Type')
+
+    def length(self):
+        if hasattr(self, '_length'):
+            if self._length:
+                return int(self._length)
+        if hasattr(self, '_contents'):
+            return len(self._contents)
+
+        # print "No length for " + self._url
+
+        return None
+
+    def lastmodified(self):
+        if hasattr(self, '_lastmodified'):
+            return self._lastmodified
+        return None
+
+    def get(self, force = False):
+        """
+        """
+
+        # Exclude URL with listed file extension.
+        if self._url[-4:] in [".pdf", ".doc", ".ppt"]:
+            raise URL_exclusion
+
+        # Make sure we did try to get the contents once
+        if not hasattr(self, '_contents'):
+            self._contents = self._urldata.read()
+            return self._contents
+        return None
+
+    def check_in_source(self):
+        """
+        Sources may be different from search engine database and include mentions of
+        Wikipedia. This function avoid also errors in search results that can occurs
+        either with Google and Yahoo! service.
+        """
+        global excl_list, source_seen, positive_source_seen
+
+        if not hasattr(self, '_urldata'):
             return False
-        if reWikipediaC.search(text):
-            # if 'wikipedia' in text.lower():
-            excl_list += [url]
-            #write_log(url + '\n', "copyright/sites_with_'wikipedia'.txt")
-            positive_source_seen.add(url)
+        if self._url in positive_source_seen:
             return True
-        else:
-            #write_log(url + '\n', "copyright/sites_without_'wikipedia'.txt")
-            source_seen.add(url)
-            return False
+        if self._url in source_seen:
+            return False
+
+        text = self.get()
+
+        # Character encoding conversion if 'Content-Type' field has
+        # charset attribute set to UTF-8.
+
+        if text:
+            if 'utf-8' in self._content_type.lower():
+                text = text.decode("utf-8", 'replace')
+            else:
+                # <META> declaration with "http-equiv" set to "Content-Type" in HTML document.
+                if 'text/html' in self._content_type and (re.search("(?is)<meta\s.*?charset\s*=\s*[\"\']*\s*UTF-8.*?>", text) or re.search("(?is)<\?.*?encoding\s*=\s*[\"\']*\s*UTF-8.*?\?>", text)):
+                    text = text.decode("utf-8", 'replace')
+
+        m = reWikipediaC.search(text)
+        if m:
+            excl_list += [self._url]
+            write_log("%s (%s)\n" % (self._url, m.group()), "copyright/sites_with_'wikipedia'.txt")
+            positive_source_seen.add(self._url)
+            return True
+        else:
+            write_log(self._url + '\n', "copyright/sites_without_'wikipedia'.txt")
+            source_seen.add(self._url)
+            return False
+
 def add_in_urllist(url, add_item, engine):
+
+    if (engine == 'google' and config.copyright_check_in_source_google) or \
+       (engine == 'yahoo' and config.copyright_check_in_source_yahoo):
+        check_in_source = True
+    else:
+        check_in_source = False
+
+    if check_in_source or config.copyright_show_date or config.copyright_show_length:
+        s = None
+        cache = False
+
+        # list to store date, length, cache URL
+        comment = list()
+
+        try:
+            s = WebPage(add_item)
+        except URL_exclusion:
+            pass
+        except NoWebPage:
+            cache = True
+
+        if s:
+            # Before of add url in result list, perform the check in source
+            if check_in_source:
+                if s.check_in_source():
+                    return
+
+            if config.copyright_show_date:
+                date = s.lastmodified()
+                if date:
+                    if date[:3] != time.localtime()[:3]:
+                        comment.append("%s/%s/%s" % (date[2], date[1], date[0]))
+
+            unit = 'bytes'
+
+            if config.copyright_show_length:
+                length = s.length()
+                if length:
+                    # convert in kilobyte
+                    length /= 1024
+                    unit = 'KB'
+                    if length > 1024:
+                        # convert in megabyte
+                        length /= 1024
+                        unit = 'MB'
+                    if length > 0:
+                        comment.append("%d %s" % (length, unit))
+
+        if cache:
+            if engine == 'google':
+                comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s google cache]' % add_item[7:])
+            elif engine == 'yahoo':
+                cache = False
+            elif engine == 'msn':
+                cache = False
+
     for i in range(len(url)):
         if add_item in url[i]:
             if engine not in url[i][1]:
-                url[i] = (add_item, url[i][1] + ', ' + engine)
+                url[i] = (add_item, url[i][1] + ', ' + engine, comment)
             return
-    url.append((add_item, engine))
+    url.append((add_item, engine, comment))
     return
 def get_results(query, numresults = 10):
@@ -543,9 +672,6 @@
                 data = google.doGoogleSearch('-Wikipedia "' + query + '"')
                 search_request_retry = 0
                 for entry in data.results:
-                    if config.copyright_check_in_source_google:
-                        if check_in_source(entry.URL):
-                            continue
                     add_in_urllist(url, entry.URL, 'google')
             except KeyboardInterrupt:
                 raise
@@ -565,9 +691,6 @@
         while search_request_retry:
             try:
                 for entry in data.parse_results():
-                    if config.copyright_check_in_source_yahoo:
-                        if check_in_source(entry.Url):
-                            continue
                     add_in_urllist(url, entry.Url, 'yahoo')
                 search_request_retry = 0
             except Exception, err:
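For orientation, a rough usage sketch of the new WebPage class, approximating what add_in_urllist() now does internally. It is illustrative only: the URL is hypothetical and it assumes copyright.py's module globals (excl_list, source_seen, positive_source_seen, reWikipediaC, wikipedia.useragent) are already initialised:

try:
    page = WebPage('http://example.org/article.html')   # hypothetical URL
except URL_exclusion:
    print('URL is on the exclusion list, skipped')
except NoWebPage:
    print('HTTP error >= 400, only a search engine cache link can be offered')
else:
    date = page.lastmodified()   # parsed 'Last-Modified' header, or None
    size = page.length()         # 'Content-Length' in bytes, or None
    if page.check_in_source():
        print('page mentions Wikipedia, excluded from future results')
    else:
        print('candidate for the report, modified %s, %s bytes' % (date, size))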