SVN: [4085] trunk/pywikipedia/copyright.py - Pywikipedia-l

22 Aug 2007

Revision: 4085
Author:   cosoleto
Date:     2007-08-22 08:38:50 +0000 (Wed, 22 Aug 2007)
Log Message:
-----------
code cleanup
Modified Paths:
--------------
    trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py
===================================================================

--- trunk/pywikipedia/copyright.py	2007-08-21 23:52:43 UTC (rev 4084)
+++ trunk/pywikipedia/copyright.py	2007-08-22 08:38:50 UTC (rev 4085)
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 # -*- coding: utf-8  -*-
 """
-This robot checks copyright text in Google and Yahoo.
+This robot checks copyright text in Google, Yahoo and Live Search.
 Google search requires to install the pyGoogle module from
 http://pygoogle.sf.net and get a Google API license key from
@@ -11,6 +11,9 @@
 Yahoo! search requires pYsearch module from http://pysearch.sourceforge.net
 and a Yahoo AppID from http://developer.yahoo.com.
+Windows Live Search requires to install the SOAPpy module from
+http://pywebsvcs.sf.net and get an AppID from http://search.msn.com/developer.
+
 You can run the bot with the following commandline parameters:
 -g           - Use Google search engine
@@ -182,7 +185,7 @@
 sections_to_skip = {
     'en':['References', 'Further reading', 'Citations', 'External links'],
-    'it':['Bibliografia', 'Riferimenti bibliografici', "Collegamenti esterni"],
+    'it':['Bibliografia', 'Riferimenti bibliografici', 'Collegamenti esterni',  'Pubblicazioni principali'],
 }
 def skip_section(text):
@@ -222,23 +225,27 @@
 def load_pages(force_update = False):
     for page, path in exclusion_file_list():
         try:
-            length = 0
-            length = os.path.getsize(path)
-            file_age = time.time() - os.path.getmtime(path)
-            if file_age > 24 * 60 * 60:
-                print 'Updating page [[' + page.title() + ']] to exclude new URLs...'
-                length = 0
+            if not os.path.exists(path):
+                    print 'Creating file \'%s\' ([[%s]])' % (path, page.title())
+                    force_update = True
+            else:
+                file_age = time.time() - os.path.getmtime(path)
+                if file_age > 24 * 60 * 60:
+                    print 'Updating file \'%s\' ([[%s]])' % (path, page.title())
+                    force_update = True
         except OSError:
-            pass
+            raise
-        if length == 0 or force_update:
+        if force_update:
             try:
                 data = page.get()
                 f = codecs.open(path, 'w', 'utf-8')
                 f.write(data)
                 f.close()
-            except wikipedia.IsRedirectPage:
-                data = page.get(get_redirect=True)
+            except KeyboardInterrupt:
+                raise
+            except wikipedia.IsRedirectPage, arg:
+                data = wikipedia.Page(page.site(), arg).get()
             except:
                 print 'Getting page failed'
     return
@@ -666,13 +673,13 @@
     for arg in wikipedia.handleArgs():
         #if arg.startswith('-repeat'):
         #    repeat = True
-        if arg.startswith('-y'):
+        if arg == '-y':
             config.copyright_yahoo = True
-        elif arg.startswith('-g'):
+        elif arg == '-g':
             config.copyright_google = True
-        elif arg.startswith('-ny'):
+        elif arg == '-ny':
             config.copyright_yahoo = False
-        elif arg.startswith('-ng'):
+        elif arg == '-ng':
             config.copyright_google = False
         elif arg.startswith('-output'):
             if len(arg) >= 8: