Revision: 4466 Author: cosoleto Date: 2007-10-18 09:38:06 +0000 (Thu, 18 Oct 2007)
Log Message: ----------- * [[it:User:RevertBot/Lista_di_esclusione]] the exclusion list wiki page has moved and the load_pages() function doesn't work * Live Search doesn't work if the 'number_of_words' variable is set to 31 (current default). Warnings added. * Python 2.4 compatibility fix ('module' object has no attribute 'quote') and other minor fixes
Modified Paths: -------------- trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-10-17 21:25:20 UTC (rev 4465) +++ trunk/pywikipedia/copyright.py 2007-10-18 09:38:06 UTC (rev 4466) @@ -74,7 +74,7 @@ #
from __future__ import generators -import sys, re, codecs, os, time, urllib2, httplib +import sys, re, codecs, os, time, urllib, urllib2, httplib import wikipedia, pagegenerators, catlib, config
__version__='$Id$' @@ -82,12 +82,12 @@ # Search keywords added to all the queries. no_result_with_those_words = '-Wikipedia'
+# Performing a search engine query if string length is greater than the given value. +min_query_string_len = 120 + # Split the text into strings of a specified number of words. number_of_words = 31
-# Performing a search engine query if string length is greater than the given value. -min_query_string_len = 120 - # Try to skip quoted text. exclude_quote = True
@@ -115,7 +115,7 @@ output_file = wikipedia.datafilepath(appdir, "output.txt")
pages_for_exclusion_database = [ - ('it', 'User:RevertBot/Lista_di_esclusione', 'exclusion_list.txt'), + ('it', 'Wikipedia:Sospette violazioni di copyright/Lista di esclusione', 'exclusion_list.txt'), ('en', 'Wikipedia:Mirrors_and_forks/Abc', 'Abc.txt'), ('en', 'Wikipedia:Mirrors_and_forks/Def', 'Def.txt'), ('en', 'Wikipedia:Mirrors_and_forks/Ghi', 'Ghi.txt'), @@ -259,6 +259,15 @@ def error(text ,prefix = None): _output(text, prefix = prefix, color = error_color)
+def print_stats(): + wikipedia.output('\n' + 'Search engine | number of queries\n' + '---------------------------------\n' + 'Google | %s\n' + 'Yahoo! | %s\n' + 'Live Search | %s\n' + % (num_google_queries, num_yahoo_queries, num_msn_queries)) + def skip_section(text): l = list() for s in sections_to_skip.values(): @@ -308,21 +317,20 @@ raise
if force_update: + data = None try: data = page.get() - f = codecs.open(path, 'w', 'utf-8') - f.write(data) - f.close() except KeyboardInterrupt: raise except wikipedia.IsRedirectPage, arg: - if isinstance(arg, wikipedia.IsRedirectPage): - newtitle = arg.args[0] - else: - newtitle = arg.message - data = wikipedia.Page(page.site(), newtitle).get() + data = page.getRedirectTarget().get() except: error('Getting page failed') + + if data: + f = codecs.open(path, 'w', 'utf-8') + f.write(data) + f.close() return
def check_list(url, clist, verbose = False): @@ -542,8 +550,6 @@
return text
-excl_list = exclusion_list() - def exclusion_list_sanity_check(): print "Exclusion list sanity check..." for entry in excl_list: @@ -614,8 +620,11 @@ consecutive = False if " " in search_words: search_words = search_words[:search_words.rindex(" ")] + results = get_results(search_words) + group_url = '' ; cmp_group_url = '' + for url, engine, comment in results: if comment: group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment)) @@ -624,8 +633,11 @@ cmp_group_url += '\n*%s - %s' % (engine, url) if results: group_url_list = group_url.splitlines() + cmp_group_url_list = cmp_group_url.splitlines() group_url_list.sort() + cmp_group_url_list.sort() group_url = '\n'.join(group_url_list) + cmp_group_url = '\n'.join(cmp_group_url_list) if previous_group_url == cmp_group_url: if consecutive: output += ' ' + search_words @@ -808,7 +820,7 @@
if config.copyright_show_length: length = s.length() - if length: + if length > 1024: # convert in kilobyte length /= 1024 unit = 'KB' @@ -816,13 +828,13 @@ # convert in megabyte length /= 1024 unit = 'MB' - if length > 0: - comment.append("%d %s" % (length, unit)) + if length > 0: + comment.append("%d %s" % (length, unit))
if cache: if cache_url: if engine == 'google': - comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % urllib2.quote(short_url(add_item))) + comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % urllib.quote(short_url(add_item))) elif engine == 'yahoo': #cache = False #comment.append('[%s Yahoo cache]' % re.sub('&appid=[^&]*','', urllib2.unquote(cache_url))) @@ -993,13 +1005,10 @@ except wikipedia.NoPage: wikipedia.output(u'Page %s not found' % page.title()) continue - except wikipedia.IsRedirectPage, error: - if isinstance(error, wikipedia.IsRedirectPage): - newtitle = error.args[0] - else: - newtitle = error.message - wikipedia.output(u'Page %s redirect to '%s'' % (page.aslink(), newtitle)) - bot = CheckRobot(iter([wikipedia.Page(page.site(), newtitle),])) + except wikipedia.IsRedirectPage: + newpage = page.getRedirectTarget() + wikipedia.output(u'Page %s redirect to '%s'' % (page.aslink(), newpage.title())) + bot = CheckRobot(iter([newpage,])) bot.run() continue
@@ -1095,6 +1104,9 @@ elif arg.startswith('-skipquery'): if len(arg) >= 11: config.copyright_skip_query = int(arg[11:]) + elif arg.startswith('-nwords'): + if len(arg) >= 8: + number_of_words = int(arg[8:]) elif arg.startswith('-text'): if len(arg) >= 6: text = arg[6:] @@ -1154,8 +1166,19 @@ bot = CheckRobot(preloadingGen) bot.run()
+excl_list = exclusion_list() + +if number_of_words > 22: + if not config.copyright_google and not config.copyright_yahoo and config.copyright_msn: + warn("'number_of_words' variable set to 22 as Live Search requires a lower value of %s" % number_of_words, prefix = 'Warning') + number_of_words = 22 + elif config.copyright_msn: + warn("Live Search requires a lower value for 'number_of_words' variable " + "(current value is %d, a good value may be 22)." % (number_of_words), prefix = 'Warning') + if __name__ == "__main__": try: main() finally: wikipedia.stopme() + print_stats()