Revision: 4466 Author: cosoleto Date: 2007-10-18 09:38:06 +0000 (Thu, 18 Oct 2007)
Log Message: ----------- * [[it:User:RevertBot/Lista_di_esclusione]] the exclusion list wiki page has moved and the load_pages() function doesn't work * Live Search doesn't work if the 'number_of_words' variable is set to 31 (current default). Warnings added. * Python 2.4 compatibility fix ('module' object has no attribute 'quote') and other minor fixes
Modified Paths: -------------- trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-10-17 21:25:20 UTC (rev 4465) +++ trunk/pywikipedia/copyright.py 2007-10-18 09:38:06 UTC (rev 4466) @@ -74,7 +74,7 @@ #
from __future__ import generators -import sys, re, codecs, os, time, urllib2, httplib +import sys, re, codecs, os, time, urllib, urllib2, httplib import wikipedia, pagegenerators, catlib, config
__version__='$Id$' @@ -82,12 +82,12 @@ # Search keywords added to all the queries. no_result_with_those_words = '-Wikipedia'
+# Performing a search engine query if string length is greater than the given value. +min_query_string_len = 120 + # Split the text into strings of a specified number of words. number_of_words = 31
-# Performing a search engine query if string length is greater than the given value. -min_query_string_len = 120 - # Try to skip quoted text. exclude_quote = True
@@ -115,7 +115,7 @@ output_file = wikipedia.datafilepath(appdir, "output.txt")
pages_for_exclusion_database = [ - ('it', 'User:RevertBot/Lista_di_esclusione', 'exclusion_list.txt'), + ('it', 'Wikipedia:Sospette violazioni di copyright/Lista di esclusione', 'exclusion_list.txt'), ('en', 'Wikipedia:Mirrors_and_forks/Abc', 'Abc.txt'), ('en', 'Wikipedia:Mirrors_and_forks/Def', 'Def.txt'), ('en', 'Wikipedia:Mirrors_and_forks/Ghi', 'Ghi.txt'), @@ -259,6 +259,15 @@ def error(text ,prefix = None): _output(text, prefix = prefix, color = error_color)
+def print_stats(): + wikipedia.output('\n' + 'Search engine | number of queries\n' + '---------------------------------\n' + 'Google | %s\n' + 'Yahoo! | %s\n' + 'Live Search | %s\n' + % (num_google_queries, num_yahoo_queries, num_msn_queries)) + def skip_section(text): l = list() for s in sections_to_skip.values(): @@ -308,21 +317,20 @@ raise
if force_update: + data = None try: data = page.get() - f = codecs.open(path, 'w', 'utf-8') - f.write(data) - f.close() except KeyboardInterrupt: raise except wikipedia.IsRedirectPage, arg: - if isinstance(arg, wikipedia.IsRedirectPage): - newtitle = arg.args[0] - else: - newtitle = arg.message - data = wikipedia.Page(page.site(), newtitle).get() + data = page.getRedirectTarget().get() except: error('Getting page failed') + + if data: + f = codecs.open(path, 'w', 'utf-8') + f.write(data) + f.close() return
def check_list(url, clist, verbose = False): @@ -542,8 +550,6 @@
return text
-excl_list = exclusion_list() - def exclusion_list_sanity_check(): print "Exclusion list sanity check..." for entry in excl_list: @@ -614,8 +620,11 @@ consecutive = False if " " in search_words: search_words = search_words[:search_words.rindex(" ")] + results = get_results(search_words) + group_url = '' ; cmp_group_url = '' + for url, engine, comment in results: if comment: group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment)) @@ -624,8 +633,11 @@ cmp_group_url += '\n*%s - %s' % (engine, url) if results: group_url_list = group_url.splitlines() + cmp_group_url_list = cmp_group_url.splitlines() group_url_list.sort() + cmp_group_url_list.sort() group_url = '\n'.join(group_url_list) + cmp_group_url = '\n'.join(cmp_group_url_list) if previous_group_url == cmp_group_url: if consecutive: output += ' ' + search_words @@ -808,7 +820,7 @@
if config.copyright_show_length: length = s.length() - if length: + if length > 1024: # convert in kilobyte length /= 1024 unit = 'KB' @@ -816,13 +828,13 @@ # convert in megabyte length /= 1024 unit = 'MB' - if length > 0: - comment.append("%d %s" % (length, unit)) + if length > 0: + comment.append("%d %s" % (length, unit))
if cache: if cache_url: if engine == 'google': - comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % urllib2.quote(short_url(add_item))) + comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % urllib.quote(short_url(add_item))) elif engine == 'yahoo': #cache = False #comment.append('[%s Yahoo cache]' % re.sub('&appid=[^&]*','', urllib2.unquote(cache_url))) @@ -993,13 +1005,10 @@ except wikipedia.NoPage: wikipedia.output(u'Page %s not found' % page.title()) continue - except wikipedia.IsRedirectPage, error: - if isinstance(error, wikipedia.IsRedirectPage): - newtitle = error.args[0] - else: - newtitle = error.message - wikipedia.output(u'Page %s redirect to '%s'' % (page.aslink(), newtitle)) - bot = CheckRobot(iter([wikipedia.Page(page.site(), newtitle),])) + except wikipedia.IsRedirectPage: + newpage = page.getRedirectTarget() + wikipedia.output(u'Page %s redirect to '%s'' % (page.aslink(), newpage.title())) + bot = CheckRobot(iter([newpage,])) bot.run() continue
@@ -1095,6 +1104,9 @@ elif arg.startswith('-skipquery'): if len(arg) >= 11: config.copyright_skip_query = int(arg[11:]) + elif arg.startswith('-nwords'): + if len(arg) >= 8: + number_of_words = int(arg[8:]) elif arg.startswith('-text'): if len(arg) >= 6: text = arg[6:] @@ -1154,8 +1166,19 @@ bot = CheckRobot(preloadingGen) bot.run()
+excl_list = exclusion_list() + +if number_of_words > 22: + if not config.copyright_google and not config.copyright_yahoo and config.copyright_msn: + warn("'number_of_words' variable set to 22 as Live Search requires a lower value of %s" % number_of_words, prefix = 'Warning') + number_of_words = 22 + elif config.copyright_msn: + warn("Live Search requires a lower value for 'number_of_words' variable " + "(current value is %d, a good value may be 22)." % (number_of_words), prefix = 'Warning') + if __name__ == "__main__": try: main() finally: wikipedia.stopme() + print_stats()