Revision: 4466
Author: cosoleto
Date: 2007-10-18 09:38:06 +0000 (Thu, 18 Oct 2007)
Log Message:
-----------
* [[it:User:RevertBot/Lista_di_esclusione]] exclusion list wiki page has moved and
the load_pages() function doesn't work
* Live Search doesn't work if the 'number_of_words' variable is set to 31 (the current
default). Warnings added.
* Python 2.4 compatibility fix ('module' object has no attribute 'quote')
and other minor fixes
Modified Paths:
--------------
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-10-17 21:25:20 UTC (rev 4465)
+++ trunk/pywikipedia/copyright.py 2007-10-18 09:38:06 UTC (rev 4466)
@@ -74,7 +74,7 @@
#
from __future__ import generators
-import sys, re, codecs, os, time, urllib2, httplib
+import sys, re, codecs, os, time, urllib, urllib2, httplib
import wikipedia, pagegenerators, catlib, config
__version__='$Id$'
@@ -82,12 +82,12 @@
# Search keywords added to all the queries.
no_result_with_those_words = '-Wikipedia'
+# Performing a search engine query if string length is greater than the given value.
+min_query_string_len = 120
+
# Split the text into strings of a specified number of words.
number_of_words = 31
-# Performing a search engine query if string length is greater than the given value.
-min_query_string_len = 120
-
# Try to skip quoted text.
exclude_quote = True
@@ -115,7 +115,7 @@
output_file = wikipedia.datafilepath(appdir, "output.txt")
pages_for_exclusion_database = [
- ('it', 'User:RevertBot/Lista_di_esclusione',
'exclusion_list.txt'),
+ ('it', 'Wikipedia:Sospette violazioni di copyright/Lista di
esclusione', 'exclusion_list.txt'),
('en', 'Wikipedia:Mirrors_and_forks/Abc', 'Abc.txt'),
('en', 'Wikipedia:Mirrors_and_forks/Def', 'Def.txt'),
('en', 'Wikipedia:Mirrors_and_forks/Ghi', 'Ghi.txt'),
@@ -259,6 +259,15 @@
def error(text ,prefix = None):
_output(text, prefix = prefix, color = error_color)
+def print_stats():
+ wikipedia.output('\n'
+ 'Search engine | number of queries\n'
+ '---------------------------------\n'
+ 'Google | %s\n'
+ 'Yahoo! | %s\n'
+ 'Live Search | %s\n'
+ % (num_google_queries, num_yahoo_queries, num_msn_queries))
+
def skip_section(text):
l = list()
for s in sections_to_skip.values():
@@ -308,21 +317,20 @@
raise
if force_update:
+ data = None
try:
data = page.get()
- f = codecs.open(path, 'w', 'utf-8')
- f.write(data)
- f.close()
except KeyboardInterrupt:
raise
except wikipedia.IsRedirectPage, arg:
- if isinstance(arg, wikipedia.IsRedirectPage):
- newtitle = arg.args[0]
- else:
- newtitle = arg.message
- data = wikipedia.Page(page.site(), newtitle).get()
+ data = page.getRedirectTarget().get()
except:
error('Getting page failed')
+
+ if data:
+ f = codecs.open(path, 'w', 'utf-8')
+ f.write(data)
+ f.close()
return
def check_list(url, clist, verbose = False):
@@ -542,8 +550,6 @@
return text
-excl_list = exclusion_list()
-
def exclusion_list_sanity_check():
print "Exclusion list sanity check..."
for entry in excl_list:
@@ -614,8 +620,11 @@
consecutive = False
if " " in search_words:
search_words = search_words[:search_words.rindex("
")]
+
results = get_results(search_words)
+
group_url = '' ; cmp_group_url = ''
+
for url, engine, comment in results:
if comment:
group_url += '\n*%s - %s (%s)' % (engine, url, ";
".join(comment))
@@ -624,8 +633,11 @@
cmp_group_url += '\n*%s - %s' % (engine, url)
if results:
group_url_list = group_url.splitlines()
+ cmp_group_url_list = cmp_group_url.splitlines()
group_url_list.sort()
+ cmp_group_url_list.sort()
group_url = '\n'.join(group_url_list)
+ cmp_group_url = '\n'.join(cmp_group_url_list)
if previous_group_url == cmp_group_url:
if consecutive:
output += ' ' + search_words
@@ -808,7 +820,7 @@
if config.copyright_show_length:
length = s.length()
- if length:
+ if length > 1024:
# convert in kilobyte
length /= 1024
unit = 'KB'
@@ -816,13 +828,13 @@
# convert in megabyte
length /= 1024
unit = 'MB'
- if length > 0:
- comment.append("%d %s" % (length, unit))
+ if length > 0:
+ comment.append("%d %s" % (length, unit))
if cache:
if cache_url:
if engine == 'google':
-
comment.append('[http://www.google.com/search?sourceid=navclient&q=… Google
cache]' % urllib2.quote(short_url(add_item)))
+
comment.append('[http://www.google.com/search?sourceid=navclient&q=… Google
cache]' % urllib.quote(short_url(add_item)))
elif engine == 'yahoo':
#cache = False
#comment.append('[%s Yahoo cache]' %
re.sub('&appid=[^&]*','', urllib2.unquote(cache_url)))
@@ -993,13 +1005,10 @@
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found' % page.title())
continue
- except wikipedia.IsRedirectPage, error:
- if isinstance(error, wikipedia.IsRedirectPage):
- newtitle = error.args[0]
- else:
- newtitle = error.message
- wikipedia.output(u'Page %s redirect to \'%s\'' %
(page.aslink(), newtitle))
- bot = CheckRobot(iter([wikipedia.Page(page.site(), newtitle),]))
+ except wikipedia.IsRedirectPage:
+ newpage = page.getRedirectTarget()
+ wikipedia.output(u'Page %s redirect to \'%s\'' %
(page.aslink(), newpage.title()))
+ bot = CheckRobot(iter([newpage,]))
bot.run()
continue
@@ -1095,6 +1104,9 @@
elif arg.startswith('-skipquery'):
if len(arg) >= 11:
config.copyright_skip_query = int(arg[11:])
+ elif arg.startswith('-nwords'):
+ if len(arg) >= 8:
+ number_of_words = int(arg[8:])
elif arg.startswith('-text'):
if len(arg) >= 6:
text = arg[6:]
@@ -1154,8 +1166,19 @@
bot = CheckRobot(preloadingGen)
bot.run()
+excl_list = exclusion_list()
+
+if number_of_words > 22:
+ if not config.copyright_google and not config.copyright_yahoo and
config.copyright_msn:
+ warn("'number_of_words' variable set to 22 as Live Search requires a
lower value of %s" % number_of_words, prefix = 'Warning')
+ number_of_words = 22
+ elif config.copyright_msn:
+ warn("Live Search requires a lower value for 'number_of_words'
variable "
+ "(current value is %d, a good value may be 22)." %
(number_of_words), prefix = 'Warning')
+
if __name__ == "__main__":
try:
main()
finally:
wikipedia.stopme()
+ print_stats()