Revision: 4364
Author: cosoleto
Date: 2007-09-26 08:56:57 +0000 (Wed, 26 Sep 2007)
Log Message:
-----------
Code cleanup; bug fix (the comment was deleted if two or more search engines found the same URL).
Modified Paths:
--------------
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-09-25 15:05:53 UTC (rev 4363)
+++ trunk/pywikipedia/copyright.py 2007-09-26 08:56:57 UTC (rev 4364)
@@ -370,13 +370,14 @@
if text.count(', ') > 4:
l = len(text)
c = text.count(', ')
- r = 100 * c / l
+ r = 100 * float(c) / l
+ #if r >= 4 and r < 7:
+ # write_log("%d/%d/%d: %s\n" % (l,c,r,text), "copyright/skip_%s.txt" % ("%0.1f" % r))
+
if r >= comma_ratio:
return True
- # write_log("%d/%d/%d: %s\n" % (l,c,r,text), "copyright/skip" + str(r) + ".txt")
-
# Numbers
if re.search('[^0-9\'*/,. +?:;-]{5}', text):
return False
@@ -738,6 +739,8 @@
for i in range(len(url)):
if add_item in url[i]:
if engine not in url[i][1]:
+ if url[i][2]:
+ comment = url[i][2]
url[i] = (add_item, url[i][1] + ', ' + engine, comment)
return
url.append((add_item, engine, comment))
@@ -757,93 +760,85 @@
if config.copyright_exceeded_in_queries == 3:
raise 'Got a queries exceeded error.'
-def get_results(query, numresults = 10):
- url = list()
- query = re.sub("[()\"<>]", "", query)
- #wikipedia.output(query)
- if config.copyright_google:
- import google
- google.LICENSE_KEY = config.google_key
- print " Google query..."
+def soap(engine, query, url, numresults = 10):
+ print " %s query..." % engine.capitalize()
search_request_retry = config.copyright_connection_tries
while search_request_retry:
try:
- data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
+ if engine == 'google':
+ import google
+ google.LICENSE_KEY = config.google_key
+ data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
+ for entry in data.results:
+ add_in_urllist(url, entry.URL, 'google')
+ elif engine == 'yahoo':
+ import yahoo.search.web
+ data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
+ query.encode('utf_8'),
+ no_result_with_those_words
+ ), results = numresults)
+ for entry in data.parse_results():
+ add_in_urllist(url, entry.Url, 'yahoo')
+ elif engine == 'msn':
+ #max_query_len = 150?
+ from SOAPpy import WSDL
+
+ try:
+ server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
+ except:
+ print "Live Search Error"
+ raise
+ params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
+ 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+ 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
+
+ results = ''
+
+ server_results = server.Search(Request = params)
+ if server_results.Responses[0].Results:
+ results = server_results.Responses[0].Results[0]
+ if results:
+ # list or instance?
+ if type(results) == type([]):
+ for entry in results:
+ add_in_urllist(url, entry.Url, 'msn')
+ else:
+ add_in_urllist(url, results.Url, 'msn')
search_request_retry = 0
- for entry in data.results:
- add_in_urllist(url, entry.URL, 'google')
except KeyboardInterrupt:
raise
except Exception, err:
print "Got an error ->", err
+
#
# SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object:
# Daily limit of 1000 queries exceeded for key ***>
#
if 'Daily limit' in str(err):
exceeded_in_queries('google')
- if search_request_retry:
- search_request_retry -= 1
- if config.copyright_yahoo:
- import yahoo.search.web
- print " Yahoo query..."
- data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
- query.encode('utf_8'),
- no_result_with_those_words
- ), results = numresults)
- search_request_retry = config.copyright_connection_tries
- while search_request_retry:
- try:
- for entry in data.parse_results():
- add_in_urllist(url, entry.Url, 'yahoo')
- search_request_retry = 0
- except Exception, err:
- print "Got an error ->", err
if 'limit exceeded' in str(err):
exceeded_in_queries('yahoo')
- if search_request_retry:
- search_request_retry -= 1
- if config.copyright_msn:
- #max_query_len = 150?
- from SOAPpy import WSDL
- print " Live query..."
- try:
- server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
- except:
- print "Live Search Error"
- raise
- params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
- 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
- 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
-
- search_request_retry = config.copyright_connection_tries
- results = ''
- while search_request_retry:
- try:
- server_results = server.Search(Request = params)
- search_request_retry = 0
- if server_results.Responses[0].Results:
- results = server_results.Responses[0].Results[0]
- except Exception, err:
- print "Got an error ->", err
if search_request_retry:
search_request_retry -= 1
- if results:
- # list or instance?
- if type(results) == type([]):
- for entry in results:
- add_in_urllist(url, entry.Url, 'msn')
- else:
- add_in_urllist(url, results.Url, 'msn')
+def get_results(query, numresults = 10):
+ result_list = list()
+ query = re.sub("[()\"<>]", "", query)
+ # wikipedia.output(query)
+ if config.copyright_google:
+ soap('google', query, result_list)
+ if config.copyright_yahoo:
+ soap('yahoo', query, result_list, numresults = numresults)
+ if config.copyright_msn:
+ soap('msn', query, result_list)
offset = 0
- for i in range(len(url)):
- if check_list(url[i + offset][0], excl_list, verbose = True):
- url.pop(i + offset)
+ for i in range(len(result_list)):
+ if check_list(result_list[i + offset][0], excl_list, verbose = True):
+ result_list.pop(i + offset)
offset += -1
- return url
+ return result_list
def get_by_id(title, id):
return wikipedia.getSite().getUrl("/w/index.php?title=%s&oldid=%s&action=raw" % (title, id))