Revision: 4433
Author: cosoleto
Date: 2007-10-09 14:06:41 +0000 (Tue, 09 Oct 2007)
Log Message:
-----------
* Improved the wikicode removal function. It now makes partial use of the pywikiparser module.
* Warning and error messages are now coloured.
* Minor fixes and code cleanup.
Modified Paths:
--------------
trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-10-09 12:59:29 UTC (rev 4432)
+++ trunk/pywikipedia/copyright.py 2007-10-09 14:06:41 UTC (rev 4433)
@@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-This robot checks copyright text in Google, Yahoo and Live Search.
+This robot checks copyright text in Google, Yahoo! and Live Search.
Google search requires to install the pyGoogle module from
http://pygoogle.sf.net and get a Google API license key from
@@ -79,12 +79,21 @@
__version__='$Id$'
-#
+# Search keywords added to all the queries.
no_result_with_those_words = '-Wikipedia'
+# Split the text into strings of a specified number of words.
+number_of_words = 31
+
+# Perform a search engine query only if the string length is greater than this value.
+min_query_string_len = 120
+
# Try to skip quoted text.
exclude_quote = True
+# Enable the DOTALL regular expression flag in the remove_wikicode() function.
+remove_wikicode_dotall = True
+
# If ratio between query length and number of commas is greater or equal
# to 'comma_ratio' then the script identify a comma separated list and
# don't send data to search engine.
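For illustration, a standalone sketch of how the number_of_words and min_query_string_len knobs interact (the helper name split_in_words is invented here; the module's own splitter is mysplit): text is cut into fixed-size word windows, and only windows longer than the threshold are sent to the engines.

    # Standalone sketch; names are illustrative, not the module's own helpers.
    def split_in_words(text, number_of_words=31):
        words = text.split()
        for i in range(0, len(words), number_of_words):
            yield ' '.join(words[i:i + number_of_words])

    min_query_string_len = 120
    for chunk in split_in_words('lorem ipsum dolor ' * 70):
        if len(chunk) > min_query_string_len:
            pass  # a chunk this long would be queried against the search engines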
@@ -93,6 +102,15 @@
# No checks if the page is a disambiguation page.
skip_disambig = True
+# Parameter used in Live Search query.
+# (http://msdn2.microsoft.com/en-us/library/bb266177.aspx)
+region_code = 'en-US'
+
+enable_color = True
+
+warn_color = 'lightyellow'
+error_color = 'lightred'
+
appdir = "copyright"
output_file = wikipedia.datafilepath(appdir, "output.txt")
@@ -220,6 +238,27 @@
'it':['Bibliografia', 'Riferimenti bibliografici',
'Collegamenti esterni', 'Pubblicazioni principali'],
}
+num_google_queries = 0 ; num_yahoo_queries = 0 ; num_msn_queries = 0
+
+if enable_color:
+ warn_color = '\03{%s}' % warn_color
+ error_color = '\03{%s}' % error_color
+ default_color = '\03{default}'
+else:
+ warn_color = '' ; error_color = '' ; default_color = ''
+
+def _output(text, prefix = None, color = ''):
+ if prefix:
+ wikipedia.output('%s%s: %s%s' % (color, prefix, default_color, text))
+ else:
+ wikipedia.output('%s%s' % (color, text))
+
+def warn(text, prefix = None):
+ _output(text, prefix = prefix, color = warn_color)
+
+def error(text, prefix = None):
+ _output(text, prefix = prefix, color = error_color)
+
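As a rough standalone illustration of the colour-prefix pattern above: wikipedia.output() interprets the \03{...} markup itself, so plain ANSI escapes are substituted here to make the sketch runnable outside the framework, and the colour codes are only approximations.

    WARN_COLOR = '\033[93m'   # roughly 'lightyellow'
    ERROR_COLOR = '\033[91m'  # roughly 'lightred'
    DEFAULT_COLOR = '\033[0m'

    def demo_output(text, prefix=None, color=''):
        # Colour only the prefix, then reset before the message text,
        # mirroring _output() above.
        if prefix:
            print('%s%s: %s%s' % (color, prefix, DEFAULT_COLOR, text))
        else:
            print('%s%s%s' % (color, text, DEFAULT_COLOR))

    demo_output('Getting page failed', prefix='ERROR', color=ERROR_COLOR)
    demo_output('Max query limit for page reached', prefix='Warning', color=WARN_COLOR)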
def skip_section(text):
l = list()
for s in sections_to_skip.values():
@@ -279,16 +318,17 @@
except wikipedia.IsRedirectPage, arg:
data = wikipedia.Page(page.site(), arg).get()
except:
- print 'Getting page failed'
+ error('Getting page failed')
return
-def check_list(text, cl, verbose = False):
- for entry in cl:
+def check_list(url, clist, verbose = False):
+ for entry in clist:
if entry:
- if text.find(entry) != -1:
- #print entry
- if verbose:
- print 'SKIP URL ' + text
+ if url.find(entry) != -1:
+ if verbose > 1:
+ warn('URL Excluded: %s\nReason: %s' % (url, entry))
+ elif verbose:
+ warn('URL Excluded: %s' % url)
return True
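A usage illustration for the reworked check_list() (URLs and list entries invented): any entry occurring in the URL as a substring excludes it; on no match the function falls through and returns None, which is falsy.

    excl = ['wikipedia.org', 'forumspam.example']
    print(check_list('http://en.wikipedia.org/wiki/Foo', excl))  # True (excluded)
    print(check_list('http://example.org/page', excl))           # None, i.e. not excluded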
def exclusion_list():
@@ -384,7 +424,7 @@
return True
#
-# Set regex used in cleanwikicode() to remove [[Image:]] tags
+# Set regex used in remove_wikicode() to remove [[Image:]] tags
# and regex used in check_in_source() to reject pages with
# 'Wikipedia'.
@@ -402,23 +442,21 @@
reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')')
def cleanwikicode(text):
+    return remove_wikicode(text)
+
+def remove_wikicode(text, re_dotall = False, debug = False):
if not text:
return ""
- #write_log(text+'\n', "copyright/debug_cleanwikicode1.txt")
+ if debug:
+ write_log(text+'\n', "copyright/wikicode.txt")
    text = re.sub('(?i)</?(p|u|i|b|em|div|span|font|small|big|code|tt).*?>', '', text)
text = re.sub('(?i)<(/\s*)?br(\s*/)?>', '', text)
text = re.sub('<!--.*?-->', '', text)
-    text = re.sub('&lt;', '<', text)
-    text = re.sub('&gt;', '>', text)
-    if exclude_quote:
-        text = re.sub("(?i){{quote\|.*?}}", "", text)
-        text = re.sub("^[:*]?\s*''.*?''\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$", "", text)
-        text = re.sub('^[:*]?\s*["][^"]+["]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
-        text = re.sub('^[:*]?\s*[«][^»]+[»]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
-        text = re.sub('^[:*]?\s*[“][^”]+[”]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
+    text = text.replace('&lt;', '<')
+    text = text.replace('&gt;', '>')
# remove URL
    text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text)
@@ -432,26 +470,72 @@
# remove unicode and polytonic template
text = re.sub("(?i){{(unicode|polytonic)\|(.*?)}}", "\\1", text)
+ if re_dotall:
+ flags = "(?xsim)"
+ # exclude wikitable
+        text = re.sub('(?ms){\|.*?^\|}', '', text)
+ else:
+ flags = "(?xim)"
+
text = re.sub("""
- (?xim)
+ %s
(
- <ref.*?>.*?</ref> | # exclude <ref> notes
- ^[\ \t]*({\||[|!]).* | # exclude wikitable
- </*nowiki> | # remove <nowiki> tags
- {{.*?}} | # remove template
- <math>.*?</math> | # remove LaTeX staff
- [\[\]] | # remove [, ]
- ^[*:;]+ | # remove *, :, ; in begin of line
- <!-- |
- --> |
+        <ref[^>]*?\s*/\s*>    |  # exclude <ref name = '' / > tags
+ <ref.*?>.*?</ref> | # exclude <ref> notes
+ ^[\ \t]*({\||[|!]).*?$ | # exclude wikitable
+ </*nowiki> | # remove <nowiki> tags
+ {{.*?}} | # remove (not nested) template
+        <math>.*?</math>      |  # remove LaTeX stuff
+ [\[\]] | # remove [, ]
+ ^[*:;]+ | # remove *, :, ; in begin of line
+ <!-- |
+ --> |
)
- """, "", text)
+ """ % flags, "", text)
+ if exclude_quote:
+ # '' text ''
+ # '' text ''.
+ # '' text '' (text)
+ # « text »
+ # ...
+ #
+
+        italic_quoteC = re.compile("(?m)^[:*]?\s*(''.*?'')\.?\s*(\(.*?\))?\r?$")
+
+ index = 0
+ try:
+ import pywikiparser
+ except ImportError:
+ pywikiparser = False
+
+ while pywikiparser:
+ m = italic_quoteC.search(text, index)
+ if not m:
+ break
+
+ s = pywikiparser.Parser(m.group(1), debug = True)
+
+ try:
+ xmldata = s.parse().toxml()
+                if '<wikipage><p><i>' in xmldata and '</i></p></wikipage>' in xmldata:
+ if xmldata.count('<i>') == 1:
+ text = text[:m.start()] + text[m.end():]
+ except:
+ pass
+
+ index = m.start() + 1
+
+        text = re.sub('(?m)^[:*]*\s*["][^"]+["]\.?\s*(\(.*?\))?\r?$', "", text)
+        text = re.sub('(?m)^[:*]*\s*[«][^»]+[»]\.?\s*(\(.*?\))?\r?$', "", text)
+        text = re.sub('(?m)^[:*]*\s*[“][^”]+[”]\.?\s*(\(.*?\))?\r?$', "", text)
+
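To make the pywikiparser step above concrete, here is what italic_quoteC captures on an invented input line; only this captured candidate is handed to the parser, and it is dropped from the text only when the parsed XML is a single all-italic paragraph.

    import re

    italic_quoteC = re.compile("(?m)^[:*]?\s*(''.*?'')\.?\s*(\(.*?\))?\r?$")
    m = italic_quoteC.search(": ''A quoted sentence.'' (source)")
    print(m.group(1))  # "''A quoted sentence.''" -- the candidate handed to pywikiparser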
# remove useless spaces
- text = re.sub("(?m)(^[ \t]+|[ \t]+$)", "", text)
+ text = re.sub("(?m)(^[ \t]+|[ \t]+\r?$)", "", text)
- #if text:
- # write_log(text+'\n', "copyright/debug_cleanwikicode2.txt")
+ if debug:
+ write_log(text+'\n', "copyright/wikicode_removed.txt")
+
return text
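To illustrate what the new re_dotall switch changes (sample text invented): with (?s), patterns such as {{.*?}} also match across newlines, so multi-line templates get removed as well.

    import re

    sample = "Intro {{multi\nline template}} tail"
    print(re.sub("(?s){{.*?}}", "", sample))  # -> 'Intro  tail'
    print(re.sub("{{.*?}}", "", sample))      # unchanged: '.' stops at newlines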
excl_list = exclusion_list()
@@ -463,7 +547,7 @@
print "** " + entry
def exclusion_list_dump():
- f = open(wikipedia.datafilepath(appdir, 'exclusion_list.dump', 'w'))
+ f = open(wikipedia.datafilepath(appdir, 'exclusion_list.dump'), 'w')
f.write('\n'.join(excl_list))
f.close()
print "Exclusion list dump saved."
@@ -494,7 +578,7 @@
break
return l
-def query(lines = [], max_query_len = 1300):
+def query(lines = [], max_query_len = 1300, wikicode = True):
# Google max_query_len = 1480?
# - '-Wikipedia ""' = 1467
@@ -505,18 +589,19 @@
previous_group_url = 'none'
for line in lines:
- line = cleanwikicode(line)
- for search_words in mysplit(line, 31, " "):
- if len(search_words) > 120:
+ if wikicode:
+ line = remove_wikicode(line)
+ for search_words in mysplit(line, number_of_words, " "):
+ if len(search_words) > min_query_string_len:
if config.copyright_economize_query:
if economize_query(search_words):
- wikipedia.output('SKIP TEXT: ' + search_words)
+ warn(search_words, prefix = 'Text excluded')
consecutive = False
continue
n_query += 1
#wikipedia.output(search_words)
            if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page:
- wikipedia.output(u"Max query limit for page reached")
+ warn(u"Max query limit for page reached")
return output
if config.copyright_skip_query > n_query:
continue
@@ -526,17 +611,18 @@
if " " in search_words:
                    search_words = search_words[:search_words.rindex(" ")]
results = get_results(search_words)
- group_url = ''
+ group_url = '' ; cmp_group_url = ''
for url, engine, comment in results:
if comment:
                    group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment))
else:
group_url += '\n*%s - %s' % (engine, url)
+ cmp_group_url += '\n*%s - %s' % (engine, url)
if results:
group_url_list = group_url.splitlines()
group_url_list.sort()
group_url = '\n'.join(group_url_list)
- if previous_group_url == group_url:
+ if previous_group_url == cmp_group_url:
if consecutive:
output += ' ' + search_words
else:
@@ -544,7 +630,7 @@
else:
output += group_url + '\n**' + search_words
- previous_group_url = group_url
+ previous_group_url = cmp_group_url
consecutive = True
else:
consecutive = False
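Why the new cmp_group_url is compared instead of group_url, sketched with invented result tuples: consecutive grouping must look at the engine/URL part only, since the comments (cache size and so on) can differ between otherwise identical hits.

    results = [('http://a.example/x', 'google', ['2 KB']),
               ('http://a.example/x', 'google', ['3 KB'])]
    keys = ['\n*%s - %s' % (engine, url) for url, engine, comment in results]
    print(keys[0] == keys[1])  # True: same group despite differing comments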
@@ -569,8 +655,9 @@
def __init__(self, url):
"""
"""
+ global source_seen
- if check_list(url, excl_list):
+ if url in source_seen or check_list(url, excl_list):
raise URL_exclusion
self._url = url
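A minimal sketch of the seen-URL cache introduced here (names simplified, status handling invented): once a URL fails with an HTTP error >= 400 it is remembered in a module-level set and rejected immediately on the next encounter.

    source_seen = set()

    class NoWebPage(Exception):
        pass

    def open_url(url, status):
        if url in source_seen:
            raise NoWebPage(url)      # known-bad URL, skipped without refetching
        if status >= 400:
            source_seen.add(url)      # remember the failure for next time
            raise NoWebPage(url)
        return '<html>...</html>'     # placeholder for a fetched body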
@@ -580,16 +667,16 @@
#except httplib.BadStatusLine, line:
# print 'URL: %s\nBad status line: %s' % (url, line)
except urllib2.HTTPError, err:
- print "HTTP error: %d / %s (%s)" % (err.code, err.msg, url)
- #if err.code == 404:
+ error("HTTP error: %d / %s (%s)" % (err.code, err.msg, url))
if err.code >= 400:
+ source_seen.add(self._url)
raise NoWebPage
return None
except urllib2.URLError, arg:
- print "URL error: %s / %s" % (url, arg)
+ error("URL error: %s / %s" % (url, arg))
return None
except Exception, err:
- print "ERROR: %s" % (err)
+ error("ERROR: %s" % (err))
self._lastmodified = self._urldata.info().getdate('Last-Modified')
self._length = self._urldata.info().getheader('Content-Length')
@@ -678,7 +765,7 @@
source_seen.add(self._url)
return False
-def add_in_urllist(url, add_item, engine):
+def add_in_urllist(url, add_item, engine, cache_url = None):
if (engine == 'google' and config.copyright_check_in_source_google) or \
(engine == 'yahoo' and config.copyright_check_in_source_yahoo) or \
@@ -729,12 +816,16 @@
comment.append("%d %s" % (length, unit))
if cache:
- if engine == 'google':
-                comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s google cache]' % add_item[7:])
- elif engine == 'yahoo':
- cache = False
- elif engine == 'msn':
- cache = False
+ if cache_url:
+ if engine == 'google':
+                    comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % short_url(add_item))
+ elif engine == 'yahoo':
+ #cache = False
+                    comment.append('[%s Yahoo cache]' % re.sub('&appid=[^&]*', '', urllib.unquote(cache_url)))
+ elif engine == 'msn':
+                    comment.append('[%s Live cache]' % re.sub('&lang=[^&]*', '', cache_url))
+ else:
+                    comment.append('[http://web.archive.org/*/%s archive.org]' % short_url(add_item))
for i in range(len(url)):
if add_item in url[i]:
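For illustration, the effect of the Yahoo! cache-link cleanup above on an invented cache URL (the real code additionally passes the value through urllib.unquote first): the appid parameter is stripped before the link is published.

    import re

    yahoo_cache = 'http://cache.example/search/cache?p=foo&appid=SECRET&u=example.org'
    print(re.sub('&appid=[^&]*', '', yahoo_cache))
    # -> 'http://cache.example/search/cache?p=foo&u=example.org'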
@@ -754,13 +845,15 @@
exec('config.copyright_' + engine + ' = False')
# Sleeping
if config.copyright_exceeded_in_queries == 2:
-        print "Got a queries exceeded error. Sleeping for %d hours..." % (config.copyright_exceeded_in_queries_sleep_hours)
+        error("Got a queries exceeded error. Sleeping for %d hours..." % (config.copyright_exceeded_in_queries_sleep_hours))
time.sleep(config.copyright_exceeded_in_queries_sleep_hours * 60 * 60)
# Stop execution
if config.copyright_exceeded_in_queries == 3:
raise 'Got a queries exceeded error.'
def soap(engine, query, url, numresults = 10):
+ global num_google_queries, num_yahoo_queries, num_msn_queries
+
print " %s query..." % engine.capitalize()
search_request_retry = config.copyright_connection_tries
while search_request_retry:
@@ -770,7 +863,10 @@
google.LICENSE_KEY = config.google_key
                    data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
for entry in data.results:
- add_in_urllist(url, entry.URL, 'google')
+ add_in_urllist(url, entry.URL, 'google',
entry.cachedSize)
+
+ num_google_queries += 1
+
elif engine == 'yahoo':
import yahoo.search.web
                data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
@@ -778,7 +874,13 @@
no_result_with_those_words
), results = numresults)
for entry in data.parse_results():
- add_in_urllist(url, entry.Url, 'yahoo')
+ cacheurl = None
+ if entry.Cache:
+ cacheurl = entry.Cache.Url
+ add_in_urllist(url, entry.Url, 'yahoo', cacheurl)
+
+ num_yahoo_queries += 1
+
elif engine == 'msn':
#max_query_len = 150?
from SOAPpy import WSDL
@@ -786,10 +888,11 @@
try:
                    server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
except:
- print "Live Search Error"
+ error("Live Search Error")
raise
+
                params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
-                          'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+                          'CultureInfo': region_code, 'SafeSearch': 'Off', 'Requests': {
                               'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
results = ''
@@ -801,14 +904,23 @@
# list or instance?
if type(results) == type([]):
for entry in results:
- add_in_urllist(url, entry.Url, 'msn')
+ cacheurl = None
+ if hasattr(entry, 'CacheUrl'):
+ cacheurl = entry.CacheUrl
+ add_in_urllist(url, entry.Url, 'msn', cacheurl)
else:
- add_in_urllist(url, results.Url, 'msn')
+ cacheurl = None
+ if hasattr(results, 'CacheUrl'):
+ cacheurl = results.CacheUrl
+ add_in_urllist(url, results.Url, 'msn', cacheurl)
+
+ num_msn_queries += 1
+
search_request_retry = 0
except KeyboardInterrupt:
raise
except Exception, err:
- print "Got an error ->", err
+ error(err, "Got an error")
#
# SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object:
@@ -851,7 +963,7 @@
output = query(lines=original_text.splitlines())
if output:
write_log(
-                "=== [[" + title + "]] ===\n{{/box|%s|prev|%s|%s|00}}"
+                "=== [[" + title + "]] ===\n{{botbox|%s|prev|%s|%s|00}}"
                 % (title.replace(" ", "_").replace("\"", "%22"), id, "author")
                 + output,
@@ -876,40 +988,50 @@
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found' % page.title())
continue
- except wikipedia.IsRedirectPage:
- original_text = page.get(get_redirect=True)
+ except wikipedia.IsRedirectPage, error:
+            wikipedia.output(u'Page %s redirects to \'%s\'' % (page.aslink(), error.message))
+ bot = CheckRobot(iter([wikipedia.Page(page.site(), error.message),]))
+ bot.run()
+ continue
if skip_disambig:
if page.isDisambig():
-                wikipedia.output(u'Page %s is a disambiguation page' % page.title())
+                wikipedia.output(u'Page %s is a disambiguation page' % page.aslink())
continue
-# colors = [13] * len(page.title())
wikipedia.output(page.title())
if original_text:
text = skip_section(original_text)
- output = query(lines = text.splitlines())
+
+ if remove_wikicode_dotall:
+ text = remove_wikicode(text, re_dotall = True)
+
+        output = query(lines = text.splitlines(), wikicode = not remove_wikicode_dotall)
if output:
                write_log('=== [[' + page.title() + ']] ===' + output + '\n', filename = output_file)
+def short_url(url):
+ return url[url.index('://')+3:]
+
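A quick usage note for the new short_url() helper (example URLs invented): it strips the scheme so the remainder can be embedded in the cache and archive.org links built above.

    print(short_url('http://web.archive.org/page'))   # 'web.archive.org/page'
    print(short_url('https://example.org/a?b=1'))     # 'example.org/a?b=1'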
def put(page, text, comment):
while True:
try:
page.put(text, comment = comment)
break
except wikipedia.SpamfilterError, url:
- print "Spam filter"
-            text = re.sub(url[0], '<blacklist>' + url[0][url[0].index('://')+3:], text)
+ warn(url, prefix = "Spam filter")
+ text = re.sub(url[0], '<blacklist>' + short_url(url[0]), text)
except wikipedia.EditConflict:
- print "Edit conflict"
+ warn("Edit conflict")
raise wikipedia.EditConflict
def check_config(var, license_id, license_name):
if var:
if not license_id:
-            wikipedia.output(u"WARNING: You don't have set a " + license_name + ", search engine is disabled.")
+            warn(u"You haven't set a " + license_name + "; the search engine is disabled.",
+                 prefix = "WARNING")
return False
return var
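Finally, a hypothetical call showing how check_config() is meant to be used (the license_name string here is illustrative): each engine flag is re-validated against its credential and switched off when the key is missing.

    config.copyright_google = check_config(config.copyright_google,
                                           config.google_key,
                                           'Google API license key')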