Revision: 4433 Author: cosoleto Date: 2007-10-09 14:06:41 +0000 (Tue, 09 Oct 2007)
Log Message:
-----------
* Improved the remove-wikicode function; it now makes partial use of the pywikiparser module.
* Warning and error messages are now coloured.
* Minor fixes and code cleanup.
Modified Paths:
--------------
    trunk/pywikipedia/copyright.py
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-10-09 12:59:29 UTC (rev 4432) +++ trunk/pywikipedia/copyright.py 2007-10-09 14:06:41 UTC (rev 4433) @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- """ -This robot checks copyright text in Google, Yahoo and Live Search. +This robot checks copyright text in Google, Yahoo! and Live Search.
Google search requires installing the pyGoogle module from http://pygoogle.sf.net and getting a Google API license key from @@ -79,12 +79,21 @@
__version__='$Id$'
-# +# Search keywords added to all the queries. no_result_with_those_words = '-Wikipedia'
+# Split the text into strings of a specified number of words. +number_of_words = 31 + +# Perform a search engine query only if the string length is greater than the given value. +min_query_string_len = 120 + # Try to skip quoted text. exclude_quote = True
+# Enable the DOTALL regular expression flag in the remove_wikicode() function. +remove_wikicode_dotall = True + # If the ratio between query length and number of commas is greater than or equal # to 'comma_ratio', then the script identifies a comma-separated list and # doesn't send data to the search engine. @@ -93,6 +102,15 @@ # No checks if the page is a disambiguation page. skip_disambig = True
+# Parameter used in Live Search query. +# (http://msdn2.microsoft.com/en-us/library/bb266177.aspx) +region_code = 'en-US' + +enable_color = True + +warn_color = 'lightyellow' +error_color = 'lightred' + appdir = "copyright" output_file = wikipedia.datafilepath(appdir, "output.txt")
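To illustrate how the two query-size settings above are meant to work together, a small standalone sketch follows; split_words() is a simplified stand-in for the script's own mysplit() helper, and the sample text is invented:

    # Illustrative sketch only: split article text into windows of
    # `number_of_words` words and keep only windows long enough to be worth
    # a search engine query.
    number_of_words = 31
    min_query_string_len = 120

    def split_words(text, count):
        # Simplified stand-in for copyright.py's mysplit().
        words = text.split()
        for i in range(0, len(words), count):
            yield ' '.join(words[i:i + count])

    sample_line = 'lorem ipsum ' * 40      # stand-in for one line of article text
    for chunk in split_words(sample_line, number_of_words):
        if len(chunk) > min_query_string_len:
            print 'would be sent to the search engines:', chunk[:40], '...'
        else:
            print 'too short, skipped:', repr(chunk[:40])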
@@ -220,6 +238,27 @@ 'it':['Bibliografia', 'Riferimenti bibliografici', 'Collegamenti esterni', 'Pubblicazioni principali'], }
+num_google_queries = 0 ; num_yahoo_queries = 0 ; num_msn_queries = 0 + +if enable_color: + warn_color = '\03{%s}' % warn_color + error_color = '\03{%s}' % error_color + default_color = '\03{default}' +else: + warn_color = '' ; error_color = '' ; default_color = '' + +def _output(text, prefix = None, color = ''): + if prefix: + wikipedia.output('%s%s: %s%s' % (color, prefix, default_color, text)) + else: + wikipedia.output('%s%s' % (color, text)) + +def warn(text, prefix = None): + _output(text, prefix = prefix, color = warn_color) + +def error(text, prefix = None): + _output(text, prefix = prefix, color = error_color) + def skip_section(text): l = list() for s in sections_to_skip.values(): @@ -279,16 +318,17 @@ except wikipedia.IsRedirectPage, arg: data = wikipedia.Page(page.site(), arg).get() except: - print 'Getting page failed' + error('Getting page failed') return
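For reference, a minimal standalone approximation of the new colour handling; these are simplified stand-ins using plain print, whereas the real helpers pass the '\03{...}' markers to wikipedia.output(), which renders them as terminal colours:

    # Simplified stand-ins for the warn()/error() helpers added above.
    warn_color = '\03{lightyellow}'
    error_color = '\03{lightred}'
    default_color = '\03{default}'

    def _output(text, prefix=None, color=''):
        if prefix:
            print '%s%s: %s%s' % (color, prefix, default_color, text)
        else:
            print '%s%s' % (color, text)

    def warn(text, prefix=None):
        _output(text, prefix=prefix, color=warn_color)

    def error(text, prefix=None):
        _output(text, prefix=prefix, color=error_color)

    # Example calls, matching messages used in the diff.
    warn(u"Max query limit for page reached")
    error('Getting page failed')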
-def check_list(text, cl, verbose = False): - for entry in cl: +def check_list(url, clist, verbose = False): + for entry in clist: if entry: - if text.find(entry) != -1: - #print entry - if verbose: - print 'SKIP URL ' + text + if url.find(entry) != -1: + if verbose > 1: + warn('URL Excluded: %s\nReason: %s' % (url, entry)) + elif verbose: + warn('URL Excluded: %s' % url) return True
def exclusion_list(): @@ -384,7 +424,7 @@ return True
# -# Set regex used in cleanwikicode() to remove [[Image:]] tags +# Set regex used in remove_wikicode() to remove [[Image:]] tags # and regex used in check_in_source() to reject pages with # 'Wikipedia'.
@@ -402,23 +442,21 @@ reSectionNamesC = re.compile('(' + '|'.join(editsection_names.values()) + ')')
def cleanwikicode(text): + remove_wikicode(text) + +def remove_wikicode(text, re_dotall = False, debug = False): if not text: return ""
- #write_log(text+'\n', "copyright/debug_cleanwikicode1.txt") + if debug: + write_log(text+'\n', "copyright/wikicode.txt")
text = re.sub('(?i)</?(p|u|i|b|em|div|span|font|small|big|code|tt).*?>', '', text) text = re.sub('(?i)<(/\s*)?br(\s*/)?>', '', text) text = re.sub('<!--.*?-->', '', text) - text = re.sub('&lt;', '<', text) - text = re.sub('&gt;', '>', text)
- if exclude_quote: - text = re.sub("(?i){{quote\|.*?}}", "", text) - text = re.sub("^[:*]?\s*''.*?''.?\s*((\(|<ref>).*?(\)|</ref>))?.?$", "", text) - text = re.sub('^[:*]?\s*["][^"]+["].?\s*((\(|<ref>).*?(\)|</ref>))?.?$', "", text) - text = re.sub('^[:*]?\s*[«][^»]+[»].?\s*((\(|<ref>).*?(\)|</ref>))?.?$', "", text) - text = re.sub('^[:*]?\s*[“][^”]+[”].?\s*((\(|<ref>).*?(\)|</ref>))?.?$', "", text) + text = text.replace('&lt;', '<') + text = text.replace('&gt;', '>')
# remove URL text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\?_!~*\'|()"+-]+', ' ', text) @@ -432,26 +470,72 @@ # remove unicode and polytonic template text = re.sub("(?i){{(unicode|polytonic)|(.*?)}}", "\1", text)
+ if re_dotall: + flags = "(?xsim)" + # exclude wikitable + text = re.sub('(?s){\|.*?^\|}', '', text) + else: + flags = "(?xim)" + text = re.sub(""" - (?xim) + %s ( - <ref.*?>.*?</ref> | # exclude <ref> notes - ^[\ \t]*({\||[\|!]).* | # exclude wikitable - </*nowiki> | # remove <nowiki> tags - {{.*?}} | # remove template - <math>.*?</math> | # remove LaTeX stuff - [\[\]] | # remove [, ] - ^[*:;]+ | # remove *, :, ; at the beginning of a line - <!-- | - --> | + <ref[^>]*?\s*/\s*> | # exclude <ref name = '' / > tags + <ref.*?>.*?</ref> | # exclude <ref> notes + ^[\ \t]*({\||[\|!]).*?$ | # exclude wikitable + </*nowiki> | # remove <nowiki> tags + {{.*?}} | # remove (not nested) template + <math>.*?</math> | # remove LaTeX stuff + [\[\]] | # remove [, ] + ^[*:;]+ | # remove *, :, ; at the beginning of a line + <!-- | + --> | ) - """, "", text) + """ % flags, "", text)
+ if exclude_quote: + # '' text '' + # '' text ''. + # '' text '' (text) + # « text » + # ... + # + + italic_quoteC = re.compile("(?m)^[:*]?\s*(''.*?'').?\s*(\(.*?\))?\r?$") + + index = 0 + try: + import pywikiparser + except ImportError: + pywikiparser = False + + while pywikiparser: + m = italic_quoteC.search(text, index) + if not m: + break + + s = pywikiparser.Parser(m.group(1), debug = True) + + try: + xmldata = s.parse().toxml() + if '<wikipage><p><i>' in xmldata and '</i></p></wikipage>' in xmldata: + if xmldata.count('<i>') == 1: + text = text[:m.start()] + text[m.end():] + except: + pass + + index = m.start() + 1 + + text = re.sub('(?m)^[:*]*\s*["][^"]+["].?\s*(\(.*?\))?\r?$', "", text) + text = re.sub('(?m)^[:*]*\s*[«][^»]+[»].?\s*(\(.*?\))?\r?$', "", text) + text = re.sub('(?m)^[:*]*\s*[“][^”]+[”].?\s*(\(.*?\))?\r?$', "", text) + # remove useless spaces - text = re.sub("(?m)(^[ \t]+|[ \t]+$)", "", text) + text = re.sub("(?m)(^[ \t]+|[ \t]+\r?$)", "", text)
- #if text: - # write_log(text+'\n', "copyright/debug_cleanwikicode2.txt") + if debug: + write_log(text+'\n', "copyright/wikicode_removed.txt") + return text
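As a rough illustration of what the reworked quote exclusion targets, here is a standalone sketch using only the italic-quote pattern; in the code above, pywikiparser is additionally consulted (when importable) to confirm that the matched line really is a single italic block before it is dropped. The sample text is invented:

    # -*- coding: utf-8 -*-
    # Standalone sketch of the italic-quote exclusion: lines consisting of an
    # ''italic'' quotation, optionally followed by a parenthesised attribution,
    # are dropped so they are not sent to the search engines.
    import re

    italic_quote = re.compile(r"(?m)^[:*]?\s*(''.*?'').?\s*(\(.*?\))?\r?$")

    sample = """Ordinary article prose that should still be queried.
    :''A sentence quoted verbatim from some book'' (Author, 1999)
    More ordinary prose."""

    for line in sample.splitlines():
        if italic_quote.match(line.strip()):
            print "dropped:", line
        else:
            print "kept:   ", line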
excl_list = exclusion_list() @@ -463,7 +547,7 @@ print "** " + entry
def exclusion_list_dump(): - f = open(wikipedia.datafilepath(appdir, 'exclusion_list.dump', 'w')) + f = open(wikipedia.datafilepath(appdir, 'exclusion_list.dump'), 'w') f.write('\n'.join(excl_list)) f.close() print "Exclusion list dump saved." @@ -494,7 +578,7 @@ break return l
-def query(lines = [], max_query_len = 1300): +def query(lines = [], max_query_len = 1300, wikicode = True): # Google max_query_len = 1480? # - '-Wikipedia ""' = 1467
@@ -505,18 +589,19 @@ previous_group_url = 'none'
for line in lines: - line = cleanwikicode(line) - for search_words in mysplit(line, 31, " "): - if len(search_words) > 120: + if wikicode: + line = remove_wikicode(line) + for search_words in mysplit(line, number_of_words, " "): + if len(search_words) > min_query_string_len: if config.copyright_economize_query: if economize_query(search_words): - wikipedia.output('SKIP TEXT: ' + search_words) + warn(search_words, prefix = 'Text excluded') consecutive = False continue n_query += 1 #wikipedia.output(search_words) if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page: - wikipedia.output(u"Max query limit for page reached") + warn(u"Max query limit for page reached") return output if config.copyright_skip_query > n_query: continue @@ -526,17 +611,18 @@ if " " in search_words: search_words = search_words[:search_words.rindex(" ")] results = get_results(search_words) - group_url = '' + group_url = '' ; cmp_group_url = '' for url, engine, comment in results: if comment: group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment)) else: group_url += '\n*%s - %s' % (engine, url) + cmp_group_url += '\n*%s - %s' % (engine, url) if results: group_url_list = group_url.splitlines() group_url_list.sort() group_url = '\n'.join(group_url_list) - if previous_group_url == group_url: + if previous_group_url == cmp_group_url: if consecutive: output += ' ' + search_words else: @@ -544,7 +630,7 @@ else: output += group_url + '\n**' + search_words
- previous_group_url = group_url + previous_group_url = cmp_group_url consecutive = True else: consecutive = False @@ -569,8 +655,9 @@ def __init__(self, url): """ """ + global source_seen
- if check_list(url, excl_list): + if url in source_seen or check_list(url, excl_list): raise URL_exclusion
self._url = url @@ -580,16 +667,16 @@ #except httplib.BadStatusLine, line: # print 'URL: %s\nBad status line: %s' % (url, line) except urllib2.HTTPError, err: - print "HTTP error: %d / %s (%s)" % (err.code, err.msg, url) - #if err.code == 404: + error("HTTP error: %d / %s (%s)" % (err.code, err.msg, url)) if err.code >= 400: + source_seen.add(self._url) raise NoWebPage return None except urllib2.URLError, arg: - print "URL error: %s / %s" % (url, arg) + error("URL error: %s / %s" % (url, arg)) return None except Exception, err: - print "ERROR: %s" % (err) + error("ERROR: %s" % (err))
self._lastmodified = self._urldata.info().getdate('Last-Modified') self._length = self._urldata.info().getheader('Content-Length') @@ -678,7 +765,7 @@ source_seen.add(self._url) return False
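The cmp_group_url change in query() above boils down to this: consecutive text chunks are grouped under one URL listing when they hit the same engine/URL pairs, so the comparison has to ignore the per-result comments (cache links, sizes) that can differ between queries. A condensed sketch of that idea, with invented sample data:

    # Condensed sketch of the grouping logic in query(): results for
    # consecutive chunks are compared by engine/URL only (cmp_group_url),
    # not by the fully decorated line (group_url), so differing comments
    # no longer break the grouping.
    def group_key(results):
        # results: list of (url, engine, comment) tuples, as in copyright.py
        return '\n'.join(sorted('*%s - %s' % (engine, url)
                                for url, engine, comment in results))

    previous_key = None
    for chunk, results in [
            ('first excerpt',  [('http://example.org/a', 'google', ['2 KB'])]),
            ('second excerpt', [('http://example.org/a', 'google', ['cache'])]),
    ]:
        key = group_key(results)
        if key == previous_key:
            print 'appended to previous group:', chunk
        else:
            print 'new group:', chunk
        previous_key = key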
-def add_in_urllist(url, add_item, engine): +def add_in_urllist(url, add_item, engine, cache_url = None):
if (engine == 'google' and config.copyright_check_in_source_google) or \ (engine == 'yahoo' and config.copyright_check_in_source_yahoo) or \ @@ -729,12 +816,16 @@ comment.append("%d %s" % (length, unit))
if cache: - if engine == 'google': - comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s google cache]' % add_item[7:]) - elif engine == 'yahoo': - cache = False - elif engine == 'msn': - cache = False + if cache_url: + if engine == 'google': + comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % short_url(add_item)) + elif engine == 'yahoo': + #cache = False + comment.append('[%s Yahoo cache]' % re.sub('&appid=[^&]*','', urllib.unquote(cache_url))) + elif engine == 'msn': + comment.append('[%s Live cache]' % re.sub('&lang=[^&]*','', cache_url)) + else: + comment.append('[http://web.archive.org/*/%s archive.org]' % short_url(add_item))
for i in range(len(url)): if add_item in url[i]: @@ -754,13 +845,15 @@ exec('config.copyright_' + engine + ' = False') # Sleeping if config.copyright_exceeded_in_queries == 2: - print "Got a queries exceeded error. Sleeping for %d hours..." % (config.copyright_exceeded_in_queries_sleep_hours) + error("Got a queries exceeded error. Sleeping for %d hours..." % (config.copyright_exceeded_in_queries_sleep_hours)) time.sleep(config.copyright_exceeded_in_queries_sleep_hours * 60 * 60) # Stop execution if config.copyright_exceeded_in_queries == 3: raise 'Got a queries exceeded error.'
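For reference, these are the shapes of the cache links that the reworked add_in_urllist() above appends as comments; the URLs below are invented placeholders, not real cache addresses:

    # Illustrative only: the wiki-formatted cache links built above,
    # with made-up add_item and cache_url values.
    import re, urllib

    add_item = 'http://www.example.org/some/page.html'
    short = add_item[add_item.index('://') + 3:]        # what short_url() returns

    google = '[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % short
    yahoo_cache_url = 'http://cache.example.net/w?u=x&appid=SECRET'
    yahoo = '[%s Yahoo cache]' % re.sub('&appid=[^&]*', '', urllib.unquote(yahoo_cache_url))
    # Fallback used when the engine returned no cache URL:
    fallback = '[http://web.archive.org/*/%s archive.org]' % short

    print google
    print yahoo
    print fallback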
def soap(engine, query, url, numresults = 10): + global num_google_queries, num_yahoo_queries, num_msn_queries + print " %s query..." % engine.capitalize() search_request_retry = config.copyright_connection_tries while search_request_retry: @@ -770,7 +863,10 @@ google.LICENSE_KEY = config.google_key data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query)) for entry in data.results: - add_in_urllist(url, entry.URL, 'google') + add_in_urllist(url, entry.URL, 'google', entry.cachedSize) + + num_google_queries += 1 + elif engine == 'yahoo': import yahoo.search.web data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % ( @@ -778,7 +874,13 @@ no_result_with_those_words ), results = numresults) for entry in data.parse_results(): - add_in_urllist(url, entry.Url, 'yahoo') + cacheurl = None + if entry.Cache: + cacheurl = entry.Cache.Url + add_in_urllist(url, entry.Url, 'yahoo', cacheurl) + + num_yahoo_queries += 1 + elif engine == 'msn': #max_query_len = 150? from SOAPpy import WSDL @@ -786,10 +888,11 @@ try: server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl') except: - print "Live Search Error" + error("Live Search Error") raise + params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query), - 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': { + 'CultureInfo': region_code, 'SafeSearch': 'Off', 'Requests': { 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
results = '' @@ -801,14 +904,23 @@ # list or instance? if type(results) == type([]): for entry in results: - add_in_urllist(url, entry.Url, 'msn') + cacheurl = None + if hasattr(entry, 'CacheUrl'): + cacheurl = entry.CacheUrl + add_in_urllist(url, entry.Url, 'msn', cacheurl) else: - add_in_urllist(url, results.Url, 'msn') + cacheurl = None + if hasattr(results, 'CacheUrl'): + cacheurl = results.CacheUrl + add_in_urllist(url, results.Url, 'msn', cacheurl) + + num_msn_queries += 1 + search_request_retry = 0 except KeyboardInterrupt: raise except Exception, err: - print "Got an error ->", err + error(err, "Got an error")
# # SOAP.faultType: <Fault SOAP-ENV:Server: Exception from service object: @@ -851,7 +963,7 @@ output = query(lines=original_text.splitlines()) if output: write_log( - "=== [[" + title + "]] ===\n{{/box|%s|prev|%s|%s|00}}" + "=== [[" + title + "]] ===\n{{botbox|%s|prev|%s|%s|00}}" % (title.replace(" ", "_").replace("\"", "%22"), id, "author") + output, @@ -876,40 +988,50 @@ except wikipedia.NoPage: wikipedia.output(u'Page %s not found' % page.title()) continue - except wikipedia.IsRedirectPage: - original_text = page.get(get_redirect=True) + except wikipedia.IsRedirectPage, error: + wikipedia.output(u'Page %s redirect to \'%s\'' % (page.aslink(), error.message)) + bot = CheckRobot(iter([wikipedia.Page(page.site(), error.message),])) + bot.run() + continue
if skip_disambig: if page.isDisambig(): - wikipedia.output(u'Page %s is a disambiguation page' % page.title()) + wikipedia.output(u'Page %s is a disambiguation page' % page.aslink()) continue
-# colors = [13] * len(page.title()) wikipedia.output(page.title())
if original_text: text = skip_section(original_text) - output = query(lines = text.splitlines()) + + if remove_wikicode_dotall: + text = remove_wikicode(text, re_dotall = True) + + output = query(lines = text.splitlines(), wikicode = not remove_wikicode_dotall) if output: write_log('=== [[' + page.title() + ']] ===' + output + '\n', filename = output_file)
+def short_url(url): + return url[url.index('://')+3:] + def put(page, text, comment): while True: try: page.put(text, comment = comment) break except wikipedia.SpamfilterError, url: - print "Spam filter" - text = re.sub(url[0], '<blacklist>' + url[0][url[0].index('://')+3:], text) + warn(url, prefix = "Spam filter") + text = re.sub(url[0], '<blacklist>' + short_url(url[0]), text) except wikipedia.EditConflict: - print "Edit conflict" + warn("Edit conflict") raise wikipedia.EditConflict
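The new short_url() helper simply strips the scheme prefix from a URL, for example:

    # short_url() as added above: drop the 'scheme://' prefix from a URL.
    def short_url(url):
        return url[url.index('://')+3:]

    print short_url('http://www.example.org/wiki/Page')   # -> www.example.org/wiki/Page
    print short_url('https://example.org/a?b=c')          # -> example.org/a?b=c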
def check_config(var, license_id, license_name): if var: if not license_id: - wikipedia.output(u"WARNING: You don't have set a " + license_name + ", search engine is disabled.") + warn(u"You haven't set a " + license_name + ", search engine is disabled.", + prefix = "WARNING") return False return var