Revision: 6647
Author: cosoleto
Date: 2009-04-21 11:01:08 +0000 (Tue, 21 Apr 2009)
Log Message:
-----------
An incomplete prettification pass, rather than a rewrite from scratch. Minor functional changes.
Modified Paths:
--------------
    trunk/pywikipedia/copyright.py
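This revision folds the old module-level helpers (exclusion_file_list, load_pages, exclusion_list, check_list, query, add_in_urllist, soap, get_results, print_stats) into two new classes, URLExclusion and SearchEngine. As a rough sketch of how the refactored module is meant to be driven after this change (a hedged illustration only: it assumes pywikipedia and this revision of copyright.py are importable and configured, and the sample text is made up):

    # Sketch only: assumes pywikipedia and this revision of copyright.py are on the path
    # and configured (API keys, appdir, ...); the sample text is illustrative.
    from copyright import URLExclusion, SearchEngine

    excl = URLExclusion()        # __init__ -> scan() downloads and parses the exclusion pages
    engine = SearchEngine()      # builds its own URLExclusion as self.URLexcl
    report = engine.query(lines=[u'Some article text to check'], wikicode=False)
    if report:
        print report             # wikitext list of matching URLs, grouped per search engine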
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py	2009-04-21 06:57:10 UTC (rev 6646)
+++ trunk/pywikipedia/copyright.py	2009-04-21 11:01:08 UTC (rev 6647)
@@ -250,8 +250,6 @@
     'zh':[u'參考文獻',u'参考文献',u'參考資料',u'参考资料',
           u'資料來源',u'资料来源',u'參見',u'参见',u'參閱',u'参阅'],
 }
 
-num_google_queries = 0 ; num_yahoo_queries = 0 ; num_msn_queries = 0
-
 if enable_color:
     warn_color = '\03{%s}' % warn_color
     error_color = '\03{%s}' % error_color
@@ -271,15 +269,6 @@
 def error(text ,prefix = None):
     _output(text, prefix = prefix, color = error_color)
 
-def print_stats():
-    wikipedia.output('\n'
-                     'Search engine | number of queries\n'
-                     '---------------------------------\n'
-                     'Google        | %s\n'
-                     'Yahoo!        | %s\n'
-                     'Live Search   | %s\n'
-                     % (num_google_queries, num_yahoo_queries, num_msn_queries))
-
 def skip_section(text):
     l = list()
     for s in sections_to_skip.values():
@@ -307,107 +296,127 @@
         return text[:start.start()]
     return text
 
-def exclusion_file_list():
-    for i in pages_for_exclusion_database:
-        path = wikipedia.config.datafilepath(appdir, i[0], i[2])
-        wikipedia.config.makepath(path)
-        p = wikipedia.Page(wikipedia.getSite(i[0]), i[1])
-        yield p, path
+class URLExclusion:
+    def __init__(self):
+        self.URLlist = set()
+        self.scan()
 
-def load_pages(force_update = False):
-    for page, path in exclusion_file_list():
-        try:
-            force_load = force_update
-            if not os.path.exists(path):
-                print 'Creating file \'%s\' (%s)' % (
-                    wikipedia.config.shortpath(path), page.aslink())
-                force_load = True
-            else:
-                file_age = time.time() - os.path.getmtime(path)
-                if file_age > 24 * 60 * 60:
-                    print 'Updating file \'%s\' (%s)' % (
-                        wikipedia.config.shortpath(path), page.aslink())
-                    force_load = True
-        except OSError:
-            raise
+    def pages_list(self):
+        for i in pages_for_exclusion_database:
+            path = wikipedia.config.datafilepath(appdir, i[0], i[2])
+            wikipedia.config.makepath(path)
+            page = wikipedia.Page(wikipedia.getSite(i[0]), i[1])
+            yield page, path
 
-        if force_load:
-            data = None
+    def download(self, force_update = False):
+        for page, path in self.pages_list():
+            download = force_update
             try:
-                data = page.get()
-            except KeyboardInterrupt:
+                if not os.path.exists(path):
+                    print 'Creating file \'%s\' (%s)' % (wikipedia.config.shortpath(path),
+                                                         page.aslink())
+                    download = True
+                else:
+                    file_age = time.time() - os.path.getmtime(path)
+                    if download or file_age > 24 * 60 * 60:
+                        print 'Updating file \'%s\' (%s)' % (
+                            wikipedia.config.shortpath(path), page.aslink())
+                        download = True
+            except OSError:
                 raise
-            except wikipedia.IsRedirectPage, arg:
-                data = page.getRedirectTarget().get()
-            except:
-                error('Getting page failed')
 
-            if data:
-                f = codecs.open(path, 'w', 'utf-8')
-                f.write(data)
-                f.close()
-    return
+            if download:
+                data = None
+                try:
+                    data = page.get()
+                except KeyboardInterrupt:
+                    raise
+                except wikipedia.IsRedirectPage, arg:
+                    data = page.getRedirectTarget().get()
+                except:
+                    error('Getting page failed')
 
-def check_list(url, clist, verbose = False):
-    for entry in clist:
-        if entry:
-            if url.find(entry) != -1:
-                if verbose > 1:
-                    warn('URL Excluded: %s\nReason: %s' % (url, entry))
-                elif verbose:
-                    warn('URL Excluded: %s' % url)
-                return True
+                if data:
+                    f = codecs.open(path, 'w', 'utf-8')
+                    f.write(data)
+                    f.close()
+
+    def update(self):
+        self.download(force_update = True)
+        self.scan()
 
-def exclusion_list():
-    prelist = []
-    result_list = []
-    load_pages()
+    def check(self, url, verbose = False):
+        for entry in self.URLlist:
+            if url.find(entry) != -1:
+                if verbose > 1:
+                    warn('URL Excluded: %s\nReason: %s' % (url, entry))
+                elif verbose:
+                    warn('URL Excluded: %s' % url)
+                return True
+        return False
 
-    for page, path in exclusion_file_list():
-        if 'exclusion_list.txt' in path:
-            result_list += re.sub("</?pre>","",
-                                  read_file(path,
-                                            cut_comment=True,
-                                            cut_newlines=True)
-                                  ).splitlines()
-        else:
-            data = read_file(path)
-            # wikipedia:en:Wikipedia:Mirrors and forks
-            prelist += re.findall("(?i)url\s*=\s*<nowiki>(?:http://)?(.*)</nowiki>", data)
-            prelist += re.findall("(?i)\*\s*Site:\s*\[?(?:http://)?(.*)\]?", data)
-            # wikipedia:it:Wikipedia:Cloni
-            if 'it/Cloni.txt' in path:
-                prelist += re.findall('(?mi)^==(?!=)\s*\[?\s*(?:<nowiki>)?\s*(?:http://)?(.*?)(?:</nowiki>)?\s*\]?\s*==', data)
-    list1 = []
-    for entry in prelist:
-        list1 += entry.split(", ")
-    list2 = []
-    for entry in list1:
-        list2 += entry.split("and ")
-    for entry in list2:
-        # Remove unnecessary part of URL
-        entry = re.sub("http://", "", entry)
-        entry = re.sub("www.", "", entry)
-        entry = re.sub("</?nowiki>", "", entry)
-        if entry:
-            if '/' in entry:
-                entry = entry[:entry.rfind('/')]
+    def scan(self):
+        prelist = [] ; result_list = []
+        self.download()
 
-            entry = re.sub("\s.*", "", entry)
+        for page, path in self.pages_list():
+            if 'exclusion_list.txt' in path:
+                result_list += re.sub("</?pre>","",
+                                      read_file(path,
+                                                cut_comment=True,
+                                                cut_newlines=True)
+                                      ).splitlines()
+            else:
+                data = read_file(path)
+                # wikipedia:en:Wikipedia:Mirrors and forks
+                prelist += re.findall("(?i)url\s*=\s*<nowiki>(?:http://)?(.*)</nowiki>", data)
+                prelist += re.findall("(?i)\*\s*Site:\s*\[?(?:http://)?(.*)\]?", data)
+                # wikipedia:it:Wikipedia:Cloni
+                if 'it/Cloni.txt' in path:
+                    prelist += re.findall('(?mi)^==(?!=)\s*\[?\s*(?:<nowiki>)?\s*(?:http://)?(.*?)(?:</nowiki>)?\s*\]?\s*==', data)
+        list1 = []
+        for entry in prelist:
+            list1 += entry.split(", ")
+        list2 = []
+        for entry in list1:
+            list2 += entry.split("and ")
+        for entry in list2:
+            # Remove unnecessary part of URL
+            entry = re.sub("http://", "", entry)
+            entry = re.sub("www.", "", entry)
+            entry = re.sub("</?nowiki>", "", entry)
+            if entry:
+                if '/' in entry:
+                    entry = entry[:entry.rfind('/')]
 
-            if len(entry) > 4:
-                result_list.append(entry)
+                entry = re.sub("\s.*", "", entry)
 
-    result_list += read_file(
-        wikipedia.config.datafilepath(appdir, 'exclusion_list.txt'),
-        cut_comment = True, cut_newlines = True
-        ).splitlines()
+                if len(entry) > 4:
+                    result_list.append(entry)
 
-    for i in range(len(result_list)):
-        result_list[i] = re.sub('\s+$', '', result_list[i])
+        result_list += read_file(
+            wikipedia.config.datafilepath(appdir, 'exclusion_list.txt'),
+            cut_comment = True, cut_newlines = True
+            ).splitlines()
 
-    return result_list
+        for i in range(len(result_list)):
+            cleaned = re.sub('\s+$', '', result_list[i])
+            if cleaned:
+                self.URLlist.add(cleaned)
 
+    def sanity_check(self):
+        print "Exclusion list sanity check..."
+        for entry in self.URLlist:
+            if (not '.' in entry and not '/' in entry) or len(entry) < 5:
+                print "** " + entry
+
+    def dump(self):
+        f = open(wikipedia.config.datafilepath(appdir, 'exclusion_list.dump'), 'w')
+        f.write('\n'.join(self.URLlist))
+        f.close()
+        print "Exclusion list dump saved."
+
+
 def read_file(filename, cut_comment = False, cut_newlines = False):
     text = u""
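The URLExclusion class added above replaces the former exclusion_file_list(), load_pages(), exclusion_list(), check_list(), exclusion_list_sanity_check() and exclusion_list_dump() helpers. A minimal usage sketch of its public surface as defined in this hunk (the URL is purely illustrative):

    excl = URLExclusion()            # scan() downloads the exclusion pages and fills URLlist
    excl.update()                    # force a fresh download and re-scan
    excl.sanity_check()              # print entries that look too short or malformed
    if excl.check('http://example.com/wiki-mirror/Page', verbose=True):
        print 'URL is on the exclusion list'
    excl.dump()                      # write exclusion_list.dump under the bot data directory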
@@ -565,18 +574,6 @@
     return text
 
-def exclusion_list_sanity_check():
-    print "Exclusion list sanity check..."
-    for entry in excl_list:
-        if (not '.' in entry and not '/' in entry) or len(entry) < 5:
-            print "** " + entry
-
-def exclusion_list_dump():
-    f = open(wikipedia.config.datafilepath(appdir, 'exclusion_list.dump'), 'w')
-    f.write('\n'.join(excl_list))
-    f.close()
-    print "Exclusion list dump saved."
-
 def n_index(text, n, sep):
     pos = 0
     while n>0:
@@ -603,73 +600,270 @@
             break
     return l
-def query(lines = [], max_query_len = 1300, wikicode = True):
-    # Google max_query_len = 1480?
-    # - '-Wikipedia ""' = 1467
-
-    # Google limit queries to 32 words.
-
-    output = u""
-    n_query = 0
-    previous_group_url = 'none'
-
-    for line in lines:
-        if wikicode:
-            line = remove_wikicode(line)
-        for search_words in mysplit(line, number_of_words, " "):
-            if len(search_words) > min_query_string_len:
-                if config.copyright_economize_query:
-                    if economize_query(search_words):
-                        warn(search_words, prefix = 'Text excluded')
-                        consecutive = False
-                        continue
-                n_query += 1
-                #wikipedia.output(search_words)
-                if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page:
-                    warn(u"Max query limit for page reached")
-                    return output
-                if config.copyright_skip_query > n_query:
-                    continue
-                if len(search_words) > max_query_len:
-                    search_words = search_words[:max_query_len]
-                    consecutive = False
-                    if " " in search_words:
-                        search_words = search_words[:search_words.rindex(" ")]
-
-                results = get_results(search_words)
-
-                group_url = '' ; cmp_group_url = ''
-
-                for url, engine, comment in results:
-                    if comment:
-                        group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment))
-                    else:
-                        group_url += '\n*%s - %s' % (engine, url)
-                    cmp_group_url += '\n*%s - %s' % (engine, url)
-                if results:
-                    group_url_list = group_url.splitlines()
-                    cmp_group_url_list = cmp_group_url.splitlines()
-                    group_url_list.sort()
-                    cmp_group_url_list.sort()
-                    group_url = '\n'.join(group_url_list)
-                    cmp_group_url = '\n'.join(cmp_group_url_list)
-                    if previous_group_url == cmp_group_url:
-                        if consecutive:
-                            output += ' ' + search_words
-                        else:
-                            output += '\n**' + search_words
-                    else:
-                        output += group_url + '\n**' + search_words
-
-                    previous_group_url = cmp_group_url
-                    consecutive = True
-                else:
-                    consecutive = False
-            else:
-                consecutive = False
-
-    return output
+class SearchEngine:
+
+    num_google_queries = num_yahoo_queries = num_msn_queries = 0
+
+    def __init__(self):
+        self.URLexcl = URLExclusion()
+
+    def __del__(self):
+        self.print_stats()
+
+    def query(self, lines = [], max_query_len = 1300, wikicode = True):
+        # Google max_query_len = 1480?
+        # - '-Wikipedia ""' = 1467
+
+        # Google limit queries to 32 words.
+
+        n_query = 0
+        output = unicode()
+        previous_group_url = 'null'
+
+        for line in lines:
+            if wikicode:
+                line = remove_wikicode(line)
+            for search_words in mysplit(line, number_of_words, " "):
+                if len(search_words) > min_query_string_len:
+                    if config.copyright_economize_query:
+                        if economize_query(search_words):
+                            warn(search_words, prefix = 'Text excluded')
+                            consecutive = False
+                            continue
+                    n_query += 1
+                    #wikipedia.output(search_words)
+                    if config.copyright_max_query_for_page and n_query > config.copyright_max_query_for_page:
+                        warn(u"Max query limit for page reached")
+                        return output
+                    if config.copyright_skip_query > n_query:
+                        continue
+                    if len(search_words) > max_query_len:
+                        search_words = search_words[:max_query_len]
+                        consecutive = False
+                        if " " in search_words:
+                            search_words = search_words[:search_words.rindex(" ")]
+
+                    results = self.get_results(search_words)
+
+                    group_url = '' ; cmp_group_url = ''
+
+                    for url, engine, comment in results:
+                        if comment:
+                            group_url += '\n*%s - %s (%s)' % (engine, url, "; ".join(comment))
+                        else:
+                            group_url += '\n*%s - %s' % (engine, url)
+                        cmp_group_url += '\n*%s - %s' % (engine, url)
+                    if results:
+                        group_url_list = group_url.splitlines()
+                        cmp_group_url_list = cmp_group_url.splitlines()
+                        group_url_list.sort()
+                        cmp_group_url_list.sort()
+                        group_url = '\n'.join(group_url_list)
+                        cmp_group_url = '\n'.join(cmp_group_url_list)
+                        if previous_group_url == cmp_group_url:
+                            if consecutive:
+                                output += ' ' + search_words
+                            else:
+                                output += '\n**' + search_words
+                        else:
+                            output += group_url + '\n**' + search_words
+
+                        previous_group_url = cmp_group_url
+                        consecutive = True
+                    else:
+                        consecutive = False
+                else:
+                    consecutive = False
+
+        return output
+
+    def add_in_urllist(self, url, add_item, engine, cache_url = None):
+        if (engine == 'google' and config.copyright_check_in_source_google) or \
+           (engine == 'yahoo' and config.copyright_check_in_source_yahoo) or \
+           (engine == 'msn' and config.copyright_check_in_source_msn):
+            check_in_source = True
+        else:
+            check_in_source = False
+
+        if check_in_source or config.copyright_show_date or config.copyright_show_length:
+            s = None
+            cache = False
+
+            # list to store date, length, cache URL
+            comment = list()
+
+            try:
+                s = WebPage(add_item, self.URLexcl)
+            except URL_exclusion:
+                pass
+            except NoWebPage:
+                cache = True
+
+            if s:
+                # Before of add url in result list, perform the check in source
+                if check_in_source:
+                    if s.check_in_source():
+                        return
+
+                if config.copyright_show_date:
+                    date = s.lastmodified()
+                    if date:
+                        if date[:3] != time.localtime()[:3]:
+                            comment.append("%s/%s/%s" % (date[2], date[1], date[0]))
+
+                unit = 'bytes'
+
+                if config.copyright_show_length:
+                    length = s.length()
+                    if length > 1024:
+                        # convert in kilobyte
+                        length /= 1024
+                        unit = 'KB'
+                        if length > 1024:
+                            # convert in megabyte
+                            length /= 1024
+                            unit = 'MB'
+                    if length > 0:
+                        comment.append("%d %s" % (length, unit))
+
+            if cache:
+                if cache_url:
+                    if engine == 'google':
+                        comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % urllib.quote(short_url(add_item)))
+                    elif engine == 'yahoo':
+                        #cache = False
+                        #comment.append('[%s Yahoo cache]' % re.sub('&appid=[^&]*','', urllib2.unquote(cache_url)))
+                        comment.append("''Yahoo cache''")
+                    elif engine == 'msn':
+                        comment.append('[%s Live cache]' % re.sub('&lang=[^&]*','', cache_url))
+                else:
+                    comment.append('[http://web.archive.org/*/%s archive.org]' % short_url(add_item))
+
+        for i in range(len(url)):
+            if add_item in url[i]:
+                if engine not in url[i][1]:
+                    if url[i][2]:
+                        comment = url[i][2]
+                    url[i] = (add_item, url[i][1] + ', ' + engine, comment)
+                return
+        url.append((add_item, engine, comment))
+        return
+
+    def soap(self, engine, query, url, numresults = 10):
+        print " %s query..." % engine.capitalize()
+        search_request_retry = config.copyright_connection_tries
+        query_success = False
+
+        while search_request_retry:
+            try:
+                if engine == 'google':
+                    import google
+                    google.LICENSE_KEY = config.google_key
+                    data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
+                    for entry in data.results:
+                        self.add_in_urllist(url, entry.URL, 'google', entry.cachedSize)
+
+                    self.num_google_queries += 1
+
+                elif engine == 'yahoo':
+                    import yahoo.search.web
+                    data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
+                                                      query.encode('utf_8'),
+                                                      no_result_with_those_words
+                                                      ), results = numresults)
+                    for entry in data.parse_results():
+                        cacheurl = None
+                        if entry.Cache:
+                            cacheurl = entry.Cache.Url
+                        self.add_in_urllist(url, entry.Url, 'yahoo', cacheurl)
+
+                    self.num_yahoo_queries += 1
+
+                elif engine == 'msn':
+                    #max_query_len = 150?
+                    from SOAPpy import WSDL
+
+                    try:
+                        server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
+                    except Exception, err:
+                        error("Live Search Error: %s" % err)
+                        raise
+
+                    params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
+                              'CultureInfo': region_code, 'SafeSearch': 'Off', 'Requests': {
+                              'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
+
+                    results = ''
+
+                    server_results = server.Search(Request = params)
+                    if server_results.Responses[0].Results:
+                        results = server_results.Responses[0].Results[0]
+                    if results:
+                        # list or instance?
+                        if type(results) == type([]):
+                            for entry in results:
+                                cacheurl = None
+                                if hasattr(entry, 'CacheUrl'):
+                                    cacheurl = entry.CacheUrl
+                                self.add_in_urllist(url, entry.Url, 'msn', cacheurl)
+                        else:
+                            cacheurl = None
+                            if hasattr(results, 'CacheUrl'):
+                                cacheurl = results.CacheUrl
+                            self.add_in_urllist(url, results.Url, 'msn', cacheurl)
+
+                    self.num_msn_queries += 1
+
+                search_request_retry = 0
+                query_success = True
+            except KeyboardInterrupt:
+                raise
+            except Exception, err:
+                # Something is going wrong...
+                if 'Daily limit' in str(err) or 'Insufficient quota for key' in str(err):
+                    exceeded_in_queries('google')
+                elif 'limit exceeded' in str(err):
+                    exceeded_in_queries('yahoo')
+                elif 'Invalid value for AppID in request' in str(err):
+                    exceeded_in_queries('msn')
+                else:
+                    raise
+
+                error(err, "Got an error")
+
+            if search_request_retry:
+                search_request_retry -= 1
+
+        if not query_success:
+            error('No response for: %s' % query, "Error (%s)" % engine)
+
+    def get_results(self, query, numresults = 10):
+        result_list = list()
+        query = re.sub("[()\"<>]", "", query)
+        # wikipedia.output(query)
+        if config.copyright_google:
+            self.soap('google', query, result_list)
+        if config.copyright_yahoo:
+            self.soap('yahoo', query, result_list, numresults = numresults)
+        if config.copyright_msn:
+            self.soap('msn', query, result_list)
+
+        offset = 0
+        for i in range(len(result_list)):
+            if self.URLexcl.check(result_list[i + offset][0], verbose = True):
+                result_list.pop(i + offset)
+                offset += -1
+        return result_list
+
+    def print_stats(self):
+        wikipedia.output('\n'
+                         'Search engine | number of queries\n'
+                         '---------------------------------\n'
+                         'Google        | %s\n'
+                         'Yahoo!        | %s\n'
+                         'Live Search   | %s\n'
+                         % (self.num_google_queries, self.num_yahoo_queries,
+                            self.num_msn_queries))
+
 source_seen = set()
 positive_source_seen = set()
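The SearchEngine class above absorbs the former module-level query(), add_in_urllist(), soap(), get_results() and print_stats() helpers and keeps the per-engine query counters as class attributes, printing them from __del__. A hedged sketch of the call flow, assuming the relevant engines and API keys are enabled in the user's config exactly as the methods expect (the query strings are illustrative):

    engine = SearchEngine()                     # also instantiates URLExclusion (self.URLexcl)
    hits = engine.get_results(u'a sentence to look up')
    for url, engine_name, comment in hits:      # comment may carry date, length or cache notes
        print engine_name, url, comment
    report = engine.query(lines=[u'full text of a wiki page'])
    engine.print_stats()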
@@ -683,12 +877,13 @@
     """
     """
 
-    def __init__(self, url):
+    def __init__(self, url, URLExcl):
         """
         """
         global source_seen
+        self.URLexcludedlist = URLExcl.URLlist
 
-        if url in source_seen or check_list(url, excl_list):
+        if url in source_seen or URLExcl.check(url):
             raise URL_exclusion
 
         self._url = url
@@ -746,8 +941,8 @@
     def check_regexp(self, reC, text, filename = None):
         m = reC.search(text)
         if m:
-            global excl_list, positive_source_seen
-            excl_list += [self._url]
+            global positive_source_seen
+            self.URLexcludedlist.add(self._url)
             positive_source_seen.add(self._url)
             if filename:
                 write_log("%s (%s)\n" % (self._url, m.group()), filename)
@@ -796,79 +991,7 @@
         source_seen.add(self._url)
         return False
-def add_in_urllist(url, add_item, engine, cache_url = None):
-    if (engine == 'google' and config.copyright_check_in_source_google) or \
-       (engine == 'yahoo' and config.copyright_check_in_source_yahoo) or \
-       (engine == 'msn' and config.copyright_check_in_source_msn):
-        check_in_source = True
-    else:
-        check_in_source = False
-
-    if check_in_source or config.copyright_show_date or config.copyright_show_length:
-        s = None
-        cache = False
-
-        # list to store date, length, cache URL
-        comment = list()
-
-        try:
-            s = WebPage(add_item)
-        except URL_exclusion:
-            pass
-        except NoWebPage:
-            cache = True
-
-        if s:
-            # Before of add url in result list, perform the check in source
-            if check_in_source:
-                if s.check_in_source():
-                    return
-
-            if config.copyright_show_date:
-                date = s.lastmodified()
-                if date:
-                    if date[:3] != time.localtime()[:3]:
-                        comment.append("%s/%s/%s" % (date[2], date[1], date[0]))
-
-            unit = 'bytes'
-
-            if config.copyright_show_length:
-                length = s.length()
-                if length > 1024:
-                    # convert in kilobyte
-                    length /= 1024
-                    unit = 'KB'
-                    if length > 1024:
-                        # convert in megabyte
-                        length /= 1024
-                        unit = 'MB'
-                if length > 0:
-                    comment.append("%d %s" % (length, unit))
-
-        if cache:
-            if cache_url:
-                if engine == 'google':
-                    comment.append('[http://www.google.com/search?sourceid=navclient&q=cache:%s Google cache]' % urllib.quote(short_url(add_item)))
-                elif engine == 'yahoo':
-                    #cache = False
-                    #comment.append('[%s Yahoo cache]' % re.sub('&appid=[^&]*','', urllib2.unquote(cache_url)))
-                    comment.append("''Yahoo cache''")
-                elif engine == 'msn':
-                    comment.append('[%s Live cache]' % re.sub('&lang=[^&]*','', cache_url))
-            else:
-                comment.append('[http://web.archive.org/*/%s archive.org]' % short_url(add_item))
-
-    for i in range(len(url)):
-        if add_item in url[i]:
-            if engine not in url[i][1]:
-                if url[i][2]:
-                    comment = url[i][2]
-                url[i] = (add_item, url[i][1] + ', ' + engine, comment)
-            return
-    url.append((add_item, engine, comment))
-    return
-
 def exceeded_in_queries(engine):
     """Behavior if an exceeded error occur."""
@@ -883,112 +1006,6 @@
     if config.copyright_exceeded_in_queries == 3:
         raise 'Got a queries exceeded error.'
 
-def soap(engine, query, url, numresults = 10):
-    global num_google_queries, num_yahoo_queries, num_msn_queries
-
-    print " %s query..." % engine.capitalize()
-    search_request_retry = config.copyright_connection_tries
-    query_success = False
-
-    while search_request_retry:
-        try:
-            if engine == 'google':
-                import google
-                google.LICENSE_KEY = config.google_key
-                data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
-                for entry in data.results:
-                    add_in_urllist(url, entry.URL, 'google', entry.cachedSize)
-
-                num_google_queries += 1
-
-            elif engine == 'yahoo':
-                import yahoo.search.web
-                data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
-                                                  query.encode('utf_8'),
-                                                  no_result_with_those_words
-                                                  ), results = numresults)
-                for entry in data.parse_results():
-                    cacheurl = None
-                    if entry.Cache:
-                        cacheurl = entry.Cache.Url
-                    add_in_urllist(url, entry.Url, 'yahoo', cacheurl)
-
-                num_yahoo_queries += 1
-
-            elif engine == 'msn':
-                #max_query_len = 150?
-                from SOAPpy import WSDL
-
-                try:
-                    server = WSDL.Proxy('http://soap.search.msn.com/webservices.asmx?wsdl')
-                except Exception, err:
-                    error("Live Search Error: %s" % err)
-                    raise
-
-                params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
-                          'CultureInfo': region_code, 'SafeSearch': 'Off', 'Requests': {
-                          'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
-
-                results = ''
-
-                server_results = server.Search(Request = params)
-                if server_results.Responses[0].Results:
-                    results = server_results.Responses[0].Results[0]
-                if results:
-                    # list or instance?
-                    if type(results) == type([]):
-                        for entry in results:
-                            cacheurl = None
-                            if hasattr(entry, 'CacheUrl'):
-                                cacheurl = entry.CacheUrl
-                            add_in_urllist(url, entry.Url, 'msn', cacheurl)
-                    else:
-                        cacheurl = None
-                        if hasattr(results, 'CacheUrl'):
-                            cacheurl = results.CacheUrl
-                        add_in_urllist(url, results.Url, 'msn', cacheurl)
-
-                num_msn_queries += 1
-
-            search_request_retry = 0
-            query_success = True
-        except KeyboardInterrupt:
-            raise
-        except Exception, err:
-            # Something is going wrong...
-            if 'Daily limit' in str(err) or 'Insufficient quota for key' in str(err):
-                exceeded_in_queries('google')
-            elif 'limit exceeded' in str(err):
-                exceeded_in_queries('yahoo')
-            elif 'Invalid value for AppID in request' in str(err):
-                exceeded_in_queries('msn')
-            else:
-                error(err, "Got an error")
-
-        if search_request_retry:
-            search_request_retry -= 1
-
-    if not query_success:
-        error('No response for: %s' % query, "Error (%s)" % engine)
-
-def get_results(query, numresults = 10):
-    result_list = list()
-    query = re.sub("[()\"<>]", "", query)
-    # wikipedia.output(query)
-    if config.copyright_google:
-        soap('google', query, result_list)
-    if config.copyright_yahoo:
-        soap('yahoo', query, result_list, numresults = numresults)
-    if config.copyright_msn:
-        soap('msn', query, result_list)
-
-    offset = 0
-    for i in range(len(result_list)):
-        if check_list(result_list[i + offset][0], excl_list, verbose = True):
-            result_list.pop(i + offset)
-            offset += -1
-    return result_list
-
 def get_by_id(title, id):
     return wikipedia.getSite().getUrl("/w/index.php?title=%s&oldid=%s&action=raw" % (title, id))
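Note that the WebPage hunks above wire the same URLExclusion instance into the page fetcher: the constructor now takes it as a second argument (WebPage(add_item, self.URLexcl)), and a positive check_regexp() match adds the offending URL to URLexcl.URLlist instead of the removed global excl_list. A rough sketch of that interaction (the URL is hypothetical; the exception classes are the ones defined elsewhere in copyright.py):

    excl = URLExclusion()
    try:
        page = WebPage('http://example.org/suspected-copy.html', excl)
    except URL_exclusion:
        pass      # URL was already on the exclusion list
    except NoWebPage:
        pass      # only a search-engine cache of the page exists
    else:
        if page.check_in_source():
            # the page credits Wikipedia: its URL is now in excl.URLlist
            print 'source excluded'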
@@ -1011,6 +1028,7 @@
         """
         """
         self.generator = generator
+        self.SearchEngine = SearchEngine()
 
     def run(self):
         """
@@ -1048,7 +1066,7 @@
             if remove_wikicode_dotall:
                 text = remove_wikicode(text, re_dotall = True)
 
-            output = query(lines = text.splitlines(), wikicode = not remove_wikicode_dotall)
+            output = self.SearchEngine.query(lines = text.splitlines(), wikicode = not remove_wikicode_dotall)
             if output:
                 write_log('=== [[' + page.title() + ']] ===' + output + '\n',
                           filename = output_file)
@@ -1144,7 +1162,7 @@
             except ValueError:
                 namespaces.append(arg[11:])
         elif arg.startswith('-forceupdate'):
-            load_pages(force_update = True)
+            URLExclusion().update()
         elif arg == '-repeat':
             repeat = True
         elif arg.startswith('-new'):
@@ -1187,10 +1205,7 @@
         preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)
         bot = CheckRobot(preloadingGen)
         bot.run()
-        print_stats()
 
-excl_list = exclusion_list()
-
 if number_of_words > 22 and config.copyright_msn:
     warn("Live Search requires a lower value for 'number_of_words' variable "
          "(current value is %d, a good value may be 22)." % (number_of_words),
          prefix = 'Warning')
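With the module-level excl_list assignment gone, the exclusion data is now built only when a URLExclusion is instantiated (directly, or indirectly through SearchEngine/CheckRobot), and -forceupdate simply calls URLExclusion().update(). A sketch of the new wiring in main(), with the page title purely illustrative:

    import wikipedia, pagegenerators
    gen = iter([wikipedia.Page(wikipedia.getSite(), u'Sandbox')])   # hypothetical single page
    bot = CheckRobot(pagegenerators.PreloadingGenerator(gen))
    bot.run()     # each page's text now goes through bot.SearchEngine.query()
    # print_stats() is no longer called here; SearchEngine.__del__ reports the totals.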