Revision: 4353
Author:   cosoleto
Date:     2007-09-24 09:17:56 +0000 (Mon, 24 Sep 2007)
Log Message:
-----------
copyright.py: Fixed a problem with the preload generator and the new pages generator (pointed out by Filnik). Added a note to help users of Python 2.5.0 install SOAPpy 0.12.0.
copyright_put.py, copyright_clean.py: Minor fixes and code cleanup.
Modified Paths:
--------------
    trunk/pywikipedia/CONTENTS
    trunk/pywikipedia/copyright.py
    trunk/pywikipedia/copyright_clean.py
    trunk/pywikipedia/copyright_put.py
Modified: trunk/pywikipedia/CONTENTS
===================================================================
--- trunk/pywikipedia/CONTENTS	2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/CONTENTS	2007-09-24 09:17:56 UTC (rev 4353)
@@ -85,7 +85,7 @@
 clean_sandbox.py : This bot makes the cleaned of the page of tests.
 commons_link.py : This robot include commons template to linking Commons and your wiki project.
-copyright.py : This robot check copyright text in Google and Yahoo.
+copyright.py : This robot check copyright text in Google, Yahoo! and Live Search.
 cosmetic_changes.py : Can do slight modifications to a wiki page source code such that the code looks cleaner.
 delete.py : This script can be used to delete pages en masse.
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py	2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright.py	2007-09-24 09:17:56 UTC (rev 4353)
@@ -13,6 +13,10 @@
 Windows Live Search requires to install the SOAPpy module from
 http://pywebsvcs.sf.net and get an AppID from http://search.msn.com/developer.
+If you use Python 2.5 and have SOAPpy version 0.12.0, you must edit three
+files (SOAPpy/Client.py, SOAPpy/Server.py, SOAPpy/Types.py) to fix a simple
+syntax error by moving 'from __future__ imports...' line to beginning of the
+code.
You can run the bot with the following commandline parameters:
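For readers applying the workaround noted above, a minimal sketch of the required layout. The feature name nested_scopes is an assumption about SOAPpy 0.12.0; the point is only that a __future__ import is legal solely as the first statement after the module docstring.

    # Hypothetical before/after for one of the three SOAPpy files.
    # Rejected by Python 2.5 with a SyntaxError:
    #
    #   import types
    #   from __future__ import nested_scopes   # too late, must come first
    #
    # Accepted layout, with the __future__ import moved to the top:

    """Example module docstring."""
    from __future__ import nested_scopes   # assumed feature name; the placement is what matters
    import types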
@@ -75,6 +79,9 @@
__version__='$Id$'
+#
+no_result_with_those_words = '-Wikipedia'
+
 # Try to skip quoted text.
 exclude_quote = True
@@ -413,7 +420,7 @@
     text = re.sub('^[:*]?\s*[“][^”]+[”]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
     # remove URL
-    text = re.sub('https?://[\w/.,;:@&=%#\?_!~*\'|()"+-]+', ' ', text)
+    text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\?_!~*\'|()"+-]+', ' ', text)
 
     # remove Image tags
     text = reImageC.sub("", text)
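As a standalone illustration (not part of the commit) of the widened pattern, ftp:// links are now blanked out along with http(s):// ones before the text is sent to the search engines:

    import re

    # Mirrors the substitution above; the escaped apostrophe keeps the
    # single-quoted pattern valid Python.
    pattern = re.compile('(ftp|https?)://[\w/.,;:@&=%#\?_!~*\'|()"+-]+')

    sample = 'See http://example.org/page and ftp://example.org/file.txt here.'
    print(pattern.sub(' ', sample))   # both links are replaced by a space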
@@ -761,7 +768,7 @@
         search_request_retry = config.copyright_connection_tries
         while search_request_retry:
             try:
-                data = google.doGoogleSearch('-Wikipedia "' + query + '"')
+                data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
                 search_request_retry = 0
                 for entry in data.results:
                     add_in_urllist(url, entry.URL, 'google')
@@ -780,9 +787,10 @@
     if config.copyright_yahoo:
         import yahoo.search.web
         print " Yahoo query..."
-        data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"' +
-               query.encode('utf_8') +
-               '" -Wikipedia', results=numresults)
+        data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
+               query.encode('utf_8'),
+               no_result_with_those_words
+               ), results = numresults)
         search_request_retry = config.copyright_connection_tries
         while search_request_retry:
             try:
@@ -805,7 +813,8 @@
         except:
             print "Live Search Error"
             raise
-        params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+        params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
+                  'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
             'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
 
         search_request_retry = config.copyright_connection_tries
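For reference, a small standalone sketch of how the shared no_result_with_those_words constant now shapes each engine's query string (the sample sentence is made up):

    no_result_with_those_words = '-Wikipedia'
    query = 'some sentence taken from the article'

    print('%s "%s"' % (no_result_with_those_words, query))   # Google / Live Search form
    print('"%s" %s' % (query, no_result_with_those_words))   # Yahoo! form
    # -Wikipedia "some sentence taken from the article"
    # "some sentence taken from the article" -Wikipedia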
@@ -919,9 +928,6 @@
     PageTitles = []
     # IDs which will be processed when the -ids parameter is used
     ids = None
-    # will become True when the user presses a ('yes to all') or uses the -always
-    # commandline paramater.
-    acceptall = False
     # Which namespaces should be processed?
     # default to [] which means all namespaces will be processed
     namespaces = []
@@ -929,6 +935,10 @@
     repeat = False
     #
     text = None
+    # Number of pages to load at a time by Preload generator
+    pageNumber = 40
+    # Default number of pages for NewPages generator
+    number = 60
     firstPageTitle = None
 
     # This factory is responsible for processing command line arguments
@@ -936,11 +946,6 @@
     # to work on.
     genFactory = pagegenerators.GeneratorFactory()
-
-    config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
-    config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
-    config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
-
     # Read commandline parameters.
     for arg in wikipedia.handleArgs():
         if arg == '-y':
@@ -967,11 +972,6 @@
         elif arg.startswith('-text'):
             if len(arg) >= 6:
                 text = arg[6:]
-        elif arg.startswith('-xml'):
-            if len(arg) == 4:
-                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
-            else:
-                xmlFilename = arg[5:]
         elif arg.startswith('-page'):
             if len(arg) == 5:
                 PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
@@ -988,9 +988,12 @@
             repeat = True
         elif arg.startswith('-new'):
             if len(arg) >=5:
-                gen = pagegenerators.NewpagesPageGenerator(number=int(arg[5:]), repeat = repeat)
-            else:
-                gen = pagegenerators.NewpagesPageGenerator(number=60, repeat = repeat)
+                number = int(arg[5:])
+            gen = pagegenerators.NewpagesPageGenerator(number = number, repeat = repeat)
+            # The Preload generator works better if 'pageNumber' is not greater than 'number';
+            # this avoids unnecessary delay.
+            if number < pageNumber:
+                pageNumber = number
         else:
             generator = genFactory.handleArg(arg)
             if generator:
@@ -1000,13 +1003,17 @@
         pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
         gen = iter(pages)
 
+    config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
+    config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
+    config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
+
     if ids:
         checks_by_ids(ids)
     if not gen and not ids and not text:
         # syntax error, show help text from the top of this file
         wikipedia.output(__doc__, 'utf-8')
-    
+
     if text:
         output = query(lines = text.splitlines())
         if output:
@@ -1017,7 +1024,7 @@
         sys.exit()
     if namespaces != []:
         gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
-    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
+    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)
     bot = CheckRobot(preloadingGen)
     bot.run()
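The interplay between 'number' and 'pageNumber' introduced above can be pictured with a toy batching generator (a sketch only; the real PreloadingGenerator also fetches the page texts in bulk):

    def preload_in_batches(titles, batch_size):
        # Stand-in for PreloadingGenerator: group incoming titles so each
        # group could be fetched with a single request.
        batch = []
        for title in titles:
            batch.append(title)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    # With -new:5 only five titles ever arrive, so capping the batch size at
    # 'number' lets the first batch go out at once instead of waiting for the
    # default 40 titles that will never come.
    number, pageNumber = 5, 40
    if number < pageNumber:
        pageNumber = number
    for batch in preload_in_batches(['Page %d' % i for i in range(number)], pageNumber):
        print(batch)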
Modified: trunk/pywikipedia/copyright_clean.py
===================================================================
--- trunk/pywikipedia/copyright_clean.py	2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright_clean.py	2007-09-24 09:17:56 UTC (rev 4353)
@@ -27,7 +27,7 @@
 #
 # {{botbox|title|newid|oldid|author|...}}
-rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{botbox\|.*?\|(.*?)\|")
+rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")
def query_yurik_api(data):
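A quick standalone check of what the widened template pattern accepts (the sample template calls are made up; the pattern mirrors the compiled expression above):

    import re

    pattern = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")

    sample = ("{{botbox|Some title|12345|67890|Somebody|}}\n"
              "{{/box|Other title|54321|98765|Somebody|}}\n")
    print(pattern.findall(sample))   # ['12345', '54321'] - the newid field of both forms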
@@ -48,21 +48,8 @@
return data
-def manage_query(items, mode = "titles"):
-    """No more of 100 titles at a time using Yurik's API"""
-
-    global query_results
-
-    for s in mysplit(items, 100, "|"):
-        if mode == "titles":
-            query_results.append(simplejson.loads(query_yurik_api(('titles', s))))
-
-        elif mode == 'revids':
-            query_results2.append(simplejson.loads(query_yurik_api(('revids', s))))
-    return
-
 def page_exist(title):
-    for pageobjs in query_results:
+    for pageobjs in query_results_titles:
         for key in pageobjs['pages']:
             if pageobjs['pages'][key]['title'] == title:
                 if int(key) >= 0:
@@ -71,7 +58,7 @@
     return False
 def revid_exist(revid):
-    for pageobjs in query_results2:
+    for pageobjs in query_results_revids:
         for id in pageobjs['pages']:
             for rv in range(len(pageobjs['pages'][id]['revisions'])):
                 if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid):
@@ -85,7 +72,7 @@
 for page in gen:
     data = page.get()
-    wikipedia.output(page.title())
+    wikipedia.output(page.aslink())
     output = ''
 #
@@ -104,11 +91,14 @@
     titles = headC.findall(data)
     revids = rev_templateC.findall(data)
-    query_results = list()
-    query_results2 = list()
+    query_results_titles = list()
+    query_results_revids = list()
-    manage_query(query.ListToParam(titles))
-    manage_query(query.ListToParam(revids), "revids")
+    # No more than 100 titles at a time using Yurik's API
+    for s in mysplit(query.ListToParam(titles), 100, "|"):
+        query_results_titles.append(simplejson.loads(query_yurik_api(('titles', s))))
+    for s in mysplit(query.ListToParam(revids), 100, "|"):
+        query_results_revids.append(simplejson.loads(query_yurik_api(('revids', s))))
     comment_entry = list()
     add_separator = False
@@ -131,7 +121,7 @@
         exist = True
         if page_exist(title):
             # check {{botbox}}
-            revid = re.search("{{botbox\|.*?\|(.*?)\|", data[head.end():stop])
+            revid = re.search("{{(?:/box|botbox)\|.*?\|(.*?)\|", data[head.end():stop])
             if revid:
                 if not revid_exist(revid.group(1)):
                     exist = False
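The inlined query loops above depend on mysplit() to keep every request at or below the API limit of 100 items; a toy equivalent (not the pywikipedia helper itself) behaves like this:

    def chunked(joined, size, sep):
        # Split an 'a|b|c|...' string into pieces of at most 'size' items,
        # mirroring what mysplit(items, 100, "|") is used for above.
        items = joined.split(sep)
        for i in range(0, len(items), size):
            yield sep.join(items[i:i + size])

    titles = '|'.join('Title %d' % i for i in range(250))
    print([piece.count('|') + 1 for piece in chunked(titles, 100, '|')])   # [100, 100, 50]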
Modified: trunk/pywikipedia/copyright_put.py
===================================================================
--- trunk/pywikipedia/copyright_put.py	2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright_put.py	2007-09-24 09:17:56 UTC (rev 4353)
@@ -38,8 +38,8 @@
 }
 stat_msg = {
-    'en': [u'Statistics', u'Page', u'Entries', u'Total', 'Update'],
-    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Totale', u'Ultimo aggiornamento'],
+    'en': [u'Statistics', u'Page', u'Entries', u'Size', u'Total', 'Update'],
+    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Lunghezza', u'Totale', u'Ultimo aggiornamento'],
 }
 wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path)
@@ -87,10 +87,11 @@
 ! %s
 ! %s
 ! %s
+! %s
 |-
-""" % ( msg[1], msg[2], 'Google', 'Yahoo', 'Live Search' )
+""" % ( msg[1], msg[2], msg[3], 'Google', 'Yahoo', 'Live Search' )
-    gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0
+    gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0 ; sn = 0 ; snt = 0
     for page in gen:
         data = page.get()
@@ -100,18 +101,19 @@
         mn = stat_sum('(msn|live)', data)
         en = len(re.findall('=== \[\[', data))
+        sn = len(data)
-        gnt += gn ; ynt += yn ; mnt += mn ; ent += en
+        gnt += gn ; ynt += yn ; mnt += mn ; ent += en ; snt += sn
-        output += u"|%s||%s||%s||%s||%s\n|-\n" % (page.aslink(), en, gn, yn, mn)
+        output += u"|%s||%s||%s KB||%s||%s||%s\n|-\n" % (page.aslink(), en, sn / 1024, gn, yn, mn)
     output += u"""|
 ||||||||
 |-
-|'''%s'''||%s||%s||%s||%s
+|'''%s'''||%s||%s KB||%s||%s||%s
 |-
-|colspan="5" align=right style="background-color:#eeeeee;"|<small>''%s: %s''</small>
+|colspan="6" align=right style="background-color:#eeeeee;"|<small>''%s: %s''</small>
 |}
-""" % (msg[3], ent, gnt, ynt, mnt, msg[4], time.strftime("%d " + "%s" % (date.monthName(wikipedia.getSite().language(), time.localtime()[1])) + " %Y"))
+""" % (msg[4], ent, snt / 1024, gnt, ynt, mnt, msg[5], time.strftime("%d " + "%s" % (date.monthName(wikipedia.getSite().language(), time.localtime()[1])) + " %Y"))
return output
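A rough check of the new Size column (standalone arithmetic with made-up numbers; the bot runs under Python 2, where '/' on two integers floors the result):

    data = 'x' * 150000          # pretend report-page text
    sn = len(data)               # as in sn = len(data) above
    row = u"|%s||%s||%s KB||%s||%s||%s" % ('[[Example]]', 3, sn / 1024, 2, 1, 0)
    print(row)                   # |[[Example]]||3||146 KB||2||1||0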