Revision: 4353
Author: cosoleto
Date: 2007-09-24 09:17:56 +0000 (Mon, 24 Sep 2007)
Log Message:
-----------
copyright.py: Fixed a problem with the preload generator and the new pages generator (pointed out by
Filnik). Added a note to help users of Python 2.5.0 install SOAPpy 0.12.0.
copyright_put.py, copyright_clean.py: Minor fixes and code cleanup.
Modified Paths:
--------------
trunk/pywikipedia/CONTENTS
trunk/pywikipedia/copyright.py
trunk/pywikipedia/copyright_clean.py
trunk/pywikipedia/copyright_put.py
Modified: trunk/pywikipedia/CONTENTS
===================================================================
--- trunk/pywikipedia/CONTENTS 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/CONTENTS 2007-09-24 09:17:56 UTC (rev 4353)
@@ -85,7 +85,7 @@
clean_sandbox.py : This bot cleans the sandbox (test) page.
commons_link.py : This robot adds a commons template to link Commons and
your wiki project.
-copyright.py : This robot check copyright text in Google and Yahoo.
+copyright.py : This robot checks copyright text in Google, Yahoo! and Live Search.
cosmetic_changes.py : Can do slight modifications to a wiki page source code
such that the code looks cleaner.
delete.py : This script can be used to delete pages en masse.
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright.py 2007-09-24 09:17:56 UTC (rev 4353)
@@ -13,6 +13,10 @@
Windows Live Search requires installing the SOAPpy module from
http://pywebsvcs.sf.net and getting an AppID from
http://search.msn.com/developer.
+If you use Python 2.5 with SOAPpy 0.12.0, you must edit three files
+(SOAPpy/Client.py, SOAPpy/Server.py, SOAPpy/Types.py) to fix a simple
+syntax error: move the 'from __future__ import ...' line to the very
+beginning of each file.
You can run the bot with the following commandline parameters:
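For readers applying the SOAPpy fix by hand, a minimal sketch of the error and the repair (the affected import in SOAPpy 0.12.0 is assumed to be nested_scopes; the ident line is an illustrative placeholder):

    # Broken layout as shipped in SOAPpy/Client.py, Server.py and Types.py:
    # a statement precedes the __future__ import, so Python 2.5 raises
    # "SyntaxError: from __future__ imports must occur at the beginning
    # of the file".
    #
    #     ident = '$Id$'          # some statement comes first
    #     from __future__ import nested_scopes
    #
    # Fixed layout: the __future__ import is the first statement.
    from __future__ import nested_scopes
    ident = '$Id$'                # illustrative placeholder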
@@ -75,6 +79,9 @@
__version__='$Id$'
+#
+no_result_with_those_words = '-Wikipedia'
+
# Try to skip quoted text.
exclude_quote = True
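A quick sketch of how the new no_result_with_those_words constant is interpolated into the search queries further down (the sample sentence is made up):

    no_result_with_those_words = '-Wikipedia'
    query = u'some sentence taken from the page under test'

    # Google and Live Search take the exclusion term before the quoted text:
    print '%s "%s"' % (no_result_with_those_words, query)
    # -Wikipedia "some sentence taken from the page under test"

    # Yahoo! takes it after the quoted text:
    print '"%s" %s' % (query.encode('utf_8'), no_result_with_those_words)
    # "some sentence taken from the page under test" -Wikipedia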
@@ -413,7 +420,7 @@
text = re.sub('^[:*]?\s*[“][^”]+[”]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
# remove URL
-    text = re.sub('https?://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text)
+    text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text)
# remove Image tags
text = reImageC.sub("", text)
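The widened URL pattern now also blanks ftp:// links; a self-contained check (character class slightly simplified from the one in the diff):

    import re

    url_re = re.compile(r'(ftp|https?)://[\w/.,;:@&=%#\?_!~*\'|()"+-]+')
    sample = u'See http://example.org/a and ftp://example.org/b here.'
    print url_re.sub(' ', sample)
    # See   and   here.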
@@ -761,7 +768,7 @@
search_request_retry = config.copyright_connection_tries
while search_request_retry:
try:
-                data = google.doGoogleSearch('-Wikipedia "' + query + '"')
+                data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
search_request_retry = 0
for entry in data.results:
add_in_urllist(url, entry.URL, 'google')
@@ -780,9 +787,10 @@
if config.copyright_yahoo:
import yahoo.search.web
print " Yahoo query..."
-        data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"' + query.encode('utf_8') + '" -Wikipedia', results=numresults)
+        data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
+            query.encode('utf_8'),
+            no_result_with_those_words
+        ), results = numresults)
search_request_retry = config.copyright_connection_tries
while search_request_retry:
try:
@@ -805,7 +813,8 @@
except:
print "Live Search Error"
raise
-            params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+            params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
+                      'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
                          'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
search_request_retry = config.copyright_connection_tries
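For reference, the reworked Live Search payload as a standalone dict (the AppID value is a placeholder for config.msn_appid; the SOAP call itself happens elsewhere in the script):

    no_result_with_those_words = '-Wikipedia'
    query = u'some sentence taken from the page under test'
    params = {'AppID': 'YOUR-APPID-HERE',
              'Query': '%s "%s"' % (no_result_with_those_words, query),
              'CultureInfo': 'en-US', 'SafeSearch': 'Off',
              'Requests': {'SourceRequest': {'Source': 'Web', 'Offset': 0,
                                             'Count': 10, 'ResultFields': 'All'}}}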
@@ -919,9 +928,6 @@
PageTitles = []
# IDs which will be processed when the -ids parameter is used
ids = None
-    # will become True when the user presses a ('yes to all') or uses the -always
-    # commandline parameter.
-    acceptall = False
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -929,6 +935,10 @@
repeat = False
#
text = None
+    # Number of pages to load at a time with the preloading generator
+    pageNumber = 40
+    # Default number of pages for the NewPages generator
+    number = 60
firstPageTitle = None
# This factory is responsible for processing command line arguments
@@ -936,11 +946,6 @@
# to work on.
genFactory = pagegenerators.GeneratorFactory()
-
-    config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
-    config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
-    config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
-
# Read commandline parameters.
for arg in wikipedia.handleArgs():
if arg == '-y':
@@ -967,11 +972,6 @@
elif arg.startswith('-text'):
if len(arg) >= 6:
text = arg[6:]
- elif arg.startswith('-xml'):
- if len(arg) == 4:
-                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
- else:
- xmlFilename = arg[5:]
elif arg.startswith('-page'):
if len(arg) == 5:
            PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
@@ -988,9 +988,12 @@
repeat = True
elif arg.startswith('-new'):
if len(arg) >=5:
-                gen = pagegenerators.NewpagesPageGenerator(number=int(arg[5:]), repeat = repeat)
-            else:
-                gen = pagegenerators.NewpagesPageGenerator(number=60, repeat = repeat)
+                number = int(arg[5:])
+            gen = pagegenerators.NewpagesPageGenerator(number = number, repeat = repeat)
+            # The preloading generator works better if 'pageNumber' is not
+            # greater than 'number'; this avoids unnecessary delay.
+            if number < pageNumber:
+                pageNumber = number
else:
generator = genFactory.handleArg(arg)
if generator:
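A minimal sketch of the pageNumber/number coupling introduced above (values are illustrative; the generator calls are commented out because they need a live site):

    number = 25        # pages requested with -new:25
    pageNumber = 40    # default preload batch size

    # Never preload more pages than the NewPages generator will yield;
    # per the comment in the diff, this avoids unnecessary delay.
    if number < pageNumber:
        pageNumber = number

    # gen = pagegenerators.NewpagesPageGenerator(number = number, repeat = repeat)
    # preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)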
@@ -1000,13 +1003,17 @@
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
gen = iter(pages)
+    config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
+    config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
+    config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
+
if ids:
checks_by_ids(ids)
if not gen and not ids and not text:
# syntax error, show help text from the top of this file
wikipedia.output(__doc__, 'utf-8')
-
+
if text:
output = query(lines = text.splitlines())
if output:
@@ -1017,7 +1024,7 @@
sys.exit()
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
- preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
+ preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)
bot = CheckRobot(preloadingGen)
bot.run()
Modified: trunk/pywikipedia/copyright_clean.py
===================================================================
--- trunk/pywikipedia/copyright_clean.py 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright_clean.py 2007-09-24 09:17:56 UTC (rev 4353)
@@ -27,7 +27,7 @@
#
# {{botbox|title|newid|oldid|author|...}}
-rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{botbox\|.*?\|(.*?)\|")
+rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")
def query_yurik_api(data):
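The widened template pattern accepts both {{botbox|...}} and the {{/box|...}} shorthand; a self-contained check (template arguments are made up):

    import re

    rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")

    print rev_templateC.findall("{{botbox|Some page|12345|11111|Author|}}")
    # ['12345']
    print rev_templateC.findall("{{/box|Some page|67890|22222|Author|}}")
    # ['67890']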
@@ -48,21 +48,8 @@
return data
-def manage_query(items, mode = "titles"):
-    """No more than 100 titles at a time using Yurik's API"""
-
-    global query_results
-
-    for s in mysplit(items, 100, "|"):
-        if mode == "titles":
-            query_results.append(simplejson.loads(query_yurik_api(('titles', s))))
-
-        elif mode == 'revids':
-            query_results2.append(simplejson.loads(query_yurik_api(('revids', s))))
-    return
-
def page_exist(title):
- for pageobjs in query_results:
+ for pageobjs in query_results_titles:
for key in pageobjs['pages']:
if pageobjs['pages'][key]['title'] == title:
if int(key) >= 0:
@@ -71,7 +58,7 @@
return False
def revid_exist(revid):
- for pageobjs in query_results2:
+ for pageobjs in query_results_revids:
for id in pageobjs['pages']:
for rv in range(len(pageobjs['pages'][id]['revisions'])):
            if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid):
@@ -85,7 +72,7 @@
for page in gen:
data = page.get()
- wikipedia.output(page.title())
+ wikipedia.output(page.aslink())
output = ''
#
@@ -104,11 +91,14 @@
titles = headC.findall(data)
revids = rev_templateC.findall(data)
- query_results = list()
- query_results2 = list()
+ query_results_titles = list()
+ query_results_revids = list()
-    manage_query(query.ListToParam(titles))
-    manage_query(query.ListToParam(revids), "revids")
+    # No more than 100 titles at a time using Yurik's API
+    for s in mysplit(query.ListToParam(titles), 100, "|"):
+        query_results_titles.append(simplejson.loads(query_yurik_api(('titles', s))))
+    for s in mysplit(query.ListToParam(revids), 100, "|"):
+        query_results_revids.append(simplejson.loads(query_yurik_api(('revids', s))))
comment_entry = list()
add_separator = False
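The inlined batching relies on the existing mysplit helper; a plausible sketch of its contract (the real implementation lives elsewhere in the package and may differ):

    def mysplit(text, size, sep):
        # Split a sep-joined string into chunks of at most `size` items,
        # so no single API request carries more than 100 titles or revids.
        parts = text.split(sep)
        return [sep.join(parts[i:i + size]) for i in range(0, len(parts), size)]

    print mysplit("a|b|c|d|e", 2, "|")
    # ['a|b', 'c|d', 'e']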
@@ -131,7 +121,7 @@
exist = True
if page_exist(title):
# check {{botbox}}
-            revid = re.search("{{botbox\|.*?\|(.*?)\|", data[head.end():stop])
+            revid = re.search("{{(?:/box|botbox)\|.*?\|(.*?)\|", data[head.end():stop])
if revid:
if not revid_exist(revid.group(1)):
exist = False
Modified: trunk/pywikipedia/copyright_put.py
===================================================================
--- trunk/pywikipedia/copyright_put.py 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright_put.py 2007-09-24 09:17:56 UTC (rev 4353)
@@ -38,8 +38,8 @@
}
stat_msg = {
-    'en': [u'Statistics', u'Page', u'Entries', u'Total', 'Update'],
-    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Totale', u'Ultimo aggiornamento'],
+    'en': [u'Statistics', u'Page', u'Entries', u'Size', u'Total', 'Update'],
+    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Lunghezza', u'Totale', u'Ultimo aggiornamento'],
}
wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path)
@@ -87,10 +87,11 @@
! %s
! %s
! %s
+! %s
|-
-""" % ( msg[1], msg[2], 'Google', 'Yahoo', 'Live
Search' )
+""" % ( msg[1], msg[2], msg[3], 'Google', 'Yahoo',
'Live Search' )
- gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0
+ gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0 ; sn = 0 ; snt = 0
for page in gen:
data = page.get()
@@ -100,18 +101,19 @@
mn = stat_sum('(msn|live)', data)
en = len(re.findall('=== \[\[', data))
+ sn = len(data)
- gnt += gn ; ynt += yn ; mnt += mn ; ent += en
+ gnt += gn ; ynt += yn ; mnt += mn ; ent += en ; snt += sn
-        output += u"|%s||%s||%s||%s||%s\n|-\n" % (page.aslink(), en, gn, yn, mn)
+        output += u"|%s||%s||%s KB||%s||%s||%s\n|-\n" % (page.aslink(), en, sn / 1024, gn, yn, mn)
output += u"""| ||||||||
|-
-|'''%s'''||%s||%s||%s||%s
+|'''%s'''||%s||%s KB||%s||%s||%s
|-
-|colspan="5" align=right
style="background-color:#eeeeee;"|<small>''%s:
%s''</small>
+|colspan="6" align=right
style="background-color:#eeeeee;"|<small>''%s:
%s''</small>
|}
-""" % (msg[3], ent, gnt, ynt, mnt, msg[4], time.strftime("%d " +
"%s" % (date.monthName(wikipedia.getSite().language(), time.localtime()[1])) +
" %Y"))
+""" % (msg[4], ent, snt / 1024, gnt, ynt, mnt, msg[5],
time.strftime("%d " + "%s" %
(date.monthName(wikipedia.getSite().language(), time.localtime()[1])) + " %Y"))
return output
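A tiny sketch of the new size column: the page length in bytes is rendered as whole kilobytes via Python 2 integer division (row values are made up):

    sn = 150000   # hypothetical page length in bytes
    print u"|%s||%s||%s KB||%s||%s||%s" % (u'[[Example]]', 12, sn / 1024, 3, 4, 5)
    # |[[Example]]||12||146 KB||3||4||5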