Revision: 4353 Author: cosoleto Date: 2007-09-24 09:17:56 +0000 (Mon, 24 Sep 2007)
Log Message: ----------- copyright.py: Fixed problem with preload generator and new pages generator (pointed out by Filnik). Added note to help users of Python 2.5.0 to install SOAPpy 0.12.0. copyright_put.py, copyright_clean.py: Minor fixes, and code cleanup.
Modified Paths: -------------- trunk/pywikipedia/CONTENTS trunk/pywikipedia/copyright.py trunk/pywikipedia/copyright_clean.py trunk/pywikipedia/copyright_put.py
Modified: trunk/pywikipedia/CONTENTS =================================================================== --- trunk/pywikipedia/CONTENTS 2007-09-24 09:07:53 UTC (rev 4352) +++ trunk/pywikipedia/CONTENTS 2007-09-24 09:17:56 UTC (rev 4353) @@ -85,7 +85,7 @@ clean_sandbox.py : This bot makes the cleaned of the page of tests. commons_link.py : This robot include commons template to linking Commons and your wiki project. -copyright.py : This robot check copyright text in Google and Yahoo. +copyright.py : This robot check copyright text in Google, Yahoo! and Live Search. cosmetic_changes.py : Can do slight modifications to a wiki page source code such that the code looks cleaner. delete.py : This script can be used to delete pages en masse.
Modified: trunk/pywikipedia/copyright.py =================================================================== --- trunk/pywikipedia/copyright.py 2007-09-24 09:07:53 UTC (rev 4352) +++ trunk/pywikipedia/copyright.py 2007-09-24 09:17:56 UTC (rev 4353) @@ -13,6 +13,10 @@
Windows Live Search requires to install the SOAPpy module from http://pywebsvcs.sf.net and get an AppID from http://search.msn.com/developer. +If you use Python 2.5 and have SOAPpy version 0.12.0, you must edit three +files (SOAPpy/Client.py, SOAPpy/Server.py, SOAPpy/Types.py) to fix a simple +syntax error by moving 'from __future__ imports...' line to beginning of the +code.
You can run the bot with the following commandline parameters:
@@ -75,6 +79,9 @@
__version__='$Id$'
+# +no_result_with_those_words = '-Wikipedia' + # Try to skip quoted text. exclude_quote = True
@@ -413,7 +420,7 @@ text = re.sub('^[:*]?\s*[“][^”]+[”].?\s*(((|<ref>).*?()|</ref>))?.?$', "", text)
# remove URL - text = re.sub('https?://[\w/.,;:@&=%#\?_!~*'|()"+-]+', ' ', text) + text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\?_!~*'|()"+-]+', ' ', text)
# remove Image tags text = reImageC.sub("", text) @@ -761,7 +768,7 @@ search_request_retry = config.copyright_connection_tries while search_request_retry: try: - data = google.doGoogleSearch('-Wikipedia "' + query + '"') + data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query) search_request_retry = 0 for entry in data.results: add_in_urllist(url, entry.URL, 'google') @@ -780,9 +787,10 @@ if config.copyright_yahoo: import yahoo.search.web print " Yahoo query..." - data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"' + - query.encode('utf_8') + - '" -Wikipedia', results=numresults) + data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % ( + query.encode('utf_8'), + no_result_with_those_words + ), results = numresults) search_request_retry = config.copyright_connection_tries while search_request_retry: try: @@ -805,7 +813,8 @@ except: print "Live Search Error" raise - params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': { + params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query), + 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': { 'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
search_request_retry = config.copyright_connection_tries @@ -919,9 +928,6 @@ PageTitles = [] # IDs which will be processed when the -ids parameter is used ids = None - # will become True when the user presses a ('yes to all') or uses the -always - # commandline paramater. - acceptall = False # Which namespaces should be processed? # default to [] which means all namespaces will be processed namespaces = [] @@ -929,6 +935,10 @@ repeat = False # text = None + # Number of pages to load at a time by Preload generator + pageNumber = 40 + # Default number of pages for NewPages generator + number = 60
firstPageTitle = None # This factory is responsible for processing command line arguments @@ -936,11 +946,6 @@ # to work on. genFactory = pagegenerators.GeneratorFactory()
- - config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID") - config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key") - config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID") - # Read commandline parameters. for arg in wikipedia.handleArgs(): if arg == '-y': @@ -967,11 +972,6 @@ elif arg.startswith('-text'): if len(arg) >= 6: text = arg[6:] - elif arg.startswith('-xml'): - if len(arg) == 4: - xmlFilename = wikipedia.input(u'Please enter the XML dump's filename:') - else: - xmlFilename = arg[5:] elif arg.startswith('-page'): if len(arg) == 5: PageTitles.append(wikipedia.input(u'Which page do you want to change?')) @@ -988,9 +988,12 @@ repeat = True elif arg.startswith('-new'): if len(arg) >=5: - gen = pagegenerators.NewpagesPageGenerator(number=int(arg[5:]), repeat = repeat) - else: - gen = pagegenerators.NewpagesPageGenerator(number=60, repeat = repeat) + number = int(arg[5:]) + gen = pagegenerators.NewpagesPageGenerator(number = number, repeat = repeat) + # Preload generator work better if 'pageNumber' is not major than 'number', + # this avoid unnecessary delay. + if number < pageNumber: + pageNumber = number else: generator = genFactory.handleArg(arg) if generator: @@ -1000,13 +1003,17 @@ pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles] gen = iter(pages)
+ config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID") + config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key") + config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID") + if ids: checks_by_ids(ids)
if not gen and not ids and not text: # syntax error, show help text from the top of this file wikipedia.output(__doc__, 'utf-8') - + if text: output = query(lines = text.splitlines()) if output: @@ -1017,7 +1024,7 @@ sys.exit() if namespaces != []: gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces) - preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20) + preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber) bot = CheckRobot(preloadingGen) bot.run()
Modified: trunk/pywikipedia/copyright_clean.py =================================================================== --- trunk/pywikipedia/copyright_clean.py 2007-09-24 09:07:53 UTC (rev 4352) +++ trunk/pywikipedia/copyright_clean.py 2007-09-24 09:17:56 UTC (rev 4353) @@ -27,7 +27,7 @@
# # {{botbox|title|newid|oldid|author|...}} -rev_templateC = re.compile("(?m)^(?:{{/t|.*?}}\n?)?{{botbox|.*?|(.*?)|") +rev_templateC = re.compile("(?m)^(?:{{/t|.*?}}\n?)?{{(?:/box|botbox)|.*?|(.*?)|")
def query_yurik_api(data):
@@ -48,21 +48,8 @@
return data
-def manage_query(items, mode = "titles"): - """No more of 100 titles at a time using Yurik's API""" - - global query_results - - for s in mysplit(items, 100, "|"): - if mode == "titles": - query_results.append(simplejson.loads(query_yurik_api(('titles', s)))) - - elif mode == 'revids': - query_results2.append(simplejson.loads(query_yurik_api(('revids', s)))) - return - def page_exist(title): - for pageobjs in query_results: + for pageobjs in query_results_titles: for key in pageobjs['pages']: if pageobjs['pages'][key]['title'] == title: if int(key) >= 0: @@ -71,7 +58,7 @@ return False
def revid_exist(revid): - for pageobjs in query_results2: + for pageobjs in query_results_revids: for id in pageobjs['pages']: for rv in range(len(pageobjs['pages'][id]['revisions'])): if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid): @@ -85,7 +72,7 @@
for page in gen: data = page.get() - wikipedia.output(page.title()) + wikipedia.output(page.aslink()) output = ''
# @@ -104,11 +91,14 @@ titles = headC.findall(data) revids = rev_templateC.findall(data)
- query_results = list() - query_results2 = list() + query_results_titles = list() + query_results_revids = list()
- manage_query(query.ListToParam(titles)) - manage_query(query.ListToParam(revids), "revids") + # No more of 100 titles at a time using Yurik's API + for s in mysplit(query.ListToParam(titles), 100, "|"): + query_results_titles.append(simplejson.loads(query_yurik_api(('titles', s)))) + for s in mysplit(query.ListToParam(revids), 100, "|"): + query_results_revids.append(simplejson.loads(query_yurik_api(('revids', s))))
comment_entry = list() add_separator = False @@ -131,7 +121,7 @@ exist = True if page_exist(title): # check {{botbox}} - revid = re.search("{{botbox|.*?|(.*?)|", data[head.end():stop]) + revid = re.search("{{(?:/box|botbox)|.*?|(.*?)|", data[head.end():stop]) if revid: if not revid_exist(revid.group(1)): exist = False
Modified: trunk/pywikipedia/copyright_put.py =================================================================== --- trunk/pywikipedia/copyright_put.py 2007-09-24 09:07:53 UTC (rev 4352) +++ trunk/pywikipedia/copyright_put.py 2007-09-24 09:17:56 UTC (rev 4353) @@ -38,8 +38,8 @@ }
stat_msg = { - 'en': [u'Statistics', u'Page', u'Entries', u'Total', 'Update'], - 'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Totale', u'Ultimo aggiornamento'], + 'en': [u'Statistics', u'Page', u'Entries', u'Size', u'Total', 'Update'], + 'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Lunghezza', u'Totale', u'Ultimo aggiornamento'], }
wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path) @@ -87,10 +87,11 @@ ! %s ! %s ! %s +! %s |- -""" % ( msg[1], msg[2], 'Google', 'Yahoo', 'Live Search' ) +""" % ( msg[1], msg[2], msg[3], 'Google', 'Yahoo', 'Live Search' )
- gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0 + gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0 ; sn = 0 ; snt = 0
for page in gen: data = page.get() @@ -100,18 +101,19 @@ mn = stat_sum('(msn|live)', data)
en = len(re.findall('=== [[', data)) + sn = len(data)
- gnt += gn ; ynt += yn ; mnt += mn ; ent += en + gnt += gn ; ynt += yn ; mnt += mn ; ent += en ; snt += sn
- output += u"|%s||%s||%s||%s||%s\n|-\n" % (page.aslink(), en, gn, yn, mn) + output += u"|%s||%s||%s KB||%s||%s||%s\n|-\n" % (page.aslink(), en, sn / 1024, gn, yn, mn)
output += u"""| |||||||| |- -|'''%s'''||%s||%s||%s||%s +|'''%s'''||%s||%s KB||%s||%s||%s |- -|colspan="5" align=right style="background-color:#eeeeee;"|<small>''%s: %s''</small> +|colspan="6" align=right style="background-color:#eeeeee;"|<small>''%s: %s''</small> |} -""" % (msg[3], ent, gnt, ynt, mnt, msg[4], time.strftime("%d " + "%s" % (date.monthName(wikipedia.getSite().language(), time.localtime()[1])) + " %Y")) +""" % (msg[4], ent, snt / 1024, gnt, ynt, mnt, msg[5], time.strftime("%d " + "%s" % (date.monthName(wikipedia.getSite().language(), time.localtime()[1])) + " %Y"))
return output
pywikipedia-l@lists.wikimedia.org