Revision: 4353
Author: cosoleto
Date: 2007-09-24 09:17:56 +0000 (Mon, 24 Sep 2007)
Log Message:
-----------
copyright.py: Fixed a problem with the preload generator and the new pages generator (pointed out by
Filnik). Added a note to help users of Python 2.5.0 install SOAPpy 0.12.0.
copyright_put.py, copyright_clean.py: Minor fixes and code cleanup.
Modified Paths:
--------------
trunk/pywikipedia/CONTENTS
trunk/pywikipedia/copyright.py
trunk/pywikipedia/copyright_clean.py
trunk/pywikipedia/copyright_put.py
Modified: trunk/pywikipedia/CONTENTS
===================================================================
--- trunk/pywikipedia/CONTENTS 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/CONTENTS 2007-09-24 09:17:56 UTC (rev 4353)
@@ -85,7 +85,7 @@
clean_sandbox.py : This bot cleans the sandbox (test) page.
commons_link.py : This robot adds a commons template to link Commons and
your wiki project.
-copyright.py : This robot check copyright text in Google and Yahoo.
+copyright.py : This robot checks copyright text in Google, Yahoo! and Live Search.
cosmetic_changes.py : Can do slight modifications to a wiki page source code
such that the code looks cleaner.
delete.py : This script can be used to delete pages en masse.
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright.py 2007-09-24 09:17:56 UTC (rev 4353)
@@ -13,6 +13,10 @@
Windows Live Search requires installing the SOAPpy module from
http://pywebsvcs.sf.net and getting an AppID from
http://search.msn.com/developer.
+If you use Python 2.5 with SOAPpy 0.12.0, you must edit three files
+(SOAPpy/Client.py, SOAPpy/Server.py, SOAPpy/Types.py) to fix a simple
+syntax error: move the 'from __future__ import ...' line to the very
+beginning of each file.
You can run the bot with the following commandline parameters:
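For readers applying the SOAPpy fix by hand, a minimal sketch of the error and the repair (the affected import in SOAPpy 0.12.0 is assumed to be nested_scopes; the ident line is an illustrative placeholder):

    # Broken layout as shipped in SOAPpy/Client.py, Server.py and Types.py:
    # a statement precedes the __future__ import, so Python 2.5 raises
    # "SyntaxError: from __future__ imports must occur at the beginning
    # of the file".
    #
    #     ident = '$Id$'          # some statement comes first
    #     from __future__ import nested_scopes
    #
    # Fixed layout: the __future__ import is the first statement.
    from __future__ import nested_scopes
    ident = '$Id$'                # illustrative placeholder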
@@ -75,6 +79,9 @@
__version__='$Id$'
+#
+no_result_with_those_words = '-Wikipedia'
+
# Try to skip quoted text.
exclude_quote = True
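A quick sketch of how the new no_result_with_those_words constant is interpolated into the search queries further down (the sample sentence is made up):

    no_result_with_those_words = '-Wikipedia'
    query = u'some sentence taken from the page under test'

    # Google and Live Search take the exclusion term before the quoted text:
    print '%s "%s"' % (no_result_with_those_words, query)
    # -Wikipedia "some sentence taken from the page under test"

    # Yahoo! takes it after the quoted text:
    print '"%s" %s' % (query.encode('utf_8'), no_result_with_those_words)
    # "some sentence taken from the page under test" -Wikipedia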
@@ -413,7 +420,7 @@
text = re.sub('^[:*]?\s*[“][^”]+[”]\.?\s*((\(|<ref>).*?(\)|</ref>))?\.?$', "", text)
# remove URL
-    text = re.sub('https?://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text)
+    text = re.sub('(ftp|https?)://[\w/.,;:@&=%#\\\?_!~*\'|()\"+-]+', ' ', text)
# remove Image tags
text = reImageC.sub("", text)
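The widened URL pattern now also blanks ftp:// links; a self-contained check (character class slightly simplified from the one in the diff):

    import re

    url_re = re.compile(r'(ftp|https?)://[\w/.,;:@&=%#\?_!~*\'|()"+-]+')
    sample = u'See http://example.org/a and ftp://example.org/b here.'
    print url_re.sub(' ', sample)
    # See   and   here.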
@@ -761,7 +768,7 @@
search_request_retry = config.copyright_connection_tries
while search_request_retry:
try:
-                data = google.doGoogleSearch('-Wikipedia "' + query + '"')
+                data = google.doGoogleSearch('%s "%s"' % (no_result_with_those_words, query))
search_request_retry = 0
for entry in data.results:
add_in_urllist(url, entry.URL, 'google')
@@ -780,9 +787,10 @@
if config.copyright_yahoo:
import yahoo.search.web
print " Yahoo query..."
-        data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"' + query.encode('utf_8') + '" -Wikipedia', results=numresults)
+        data = yahoo.search.web.WebSearch(config.yahoo_appid, query='"%s" %s' % (
+            query.encode('utf_8'),
+            no_result_with_those_words
+        ), results = numresults)
search_request_retry = config.copyright_connection_tries
while search_request_retry:
try:
@@ -805,7 +813,8 @@
except:
print "Live Search Error"
raise
-            params = {'AppID': config.msn_appid, 'Query': '-Wikipedia "' + query + '"', 'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
+            params = {'AppID': config.msn_appid, 'Query': '%s "%s"' % (no_result_with_those_words, query),
+                      'CultureInfo': 'en-US', 'SafeSearch': 'Off', 'Requests': {
                          'SourceRequest':{'Source': 'Web', 'Offset': 0, 'Count': 10, 'ResultFields': 'All',}}}
search_request_retry = config.copyright_connection_tries
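For reference, the reworked Live Search payload as a standalone dict (the AppID value is a placeholder for config.msn_appid; the SOAP call itself happens elsewhere in the script):

    no_result_with_those_words = '-Wikipedia'
    query = u'some sentence taken from the page under test'
    params = {'AppID': 'YOUR-APPID-HERE',
              'Query': '%s "%s"' % (no_result_with_those_words, query),
              'CultureInfo': 'en-US', 'SafeSearch': 'Off',
              'Requests': {'SourceRequest': {'Source': 'Web', 'Offset': 0,
                                             'Count': 10, 'ResultFields': 'All'}}}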
@@ -919,9 +928,6 @@
PageTitles = []
# IDs which will be processed when the -ids parameter is used
ids = None
-    # will become True when the user presses a ('yes to all') or uses the -always
-    # commandline parameter.
-    acceptall = False
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -929,6 +935,10 @@
repeat = False
#
text = None
+    # Number of pages to load at a time with the preloading generator
+    pageNumber = 40
+    # Default number of pages for the NewPages generator
+    number = 60
firstPageTitle = None
# This factory is responsible for processing command line arguments
@@ -936,11 +946,6 @@
# to work on.
genFactory = pagegenerators.GeneratorFactory()
-
-    config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
-    config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
-    config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
-
# Read commandline parameters.
for arg in wikipedia.handleArgs():
if arg == '-y':
@@ -967,11 +972,6 @@
elif arg.startswith('-text'):
if len(arg) >= 6:
text = arg[6:]
- elif arg.startswith('-xml'):
- if len(arg) == 4:
-                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
- else:
- xmlFilename = arg[5:]
elif arg.startswith('-page'):
if len(arg) == 5:
            PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
@@ -988,9 +988,12 @@
repeat = True
elif arg.startswith('-new'):
if len(arg) >=5:
-                gen = pagegenerators.NewpagesPageGenerator(number=int(arg[5:]), repeat = repeat)
-            else:
-                gen = pagegenerators.NewpagesPageGenerator(number=60, repeat = repeat)
+                number = int(arg[5:])
+            gen = pagegenerators.NewpagesPageGenerator(number = number, repeat = repeat)
+            # The preloading generator works better if 'pageNumber' is not
+            # greater than 'number'; this avoids unnecessary delay.
+            if number < pageNumber:
+                pageNumber = number
else:
generator = genFactory.handleArg(arg)
if generator:
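A minimal sketch of the pageNumber/number coupling introduced above (values are illustrative; the generator calls are commented out because they need a live site):

    number = 25        # pages requested with -new:25
    pageNumber = 40    # default preload batch size

    # Never preload more pages than the NewPages generator will yield;
    # per the comment in the diff, this avoids unnecessary delay.
    if number < pageNumber:
        pageNumber = number

    # gen = pagegenerators.NewpagesPageGenerator(number = number, repeat = repeat)
    # preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)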
@@ -1000,13 +1003,17 @@
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
gen = iter(pages)
+    config.copyright_yahoo = check_config(config.copyright_yahoo, config.yahoo_appid, "Yahoo AppID")
+    config.copyright_google = check_config(config.copyright_google, config.google_key, "Google Web API license key")
+    config.copyright_msn = check_config(config.copyright_msn, config.msn_appid, "Live Search AppID")
+
if ids:
checks_by_ids(ids)
if not gen and not ids and not text:
# syntax error, show help text from the top of this file
wikipedia.output(__doc__, 'utf-8')
-
+
if text:
output = query(lines = text.splitlines())
if output:
@@ -1017,7 +1024,7 @@
sys.exit()
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
- preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
+ preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = pageNumber)
bot = CheckRobot(preloadingGen)
bot.run()
Modified: trunk/pywikipedia/copyright_clean.py
===================================================================
--- trunk/pywikipedia/copyright_clean.py 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright_clean.py 2007-09-24 09:17:56 UTC (rev 4353)
@@ -27,7 +27,7 @@
#
# {{botbox|title|newid|oldid|author|...}}
-rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{botbox\|.*?\|(.*?)\|")
+rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")
def query_yurik_api(data):
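The widened template pattern accepts both {{botbox|...}} and the {{/box|...}} shorthand; a self-contained check (template arguments are made up):

    import re

    rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{(?:/box|botbox)\|.*?\|(.*?)\|")

    print rev_templateC.findall("{{botbox|Some page|12345|11111|Author|}}")
    # ['12345']
    print rev_templateC.findall("{{/box|Some page|67890|22222|Author|}}")
    # ['67890']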
@@ -48,21 +48,8 @@
return data
-def manage_query(items, mode = "titles"):
-    """No more than 100 titles at a time using Yurik's API"""
-
-    global query_results
-
-    for s in mysplit(items, 100, "|"):
-        if mode == "titles":
-            query_results.append(simplejson.loads(query_yurik_api(('titles', s))))
-
-        elif mode == 'revids':
-            query_results2.append(simplejson.loads(query_yurik_api(('revids', s))))
-    return
-
def page_exist(title):
- for pageobjs in query_results:
+ for pageobjs in query_results_titles:
for key in pageobjs['pages']:
if pageobjs['pages'][key]['title'] == title:
if int(key) >= 0:
@@ -71,7 +58,7 @@
return False
def revid_exist(revid):
- for pageobjs in query_results2:
+ for pageobjs in query_results_revids:
for id in pageobjs['pages']:
for rv in range(len(pageobjs['pages'][id]['revisions'])):
            if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid):
@@ -85,7 +72,7 @@
for page in gen:
data = page.get()
- wikipedia.output(page.title())
+ wikipedia.output(page.aslink())
output = ''
#
@@ -104,11 +91,14 @@
titles = headC.findall(data)
revids = rev_templateC.findall(data)
- query_results = list()
- query_results2 = list()
+ query_results_titles = list()
+ query_results_revids = list()
-    manage_query(query.ListToParam(titles))
-    manage_query(query.ListToParam(revids), "revids")
+    # No more than 100 titles at a time using Yurik's API
+    for s in mysplit(query.ListToParam(titles), 100, "|"):
+        query_results_titles.append(simplejson.loads(query_yurik_api(('titles', s))))
+    for s in mysplit(query.ListToParam(revids), 100, "|"):
+        query_results_revids.append(simplejson.loads(query_yurik_api(('revids', s))))
comment_entry = list()
add_separator = False
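The inlined batching relies on the existing mysplit helper; a plausible sketch of its contract (the real implementation lives elsewhere in the package and may differ):

    def mysplit(text, size, sep):
        # Split a sep-joined string into chunks of at most `size` items,
        # so no single API request carries more than 100 titles or revids.
        parts = text.split(sep)
        return [sep.join(parts[i:i + size]) for i in range(0, len(parts), size)]

    print mysplit("a|b|c|d|e", 2, "|")
    # ['a|b', 'c|d', 'e']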
@@ -131,7 +121,7 @@
exist = True
if page_exist(title):
# check {{botbox}}
-            revid = re.search("{{botbox\|.*?\|(.*?)\|", data[head.end():stop])
+            revid = re.search("{{(?:/box|botbox)\|.*?\|(.*?)\|", data[head.end():stop])
if revid:
if not revid_exist(revid.group(1)):
exist = False
Modified: trunk/pywikipedia/copyright_put.py
===================================================================
--- trunk/pywikipedia/copyright_put.py 2007-09-24 09:07:53 UTC (rev 4352)
+++ trunk/pywikipedia/copyright_put.py 2007-09-24 09:17:56 UTC (rev 4353)
@@ -38,8 +38,8 @@
}
stat_msg = {
-    'en': [u'Statistics', u'Page', u'Entries', u'Total', 'Update'],
-    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Totale', u'Ultimo aggiornamento'],
+    'en': [u'Statistics', u'Page', u'Entries', u'Size', u'Total', 'Update'],
+    'it': [u'Statistiche', u'Pagina', u'Segnalazioni', u'Lunghezza', u'Totale', u'Ultimo aggiornamento'],
}
wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path)
@@ -87,10 +87,11 @@
! %s
! %s
! %s
+! %s
|-
-""" % ( msg[1], msg[2], 'Google', 'Yahoo', 'Live
Search' )
+""" % ( msg[1], msg[2], msg[3], 'Google', 'Yahoo',
'Live Search' )
- gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0
+ gnt = 0 ; ynt = 0 ; mnt = 0 ; ent = 0 ; sn = 0 ; snt = 0
for page in gen:
data = page.get()
@@ -100,18 +101,19 @@
mn = stat_sum('(msn|live)', data)
en = len(re.findall('=== \[\[', data))
+ sn = len(data)
- gnt += gn ; ynt += yn ; mnt += mn ; ent += en
+ gnt += gn ; ynt += yn ; mnt += mn ; ent += en ; snt += sn
-        output += u"|%s||%s||%s||%s||%s\n|-\n" % (page.aslink(), en, gn, yn, mn)
+        output += u"|%s||%s||%s KB||%s||%s||%s\n|-\n" % (page.aslink(), en, sn / 1024, gn, yn, mn)
output += u"""| ||||||||
|-
-|'''%s'''||%s||%s||%s||%s
+|'''%s'''||%s||%s KB||%s||%s||%s
|-
-|colspan="5" align=right
style="background-color:#eeeeee;"|<small>''%s:
%s''</small>
+|colspan="6" align=right
style="background-color:#eeeeee;"|<small>''%s:
%s''</small>
|}
-""" % (msg[3], ent, gnt, ynt, mnt, msg[4], time.strftime("%d " +
"%s" % (date.monthName(wikipedia.getSite().language(), time.localtime()[1])) +
" %Y"))
+""" % (msg[4], ent, snt / 1024, gnt, ynt, mnt, msg[5],
time.strftime("%d " + "%s" %
(date.monthName(wikipedia.getSite().language(), time.localtime()[1])) + " %Y"))
return output
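A tiny sketch of the new size column: the page length in bytes is rendered as whole kilobytes via Python 2 integer division (row values are made up):

    sn = 150000   # hypothetical page length in bytes
    print u"|%s||%s||%s KB||%s||%s||%s" % (u'[[Example]]', 12, sn / 1024, 3, 4, 5)
    # |[[Example]]||12||146 KB||3||4||5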