Revision: 4290
Author: cosoleto
Date: 2007-09-14 17:18:46 +0000 (Fri, 14 Sep 2007)
Log Message:
Added scripts to put the reports of the copyright script on wiki pages and to remove them again. Partly it's old code, but it's working code. The cleanup script uses Yurik's API.
Modified Paths:
Added Paths:
Modified: trunk/pywikipedia/
--- trunk/pywikipedia/ 2007-09-14 16:34:17 UTC (rev 4289)
+++ trunk/pywikipedia/ 2007-09-14 17:18:46 UTC (rev 4290)
@@ -106,6 +106,10 @@
#('sv', 'Wikipedia:Spegelsidor', 'Spegelsidor.txt'),
+reports_cat = {
+ 'it': u'Segnalazioni automatiche sospetti problemi di copyright',
wikipedia_names = {
'--': u'Wikipedia',
'am': u'ዊኪፔድያ',
@@ -247,12 +251,12 @@
for page, path in exclusion_file_list():
if not os.path.exists(path):
- print 'Creating file \'%s\' ([[%s]])' % (path, page.title())
+ print 'Creating file \'%s\' (%s)' % (path, page.aslink())
force_update = True
file_age = time.time() - os.path.getmtime(path)
if file_age > 24 * 60 * 60:
- print 'Updating file \'%s\' ([[%s]])' % (path, page.title())
+ print 'Updating file \'%s\' (%s)' % (path, page.aslink())
force_update = True
except OSError:
Added: trunk/pywikipedia/
--- trunk/pywikipedia/ (rev 0)
+++ trunk/pywikipedia/ 2007-09-14 17:18:46 UTC (rev 4290)
@@ -0,0 +1,169 @@
+# -*- coding: utf-8 -*-
+# (C) Francesco Cosoleto, 2006
+# Distributed under the terms of the MIT license.
+import httplib, socket, simplejson, re, time
+import config, wikipedia, catlib, pagegenerators, query
+from urllib import urlencode
+from copyright import mysplit, put, reports_cat
+import sys
+summary_msg = {
+ 'en': u'Removing',
+ 'it': u'Rimozione',
+headC = re.compile("(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?\[\[(?::)?(.*?)\]\]")
+separatorC = re.compile('(?m)^== +')
+next_headC = re.compile("(?m)^=+.*?=+")
+# {{/box|title|newid|oldid|author|...}}
+rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{/box\|.*?\|(.*?)\|")
+def query_yurik_api(data):
+ predata = [
+ ('format', 'json'),
+ ('what', 'revisions'),
+ ('rvlimit', '1'),
+ data]
+ data = urlencode(predata)
+ host = wikipedia.getSite().hostname()
+ address = wikipedia.getSite().query_address()
+ conn = httplib.HTTPConnection(host)
+ conn.request("GET", address + data)
+ response = conn.getresponse()
+ data =
+ conn.close()
+ return data
+def manage_query(items, mode = "titles"):
+ """No more of 100 titles at a time using Yurik's API"""
+ global query_results
+ for s in mysplit(items, 100, "|"):
+ if mode == "titles":
+ query_results.append(simplejson.loads(query_yurik_api(('titles', s))))
+ elif mode == 'revids':
+ query_results2.append(simplejson.loads(query_yurik_api(('revids', s))))
+ return
+def page_exist(title):
+ for pageobjs in query_results:
+ for key in pageobjs['pages']:
+ if pageobjs['pages'][key]['title'] == title:
+ if int(key) >= 0:
+ return True
+ wikipedia.output('* ' + title)
+ return False
+def revid_exist(revid):
+ for pageobjs in query_results2:
+ for id in pageobjs['pages']:
+ for rv in range(len(pageobjs['pages'][id]['revisions'])):
+ if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid):
+ # print rv
+ return True
+ wikipedia.output('* ' + revid)
+ return False
+cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % wikipedia.translate(wikipedia.getSite(), reports_cat))
+gen = pagegenerators.CategorizedPageGenerator(cat, recurse = True)
+for page in gen:
+ data = page.get()
+ wikipedia.output(page.title())
+ output = ''
+ #
+ # Preserve text before of the sections
+ #
+ m ="(?m)^==\s*[^=]*?\s*==", data)
+ if m:
+ output = data[:m.end() + 1]
+ else:
+ m ="(?m)^===\s*[^=]*?", data)
+ if not m:
+ continue
+ output = data[:m.start()]
+ titles = headC.findall(data)
+ revids = rev_templateC.findall(data)
+ query_results = list()
+ query_results2 = list()
+ manage_query(query.ListToParam(titles))
+ manage_query(query.ListToParam(revids), "revids")
+ comment_entry = list()
+ add_separator = False
+ index = 0
+ while True:
+ head =, index)
+ if not head:
+ break
+ index = head.end()
+ title =
+ next_head =, index)
+ if next_head:
+ if[next_head.start():next_head.end()]):
+ add_separator = True
+ stop = next_head.start()
+ else:
+ stop = len(data)
+ exist = True
+ if page_exist(title):
+ # check {{botbox}}
+ revid ="{{botbox\|.*?\|(.*?)\|", data[head.end():stop])
+ if revid:
+ if not revid_exist(
+ exist = False
+ else:
+ exist = False
+ if exist:
+ output += "=== [[" + title + "]]" + data[head.end():stop]
+ else:
+ comment_entry.append("[[%s]]" % title)
+ if add_separator:
+ output += data[next_head.start():next_head.end()] + '\n'
+ add_separator = False
+ add_comment = u'%s: %s' % (wikipedia.translate(wikipedia.getSite(), summary_msg),", ".join(comment_entry))
+ # remove useless newlines
+ output = re.sub("(?m)^\n", "", output)
+ if comment_entry:
+ wikipedia.output(add_comment)
+ wikipedia.showDiff(output, page.get())
+ if len(sys.argv)!=1:
+ choice = wikipedia.inputChoice(u'Do you want to clean the page?', ['Yes', 'No'], ['y', 'n'], 'n')
+ if choice in ['n', 'N']:
+ continue
+ try:
+ put(page, output, add_comment)
+ except wikipedia.PageNotSaved:
+ raise
Added: trunk/pywikipedia/
--- trunk/pywikipedia/ (rev 0)
+++ trunk/pywikipedia/ 2007-09-14 17:18:46 UTC (rev 4290)
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+# (C) Francesco Cosoleto, 2006
+# Distributed under the terms of the MIT license.
+import sys, re, codecs, os, time, shutil
+import wikipedia, config
+from copyright import put, join_family_data, appdir, reports_cat
+# Month + Year save method
+append_date_to_wiki_save_path = True
+# Add pubblication date to entries (template:botdate)
+append_date_to_entries = False
+msg_table = {
+ 'it': {'_default': [u'Pagine nuove', u'Nuove voci'],
+ 'feed': [u'Aggiunte a voci esistenti', u'Testo aggiunto in']},
+ 'en': {'_default': [u'New entries', u'New entries']}
+wiki_save_path = {
+ '_default': u'User:%s/Report' % config.usernames[wikipedia.getSite()][wikipedia.getSite().lang],
+ 'it': u'Utente:RevertBot/Report'
+template_cat = {
+ '_default': [u'This template is used by, a script part of [[:m:Using the python wikipediabot|PyWikipediaBot]].', u''],
+ 'it': [u'Questo template è usato dallo script del [[:m:Using the python wikipediabot|PyWikipediaBot]].', u'Template usati da bot'],
+wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path)
+template_cat = wikipedia.translate(wikipedia.getSite(), template_cat)
+separatorC = re.compile('(?m)^== +')
+def set_template():
+ site = wikipedia.getSite()
+ url = "%s://%s%s" % (site.protocol, site.hostname(), site.path()
+ botdate = u"""
+<div style="text-align:right">{{{1}}}</div><noinclude>%s\n[[%s:%s]]</noinclude>
+""" % (template_cat[0], site.namespace(14), template_cat[1])
+ botbox = """
+<div class=plainlinks style="text-align:right">[%s?title={{{1}}}&diff={{{2}}}&oldid={{{3}}} diff] - [%s?title={{{1}}}&action=history cron] - [%s?title=Special:Log&page={{{1}}} log]</div>
+""" % url, url, url)
+ if append_date_to_entries:
+ p = wikipedia.Page(site, 'Template:botdate')
+ if not p.exists()
+ p.put(botdate)
+def output_files_gen():
+ for f in os.listdir(appdir):
+ if 'output' in f and not '_pending' in f:
+ m ='output_(.*?)\.txt', f)
+ if m:
+ tag =
+ else:
+ tag = '_default'
+ section_name_and_summary = wikipedia.translate(wikipedia.getSite(), msg_table)[tag]
+ section = section_name_and_summary[0]
+ summary = section_name_and_summary[1]
+ yield appdir + f, section, summary
+def read_output_file(filename):
+ if os.path.isfile(filename + '_pending'):
+ shutil.move(filename, filename + '_temp')
+ ap = + '_pending', 'a', 'utf-8')
+ ot = + '_temp', 'r', 'utf-8')
+ ap.write(
+ ap.close()
+ ot.close()
+ os.remove(filename + '_temp')
+ else:
+ shutil.move(filename, filename + '_pending')
+ f = + '_pending', 'r', 'utf-8')
+ data =
+ f.close()
+ return data
+if append_date_to_wiki_save_path:
+ import date
+ wiki_save_path += '_' + date.formats['MonthName'][wikipedia.getSite().language()](time.localtime()[1]) + '_' + str(time.localtime()[0])
+page = wikipedia.Page(wikipedia.getSite(),wiki_save_path)
+ wikitext = page.get()
+except wikipedia.NoPage:
+ wikipedia.output("%s not found." % page.aslink())
+ wikitext = '[[%s:%s]]\n' % (wikipedia.getSite().namespace(14), wikipedia.translate(wikipedia.getSite(), reports_cat))
+final_summary = u''
+output_files = list()
+for f, section, summary in output_files_gen():
+ wikipedia.output('File: \'%s\'\nSection: %s\n' % (f, section))
+ output_data = read_output_file(f)
+ output_files.append(f)
+ entries = re.findall('=== (.*?) ===', output_data)
+ if not entries:
+ continue
+ if append_date_to_entries:
+ dt = time.strftime('%d-%m-%Y %H:%M', time.localtime())
+ output_data = re.sub("(?m)^(=== \[\[.*?\]\] ===\n)", r"\1{{botdate|%s}}\n" % dt, output_data)
+ m ='(?m)^==\s*%s\s*==' % section, wikitext)
+ if m:
+ m_end =, wikitext[m.end():])
+ if m_end:
+ wikitext = wikitext[:m_end.start() + m.end()] + output_data + wikitext[m_end.start() + m.end():]
+ else:
+ wikitext += '\n' + output_data
+ else:
+ wikitext += '\n' + output_data
+ if final_summary:
+ final_summary += ' '
+ final_summary += u'%s: %s' % (summary, ', '.join(entries))
+if final_summary:
+ wikipedia.output(final_summary + '\n')
+ # if a page in 'Image' or 'Category' namespace is checked then fix
+ # title section by adding ':' in order to avoid wiki code effects.
+ wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Image', 6), ur'== [[:\1:', wikitext)
+ wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Category', 14), ur'== [[:\1:', wikitext)
+ # TODO:
+ # List of frequent rejected address to improve upload process.
+ wikitext = re.sub('http://(.*?)((forumcommunity|forumfree).net)',r'<blacklist>\1\2', wikitext)
+ if len(final_summary)>=200:
+ final_summary = final_summary[:200]
+ final_summary = final_summary[:add_comment.rindex("[")-3] + "..."
+ try:
+ put(wikitext, comment = final_summary)
+ for f in output_files:
+ os.remove(f + '_pending')
+ wikipedia.output("\'%s\' deleted." % f)
+ except wikipedia.PageNotSaved:
+ raise