Revision: 4290
Author: cosoleto
Date: 2007-09-14 17:18:46 +0000 (Fri, 14 Sep 2007)
Log Message:
-----------
Added scripts to put copyright.py reports on wiki pages and to remove entries from them. Partly it's old code, but it's working code. copyright_clean.py uses the Yurik API.
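
A minimal sketch of the kind of query.php request copyright_clean.py builds in query_yurik_api(), assuming the usual wikipedia.getSite() site object; the title 'Example' is only a placeholder:

# Sketch only: the same Yurik API (query.php) parameters copyright_clean.py sends.
from urllib import urlencode
import wikipedia

site = wikipedia.getSite()
params = urlencode([('format', 'json'), ('what', 'revisions'),
                    ('rvlimit', '1'), ('titles', u'Example')])
# site.query_address() gives the query.php path, ready for the encoded parameters.
url = 'http://%s%s%s' % (site.hostname(), site.query_address(), params)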
Modified Paths:
--------------
trunk/pywikipedia/copyright.py
Added Paths:
-----------
trunk/pywikipedia/copyright_clean.py
trunk/pywikipedia/copyright_put.py
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py 2007-09-14 16:34:17 UTC (rev 4289)
+++ trunk/pywikipedia/copyright.py 2007-09-14 17:18:46 UTC (rev 4290)
@@ -106,6 +106,10 @@
#('sv', 'Wikipedia:Spegelsidor', 'Spegelsidor.txt'),
]
+reports_cat = {
+ 'it': u'Segnalazioni automatiche sospetti problemi di copyright',
+}
+
wikipedia_names = {
'--': u'Wikipedia',
'am': u'ዊኪፔድያ',
@@ -247,12 +251,12 @@
for page, path in exclusion_file_list():
try:
if not os.path.exists(path):
- print 'Creating file \'%s\' ([[%s]])' % (path, page.title())
+ print 'Creating file \'%s\' (%s)' % (path, page.aslink())
force_update = True
else:
file_age = time.time() - os.path.getmtime(path)
if file_age > 24 * 60 * 60:
- print 'Updating file \'%s\' ([[%s]])' % (path, page.title())
+ print 'Updating file \'%s\' (%s)' % (path, page.aslink())
force_update = True
except OSError:
raise
Added: trunk/pywikipedia/copyright_clean.py
===================================================================
--- trunk/pywikipedia/copyright_clean.py (rev 0)
+++ trunk/pywikipedia/copyright_clean.py 2007-09-14 17:18:46 UTC (rev 4290)
@@ -0,0 +1,169 @@
+# -*- coding: utf-8 -*-
+"""
+"""
+
+#
+# (C) Francesco Cosoleto, 2006
+#
+# Distributed under the terms of the MIT license.
+#
+
+import httplib, socket, simplejson, re, time
+import config, wikipedia, catlib, pagegenerators, query
+
+from urllib import urlencode
+from copyright import mysplit, put, reports_cat
+
+import sys
+
+summary_msg = {
+ 'en': u'Removing',
+ 'it': u'Rimozione',
+}
+
+headC = re.compile("(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?\[\[(?::)?(.*?)\]\]")
+separatorC = re.compile('(?m)^== +')
+next_headC = re.compile("(?m)^=+.*?=+")
+
+#
+# {{/box|title|newid|oldid|author|...}}
+rev_templateC = re.compile("(?m)^(?:{{/t\|.*?}}\n?)?{{/box\|.*?\|(.*?)\|")
+
+def query_yurik_api(data):
+
+ predata = [
+ ('format', 'json'),
+ ('what', 'revisions'),
+ ('rvlimit', '1'),
+ data]
+
+ data = urlencode(predata)
+ host = wikipedia.getSite().hostname()
+ address = wikipedia.getSite().query_address()
+ conn = httplib.HTTPConnection(host)
+ conn.request("GET", address + data)
+ response = conn.getresponse()
+ data = response.read()
+ conn.close()
+
+ return data
+
+def manage_query(items, mode = "titles"):
+    """Query no more than 100 titles (or revids) at a time using Yurik's API."""
+
+    global query_results, query_results2
+
+ for s in mysplit(items, 100, "|"):
+ if mode == "titles":
+ query_results.append(simplejson.loads(query_yurik_api(('titles', s))))
+
+ elif mode == 'revids':
+ query_results2.append(simplejson.loads(query_yurik_api(('revids', s))))
+ return
+
+def page_exist(title):
+ for pageobjs in query_results:
+ for key in pageobjs['pages']:
+ if pageobjs['pages'][key]['title'] == title:
+ if int(key) >= 0:
+ return True
+ wikipedia.output('* ' + title)
+ return False
+
+def revid_exist(revid):
+ for pageobjs in query_results2:
+ for id in pageobjs['pages']:
+ for rv in range(len(pageobjs['pages'][id]['revisions'])):
+ if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid):
+ # print rv
+ return True
+ wikipedia.output('* ' + revid)
+ return False
+
+cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % wikipedia.translate(wikipedia.getSite(), reports_cat))
+gen = pagegenerators.CategorizedPageGenerator(cat, recurse = True)
+
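+# For each report page in the category, rebuild the page text keeping only the
+# "=== [[Title]]" sections whose page still exists and, when a {{botbox}}
+# revision id is present, whose revision is still found.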
+for page in gen:
+ data = page.get()
+ wikipedia.output(page.title())
+ output = ''
+
+ #
+    # Preserve the text that comes before the sections
+ #
+
+ m = re.search("(?m)^==\s*[^=]*?\s*==", data)
+ if m:
+ output = data[:m.end() + 1]
+ else:
+ m = re.search("(?m)^===\s*[^=]*?", data)
+ if not m:
+ continue
+ output = data[:m.start()]
+
+ titles = headC.findall(data)
+ revids = rev_templateC.findall(data)
+
+ query_results = list()
+ query_results2 = list()
+
+ manage_query(query.ListToParam(titles))
+ manage_query(query.ListToParam(revids), "revids")
+
+ comment_entry = list()
+ add_separator = False
+ index = 0
+
+ while True:
+ head = headC.search(data, index)
+ if not head:
+ break
+ index = head.end()
+ title = head.group(1)
+ next_head = next_headC.search(data, index)
+ if next_head:
+ if separatorC.search(data[next_head.start():next_head.end()]):
+ add_separator = True
+ stop = next_head.start()
+ else:
+ stop = len(data)
+
+ exist = True
+ if page_exist(title):
+ # check {{botbox}}
+ revid = re.search("{{botbox\|.*?\|(.*?)\|", data[head.end():stop])
+ if revid:
+ if not revid_exist(revid.group(1)):
+ exist = False
+ else:
+ exist = False
+
+ if exist:
+ output += "=== [[" + title + "]]" + data[head.end():stop]
+ else:
+ comment_entry.append("[[%s]]" % title)
+
+ if add_separator:
+ output += data[next_head.start():next_head.end()] + '\n'
+ add_separator = False
+
+    add_comment = u'%s: %s' % (wikipedia.translate(wikipedia.getSite(), summary_msg), ", ".join(comment_entry))
+
+ # remove useless newlines
+ output = re.sub("(?m)^\n", "", output)
+
+ if comment_entry:
+ wikipedia.output(add_comment)
+
+    wikipedia.showDiff(data, output)
+
+    if len(sys.argv) != 1:
+ choice = wikipedia.inputChoice(u'Do you want to clean the page?', ['Yes', 'No'], ['y', 'n'], 'n')
+ if choice in ['n', 'N']:
+ continue
+ try:
+ put(page, output, add_comment)
+ except wikipedia.PageNotSaved:
+ raise
+
+wikipedia.stopme()
Added: trunk/pywikipedia/copyright_put.py
===================================================================
--- trunk/pywikipedia/copyright_put.py (rev 0)
+++ trunk/pywikipedia/copyright_put.py 2007-09-14 17:18:46 UTC (rev 4290)
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+"""
+"""
+
+#
+# (C) Francesco Cosoleto, 2006
+#
+# Distributed under the terms of the MIT license.
+#
+
+import sys, re, codecs, os, time, shutil
+import wikipedia, config
+
+from copyright import put, join_family_data, appdir, reports_cat
+
+#
+# Append month and year to the wiki save path
+append_date_to_wiki_save_path = True
+
+#
+# Add publication date to entries (template:botdate)
+append_date_to_entries = False
+
+msg_table = {
+ 'it': {'_default': [u'Pagine nuove', u'Nuove voci'],
+ 'feed': [u'Aggiunte a voci esistenti', u'Testo aggiunto in']},
+ 'en': {'_default': [u'New entries', u'New entries']}
+}
+
+wiki_save_path = {
+ '_default': u'User:%s/Report' % config.usernames[wikipedia.getSite().family.name][wikipedia.getSite().lang],
+ 'it': u'Utente:RevertBot/Report'
+}
+
+template_cat = {
+ '_default': [u'This template is used by copyright.py, a script part of [[:m:Using the python wikipediabot|PyWikipediaBot]].', u''],
+ 'it': [u'Questo template è usato dallo script copyright.py del [[:m:Using the python wikipediabot|PyWikipediaBot]].', u'Template usati da bot'],
+}
+
+wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path)
+template_cat = wikipedia.translate(wikipedia.getSite(), template_cat)
+
+separatorC = re.compile('(?m)^== +')
+
+def set_template():
+
+ site = wikipedia.getSite()
+    url = "%s://%s%s" % (site.protocol(), site.hostname(), site.path())
+
+ botdate = u"""
+<div style="text-align:right">{{{1}}}</div><noinclude>%s\n[[%s:%s]]</noinclude>
+""" % (template_cat[0], site.namespace(14), template_cat[1])
+
+ botbox = """
+<div class=plainlinks style="text-align:right">[%s?title={{{1}}}&diff={{{2}}}&oldid={{{3}}} diff] - [%s?title={{{1}}}&action=history cron] - [%s?title=Special:Log&page={{{1}}} log]</div>
+""" % url, url, url)
+
+ if append_date_to_entries:
+ p = wikipedia.Page(site, 'Template:botdate')
+        if not p.exists():
+ p.put(botdate)
+
+def output_files_gen():
+ for f in os.listdir(appdir):
+        if 'output' in f and '_pending' not in f:
+ m = re.search('output_(.*?)\.txt', f)
+ if m:
+ tag = m.group(1)
+ else:
+ tag = '_default'
+
+ section_name_and_summary = wikipedia.translate(wikipedia.getSite(), msg_table)[tag]
+
+ section = section_name_and_summary[0]
+ summary = section_name_and_summary[1]
+
+ yield appdir + f, section, summary
+
+def read_output_file(filename):
+ if os.path.isfile(filename + '_pending'):
+ shutil.move(filename, filename + '_temp')
+ ap = codecs.open(filename + '_pending', 'a', 'utf-8')
+ ot = codecs.open(filename + '_temp', 'r', 'utf-8')
+ ap.write(ot.read())
+ ap.close()
+ ot.close()
+ os.remove(filename + '_temp')
+ else:
+ shutil.move(filename, filename + '_pending')
+
+ f = codecs.open(filename + '_pending', 'r', 'utf-8')
+ data = f.read()
+ f.close()
+
+ return data
+
+if append_date_to_wiki_save_path:
+ import date
+ wiki_save_path += '_' + date.formats['MonthName'][wikipedia.getSite().language()](time.localtime()[1]) + '_' + str(time.localtime()[0])
+
+page = wikipedia.Page(wikipedia.getSite(), wiki_save_path)
+
+try:
+ wikitext = page.get()
+except wikipedia.NoPage:
+ wikipedia.output("%s not found." % page.aslink())
+ wikitext = '[[%s:%s]]\n' % (wikipedia.getSite().namespace(14), wikipedia.translate(wikipedia.getSite(), reports_cat))
+
+final_summary = u''
+output_files = list()
+
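+# Insert the content of each local output file under the matching "== section =="
+# of the report page and collect the entry titles for the edit summary.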
+for f, section, summary in output_files_gen():
+ wikipedia.output('File: \'%s\'\nSection: %s\n' % (f, section))
+
+ output_data = read_output_file(f)
+ output_files.append(f)
+
+ entries = re.findall('=== (.*?) ===', output_data)
+
+ if not entries:
+ continue
+
+ if append_date_to_entries:
+ dt = time.strftime('%d-%m-%Y %H:%M', time.localtime())
+ output_data = re.sub("(?m)^(=== \[\[.*?\]\] ===\n)", r"\1{{botdate|%s}}\n" % dt, output_data)
+
+ m = re.search('(?m)^==\s*%s\s*==' % section, wikitext)
+ if m:
+ m_end = re.search(separatorC, wikitext[m.end():])
+ if m_end:
+ wikitext = wikitext[:m_end.start() + m.end()] + output_data + wikitext[m_end.start() + m.end():]
+ else:
+ wikitext += '\n' + output_data
+ else:
+ wikitext += '\n' + output_data
+
+ if final_summary:
+ final_summary += ' '
+ final_summary += u'%s: %s' % (summary, ', '.join(entries))
+
+if final_summary:
+ wikipedia.output(final_summary + '\n')
+
+    # If a checked page is in the 'Image' or 'Category' namespace, prefix the
+    # section title with ':' in order to avoid wiki markup side effects.
+
+    wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Image', 6), ur'=== [[:\1:', wikitext)
+    wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Category', 14), ur'=== [[:\1:', wikitext)
+
+    # TODO:
+    # List frequently rejected addresses to improve the upload process.
+
+    wikitext = re.sub('http://(.*?)((forumcommunity|forumfree)\.net)', r'<blacklist>\1\2', wikitext)
+
+    if len(final_summary) >= 200:
+        final_summary = final_summary[:200]
+        final_summary = final_summary[:final_summary.rindex("[") - 3] + "..."
+
+ try:
+        put(page, wikitext, comment = final_summary)
+ for f in output_files:
+ os.remove(f + '_pending')
+ wikipedia.output("\'%s\' deleted." % f)
+ except wikipedia.PageNotSaved:
+ raise
+
+wikipedia.stopme()