Revision: 4290
Author:   cosoleto
Date:     2007-09-14 17:18:46 +0000 (Fri, 14 Sep 2007)
Log Message:
-----------
Added scripts to put on and remove from wiki pages the reports generated by copyright.py. It is partly old code, but it is working code. copyright_clean.py uses the Yurik API.
Modified Paths:
--------------
    trunk/pywikipedia/copyright.py

Added Paths:
-----------
    trunk/pywikipedia/copyright_clean.py
    trunk/pywikipedia/copyright_put.py
Modified: trunk/pywikipedia/copyright.py
===================================================================
--- trunk/pywikipedia/copyright.py	2007-09-14 16:34:17 UTC (rev 4289)
+++ trunk/pywikipedia/copyright.py	2007-09-14 17:18:46 UTC (rev 4290)
@@ -106,6 +106,10 @@
     #('sv', 'Wikipedia:Spegelsidor', 'Spegelsidor.txt'),
 ]
 
+reports_cat = {
+    'it': u'Segnalazioni automatiche sospetti problemi di copyright',
+}
+
 wikipedia_names = {
     '--': u'Wikipedia',
     'am': u'ዊኪፔድያ',
@@ -247,12 +251,12 @@
     for page, path in exclusion_file_list():
         try:
             if not os.path.exists(path):
-                print 'Creating file \'%s\' ([[%s]])' % (path, page.title())
+                print 'Creating file \'%s\' (%s)' % (path, page.aslink())
                 force_update = True
             else:
                 file_age = time.time() - os.path.getmtime(path)
                 if file_age > 24 * 60 * 60:
-                    print 'Updating file \'%s\' ([[%s]])' % (path, page.title())
+                    print 'Updating file \'%s\' (%s)' % (path, page.aslink())
                     force_update = True
         except OSError:
             raise
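The new reports_cat entry is the localized title of the category that the two new scripts use to locate the bot's report pages. Below is a minimal sketch, under the assumption of a user-config.py pointing at it.wikipedia, of how that lookup is meant to be consumed; it mirrors the category construction copyright_clean.py performs in the diff that follows.

# Sketch only: resolve the per-language report category the way
# copyright_clean.py does; the 'it' title comes from the hunk above.
import wikipedia, catlib
from copyright import reports_cat

site = wikipedia.getSite()  # assumed to be configured for 'it' in user-config.py
cat_title = wikipedia.translate(site, reports_cat)
reports_category = catlib.Category(site, 'Category:%s' % cat_title)
wikipedia.output(u'Reports category: %s' % reports_category.title())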
Added: trunk/pywikipedia/copyright_clean.py
===================================================================
--- trunk/pywikipedia/copyright_clean.py	(rev 0)
+++ trunk/pywikipedia/copyright_clean.py	2007-09-14 17:18:46 UTC (rev 4290)
@@ -0,0 +1,169 @@
+# -*- coding: utf-8 -*-
+"""
+"""
+
+#
+# (C) Francesco Cosoleto, 2006
+#
+# Distributed under the terms of the MIT license.
+#
+
+import httplib, socket, simplejson, re, time
+import config, wikipedia, catlib, pagegenerators, query
+
+from urllib import urlencode
+from copyright import mysplit, put, reports_cat
+
+import sys
+
+summary_msg = {
+    'en': u'Removing',
+    'it': u'Rimozione',
+}
+
+headC = re.compile("(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?\[\[(?::)?(.*?)\]\]")
+separatorC = re.compile('(?m)^== +')
+next_headC = re.compile("(?m)^=+.*?=+")
+
+#
+# {{/box|title|newid|oldid|author|...}}
+rev_templateC = re.compile("(?m)^(?:\{\{/t\|.*?\}\}\n?)?\{\{/box\|.*?\|(.*?)\|")
+
+def query_yurik_api(data):
+
+    predata = [
+        ('format', 'json'),
+        ('what', 'revisions'),
+        ('rvlimit', '1'),
+        data]
+
+    data = urlencode(predata)
+    host = wikipedia.getSite().hostname()
+    address = wikipedia.getSite().query_address()
+    conn = httplib.HTTPConnection(host)
+    conn.request("GET", address + data)
+    response = conn.getresponse()
+    data = response.read()
+    conn.close()
+
+    return data
+
+def manage_query(items, mode = "titles"):
+    """Query at most 100 titles at a time using Yurik's API."""
+
+    global query_results
+
+    for s in mysplit(items, 100, "|"):
+        if mode == "titles":
+            query_results.append(simplejson.loads(query_yurik_api(('titles', s))))
+
+        elif mode == 'revids':
+            query_results2.append(simplejson.loads(query_yurik_api(('revids', s))))
+    return
+
+def page_exist(title):
+    for pageobjs in query_results:
+        for key in pageobjs['pages']:
+            if pageobjs['pages'][key]['title'] == title:
+                if int(key) >= 0:
+                    return True
+    wikipedia.output('* ' + title)
+    return False
+
+def revid_exist(revid):
+    for pageobjs in query_results2:
+        for id in pageobjs['pages']:
+            for rv in range(len(pageobjs['pages'][id]['revisions'])):
+                if pageobjs['pages'][id]['revisions'][rv]['revid'] == int(revid):
+                    # print rv
+                    return True
+    wikipedia.output('* ' + revid)
+    return False
+
+cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % wikipedia.translate(wikipedia.getSite(), reports_cat))
+gen = pagegenerators.CategorizedPageGenerator(cat, recurse = True)
+
+for page in gen:
+    data = page.get()
+    wikipedia.output(page.title())
+    output = ''
+
+    #
+    # Preserve the text that comes before the first section.
+    #
+    m = re.search("(?m)^==\s*[^=]*?\s*==", data)
+    if m:
+        output = data[:m.end() + 1]
+    else:
+        m = re.search("(?m)^===\s*[^=]*?", data)
+        if not m:
+            continue
+        output = data[:m.start()]
+
+    titles = headC.findall(data)
+    revids = rev_templateC.findall(data)
+
+    query_results = list()
+    query_results2 = list()
+
+    manage_query(query.ListToParam(titles))
+    manage_query(query.ListToParam(revids), "revids")
+
+    comment_entry = list()
+    add_separator = False
+    index = 0
+
+    while True:
+        head = headC.search(data, index)
+        if not head:
+            break
+        index = head.end()
+        title = head.group(1)
+        next_head = next_headC.search(data, index)
+        if next_head:
+            if separatorC.search(data[next_head.start():next_head.end()]):
+                add_separator = True
+            stop = next_head.start()
+        else:
+            stop = len(data)
+
+        exist = True
+        if page_exist(title):
+            # check {{botbox}}
+            revid = re.search("\{\{botbox\|.*?\|(.*?)\|", data[head.end():stop])
+            if revid:
+                if not revid_exist(revid.group(1)):
+                    exist = False
+        else:
+            exist = False
+
+        if exist:
+            output += "=== [[" + title + "]]" + data[head.end():stop]
+        else:
+            comment_entry.append("[[%s]]" % title)
+
+        if add_separator:
+            output += data[next_head.start():next_head.end()] + '\n'
+            add_separator = False
+
+    add_comment = u'%s: %s' % (wikipedia.translate(wikipedia.getSite(), summary_msg), ", ".join(comment_entry))
+
+    # remove useless newlines
+    output = re.sub("(?m)^\n", "", output)
+
+    if comment_entry:
+        wikipedia.output(add_comment)
+
+        wikipedia.showDiff(output, page.get())
+
+        if len(sys.argv) != 1:
+            choice = wikipedia.inputChoice(u'Do you want to clean the page?', ['Yes', 'No'], ['y', 'n'], 'n')
+            if choice in ['n', 'N']:
+                continue
+
+        try:
+            put(page, output, add_comment)
+        except wikipedia.PageNotSaved:
+            raise
+
+wikipedia.stopme()
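copyright_clean.py checks every title and revision id found in a report against the live wiki through Yurik's query.php interface before deciding whether a section can be dropped. The fragment below is only a rough, self-contained sketch of the request that query_yurik_api() builds; the host name, the '/w/query.php?' address and the page title are illustrative stand-ins for what wikipedia.getSite().hostname() and query_address() would return, not guaranteed values.

# Illustrative sketch of the GET request built by query_yurik_api();
# host, address and title are assumptions standing in for Site methods.
import httplib
from urllib import urlencode

params = urlencode([
    ('format', 'json'),
    ('what', 'revisions'),
    ('rvlimit', '1'),
    ('titles', 'Pagina di prova'),   # hypothetical title to check
])

host = 'it.wikipedia.org'            # wikipedia.getSite().hostname()
address = '/w/query.php?'            # assumed form of query_address()

conn = httplib.HTTPConnection(host)
conn.request("GET", address + params)
raw = conn.getresponse().read()
conn.close()

# Pages that do not exist come back under a negative page id in the JSON
# answer, which is why page_exist() only accepts keys with int(key) >= 0.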
Added: trunk/pywikipedia/copyright_put.py
===================================================================
--- trunk/pywikipedia/copyright_put.py	(rev 0)
+++ trunk/pywikipedia/copyright_put.py	2007-09-14 17:18:46 UTC (rev 4290)
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+"""
+"""
+
+#
+# (C) Francesco Cosoleto, 2006
+#
+# Distributed under the terms of the MIT license.
+#
+
+import sys, re, codecs, os, time, shutil
+import wikipedia, config
+
+from copyright import put, join_family_data, appdir, reports_cat
+
+#
+# Month + Year save method
+append_date_to_wiki_save_path = True
+
+#
+# Add publication date to entries (template:botdate)
+append_date_to_entries = False
+
+msg_table = {
+    'it': {'_default': [u'Pagine nuove', u'Nuove voci'],
+           'feed': [u'Aggiunte a voci esistenti', u'Testo aggiunto in']},
+    'en': {'_default': [u'New entries', u'New entries']}
+}
+
+wiki_save_path = {
+    '_default': u'User:%s/Report' % config.usernames[wikipedia.getSite().family.name][wikipedia.getSite().lang],
+    'it': u'Utente:RevertBot/Report'
+}
+
+template_cat = {
+    '_default': [u'This template is used by copyright.py, a script part of [[:m:Using the python wikipediabot|PyWikipediaBot]].', u''],
+    'it': [u'Questo template è usato dallo script copyright.py del [[:m:Using the python wikipediabot|PyWikipediaBot]].', u'Template usati da bot'],
+}
+
+wiki_save_path = wikipedia.translate(wikipedia.getSite(), wiki_save_path)
+template_cat = wikipedia.translate(wikipedia.getSite(), template_cat)
+
+separatorC = re.compile('(?m)^== +')
+
+def set_template():
+
+    site = wikipedia.getSite()
+    url = "%s://%s%s" % (site.protocol(), site.hostname(), site.path())
+
+    botdate = u"""
+<div style="text-align:right">{{{1}}}</div><noinclude>%s\n[[%s:%s]]</noinclude>
+""" % (template_cat[0], site.namespace(14), template_cat[1])
+
+    botbox = """
+<div class=plainlinks style="text-align:right">[%s?title={{{1}}}&diff={{{2}}}&oldid={{{3}}} diff] - [%s?title={{{1}}}&action=history cron] - [%s?title=Special:Log&page={{{1}}} log]</div>
+""" % (url, url, url)
+
+    if append_date_to_entries:
+        p = wikipedia.Page(site, 'Template:botdate')
+        if not p.exists():
+            p.put(botdate)
+
+def output_files_gen():
+    for f in os.listdir(appdir):
+        if 'output' in f and not '_pending' in f:
+            m = re.search('output_(.*?).txt', f)
+            if m:
+                tag = m.group(1)
+            else:
+                tag = '_default'
+
+            section_name_and_summary = wikipedia.translate(wikipedia.getSite(), msg_table)[tag]
+
+            section = section_name_and_summary[0]
+            summary = section_name_and_summary[1]
+
+            yield appdir + f, section, summary
+
+def read_output_file(filename):
+    if os.path.isfile(filename + '_pending'):
+        shutil.move(filename, filename + '_temp')
+        ap = codecs.open(filename + '_pending', 'a', 'utf-8')
+        ot = codecs.open(filename + '_temp', 'r', 'utf-8')
+        ap.write(ot.read())
+        ap.close()
+        ot.close()
+        os.remove(filename + '_temp')
+    else:
+        shutil.move(filename, filename + '_pending')
+
+    f = codecs.open(filename + '_pending', 'r', 'utf-8')
+    data = f.read()
+    f.close()
+
+    return data
+
+if append_date_to_wiki_save_path:
+    import date
+    wiki_save_path += '_' + date.formats['MonthName'][wikipedia.getSite().language()](time.localtime()[1]) + '_' + str(time.localtime()[0])
+
+page = wikipedia.Page(wikipedia.getSite(), wiki_save_path)
+
+try:
+    wikitext = page.get()
+except wikipedia.NoPage:
+    wikipedia.output("%s not found." % page.aslink())
+    wikitext = '[[%s:%s]]\n' % (wikipedia.getSite().namespace(14), wikipedia.translate(wikipedia.getSite(), reports_cat))
+
+final_summary = u''
+output_files = list()
+
+for f, section, summary in output_files_gen():
+    wikipedia.output('File: \'%s\'\nSection: %s\n' % (f, section))
+
+    output_data = read_output_file(f)
+    output_files.append(f)
+
+    entries = re.findall('=== (.*?) ===', output_data)
+
+    if not entries:
+        continue
+
+    if append_date_to_entries:
+        dt = time.strftime('%d-%m-%Y %H:%M', time.localtime())
+        output_data = re.sub("(?m)^(=== \[\[.*?\]\] ===\n)", r"\1{{botdate|%s}}\n" % dt, output_data)
+
+    m = re.search('(?m)^==\s*%s\s*==' % section, wikitext)
+    if m:
+        m_end = re.search(separatorC, wikitext[m.end():])
+        if m_end:
+            wikitext = wikitext[:m_end.start() + m.end()] + output_data + wikitext[m_end.start() + m.end():]
+        else:
+            wikitext += '\n' + output_data
+    else:
+        wikitext += '\n' + output_data
+
+    if final_summary:
+        final_summary += ' '
+    final_summary += u'%s: %s' % (summary, ', '.join(entries))
+
+if final_summary:
+    wikipedia.output(final_summary + '\n')
+
+    # if a page in the 'Image' or 'Category' namespace is checked, fix the
+    # section title by adding ':' in order to avoid wiki code effects.
+    wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Image', 6), ur'=== [[:\1:', wikitext)
+    wikitext = re.sub(u'(?i)=== \[\[%s:' % join_family_data('Category', 14), ur'=== [[:\1:', wikitext)
+
+    # TODO:
+    # List of frequently rejected addresses to improve the upload process.
+
+    wikitext = re.sub('http://(.*?)((forumcommunity|forumfree).net)', r'<blacklist>\1\2', wikitext)
+
+    if len(final_summary) >= 200:
+        final_summary = final_summary[:200]
+        final_summary = final_summary[:final_summary.rindex("[") - 3] + "..."
+
+    try:
+        put(page, wikitext, comment = final_summary)
+        for f in output_files:
+            os.remove(f + '_pending')
+            wikipedia.output("'%s' deleted." % f)
+    except wikipedia.PageNotSaved:
+        raise
+
wikipedia.stopme()
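When append_date_to_wiki_save_path is enabled, copyright_put.py appends the current month and year to the report page title, so each month's findings land on their own page. The snippet below is only a sketch of that naming scheme: the real script obtains a localized month name from pywikipedia's date module, while plain strftime is used here just to keep the example self-contained, and the base title is the 'it' value from the diff above.

# Sketch of the monthly report page naming; strftime stands in for the
# localized month name the script actually takes from the date module.
import time

wiki_save_path = u'Utente:RevertBot/Report'
wiki_save_path += '_%s_%s' % (time.strftime('%B'), time.strftime('%Y'))
# e.g. u'Utente:RevertBot/Report_September_2007': one page per month, so
# older listings stay archived on the previous month's page.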