Revision: 5793 Author: cosoleto Date: 2008-08-09 12:37:50 +0000 (Sat, 09 Aug 2008)
Log Message: ----------- Disabled code to access to obsolete Query API (http://lists.wikimedia.org/pipermail/mediawiki-api/2008-July/000620.html), added similar code for MediaWiki API (but I think I have to rewrite this script...). Minor bug fixes.
Modified Paths: -------------- trunk/pywikipedia/copyright_clean.py
Modified: trunk/pywikipedia/copyright_clean.py =================================================================== --- trunk/pywikipedia/copyright_clean.py 2008-08-08 14:24:54 UTC (rev 5792) +++ trunk/pywikipedia/copyright_clean.py 2008-08-09 12:37:50 UTC (rev 5793) @@ -12,7 +12,7 @@ import config, wikipedia, catlib, pagegenerators, query
from urllib import urlencode -from copyright import mysplit, put, reports_cat +from copyright import mysplit, put, reports_cat, join_family_data
import sys
@@ -22,6 +22,7 @@ 'it': u'Rimozione', }
+#"(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?[[(?::)?(.*?)(?:#.*?)?]]" headC = re.compile("(?m)^=== (?:<strike>)?(?:<s>)?(?:<del>)?[[(?::)?(.*?)]]") separatorC = re.compile('(?m)^== +') next_headC = re.compile("(?m)^=+.*?=+") @@ -30,8 +31,24 @@ # {{botbox|title|newid|oldid|author|...}} rev_templateC = re.compile("(?m)^(?:{{/t|.*?}}\n?)?{{(?:/box|botbox)|.*?|(.*?)|")
-def query_yurik_api(data): +def query_api(data): + predata = [ + ('format', 'json'), + ('action', 'query'), + ('prop', 'revisions'), + data] + data = urlencode(predata) + host = wikipedia.getSite().hostname() + address = wikipedia.getSite().api_address() + conn = httplib.HTTPConnection(host) + conn.request("GET", address + data) + response = conn.getresponse() + data = response.read() + conn.close() + return data
+def query_old_api(data): + predata = [ ('format', 'json'), ('what', 'revisions'), @@ -49,7 +66,7 @@
return data
-def page_exist(title): +def old_page_exist(title): for pageobjs in query_results_titles: for key in pageobjs['pages']: if pageobjs['pages'][key]['title'] == title: @@ -58,7 +75,7 @@ wikipedia.output('* ' + title) return False
-def revid_exist(revid): +def old_revid_exist(revid): for pageobjs in query_results_revids: for id in pageobjs['pages']: for rv in range(len(pageobjs['pages'][id]['revisions'])): @@ -68,6 +85,25 @@ wikipedia.output('* ' + revid) return False
+def page_exist(title): + for pageobjs in query_results_titles: + for key in pageobjs['query']['pages']: + if pageobjs['query']['pages'][key]['title'] == title: + if pageobjs['query']['pages'][key].has_key('missing'): + wikipedia.output('* ' + title) + return False + return True + +def revid_exist(revid): + for pageobjs in query_results_revids: + if pageobjs['query'].has_key('badrevids'): + for id in pageobjs['query']['badrevids']: + if id == int(revid): + # print rv + wikipedia.output('* ' + revid) + return False + return True + cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % wikipedia.translate(wikipedia.getSite(), reports_cat)) gen = pagegenerators.CategorizedPageGenerator(cat, recurse = True)
@@ -90,16 +126,17 @@ output = data[:m.start()]
titles = headC.findall(data) + titles = [re.sub("#.*", "", item) for item in titles] revids = rev_templateC.findall(data)
query_results_titles = list() query_results_revids = list()
- # No more of 100 titles at a time using Yurik's API + # No more of 100 titles at a time using old Query API for s in mysplit(query.ListToParam(titles), 100, "|"): - query_results_titles.append(simplejson.loads(query_yurik_api(('titles', s)))) + query_results_titles.append(simplejson.loads(query_api(('titles', s)))) for s in mysplit(query.ListToParam(revids), 100, "|"): - query_results_revids.append(simplejson.loads(query_yurik_api(('revids', s)))) + query_results_revids.append(simplejson.loads(query_api(('revids', s))))
comment_entry = list() add_separator = False @@ -110,7 +147,7 @@ if not head: break index = head.end() - title = head.group(1) + title = re.sub("#.*", "", head.group(1)) next_head = next_headC.search(data, index) if next_head: if separatorC.search(data[next_head.start():next_head.end()]): @@ -130,7 +167,9 @@ exist = False
if exist: - output += "=== [[" + title + "]]" + data[head.end():stop] + ctitle = re.sub(u'(?i)=== [[%s:' % join_family_data('Image', 6), ur'=== [[:\1:', title) + ctitle = re.sub(u'(?i)=== [[%s:' % join_family_data('Category', 14), ur'=== [[:\1:', ctitle) + output += "=== [[" + ctitle + "]]" + data[head.end():stop] else: comment_entry.append("[[%s]]" % title)
pywikipedia-l@lists.wikimedia.org