Am I correct that the new method uses regex to parse XML? Somebody, please tell me it's not true! We have JSON, which works perfectly for all sorts of parsing in Python, and we have proper XML libraries. Parsing XML with regex is just plain silly - we risk incorrect XML parsing behavior.
What is unstable about the API? The revisions query has been available for half a year without changes.
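Even if we stay on Special:Export, the standard library can parse that XML properly. A rough sketch (untested, not pywikipedia code - the element names just follow the MediaWiki export format: <revision>, <timestamp>, <contributor> with <username>/<ip>, <text>) of what the parsing step could look like with xml.dom.minidom:

    from xml.dom import minidom

    def parse_export(xml_data):
        """Return (timestamp, user, text) tuples from a Special:Export dump."""
        def text_of(node):
            # Concatenate the text children of an element.
            return ''.join(child.data for child in node.childNodes
                           if child.nodeType == child.TEXT_NODE)

        revisions = []
        dom = minidom.parseString(xml_data)
        for rev in dom.getElementsByTagName('revision'):
            timestamp = text_of(rev.getElementsByTagName('timestamp')[0])
            contributor = rev.getElementsByTagName('contributor')[0]
            # Registered users get <username>, anonymous edits get <ip>.
            names = (contributor.getElementsByTagName('username') or
                     contributor.getElementsByTagName('ip'))
            user = text_of(names[0]) if names else ''
            texts = rev.getElementsByTagName('text')
            text = text_of(texts[0]) if texts else ''
            revisions.append((timestamp, user, text))
        return revisions

minidom also takes care of entity unescaping, so the separate unescape() calls needed by the regex version would go away.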
--Yurik
On 9/28/07, btongminh@svn.wikimedia.org <btongminh@svn.wikimedia.org> wrote:
Revision: 4375
Author: btongminh
Date: 2007-09-28 20:20:59 +0000 (Fri, 28 Sep 2007)
Log Message:
Page.fullVersionHistory uses Special:Export again, which is basically a modified revert to r3659. The API is too unstable for revision fetching.
Modified Paths:
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
--- trunk/pywikipedia/wikipedia.py	2007-09-28 18:51:41 UTC (rev 4374)
+++ trunk/pywikipedia/wikipedia.py	2007-09-28 20:20:59 UTC (rev 4375)
@@ -1694,64 +1694,33 @@
         result += '|}\n'
         return result

-    def fullVersionHistory(self, max = 50, comment = False, since = None):
+    def fullVersionHistory(self):
         """
         Returns all previous versions. Gives a list of tuples consisting of
         edit date/time, user name and content
         """
-        RV_LIMIT = 50
-        address = self.site().api_address()
+        address = self.site().export_address()
         predata = {
-            'action': 'query',
-            'prop': 'revisions',
-            'titles': self.title(),
-            'rvprop': 'timestamp|user|comment|content',
-            'rvlimit': str(RV_LIMIT),
-            'format': 'json'
+            'action': 'submit',
+            'pages': self.title()
         }
-        if max < RV_LIMIT:
-            predata['rvlimit'] = str(max)
-        if since:
-            predata['rvend'] = since
         get_throttle(requestsize = 10)
         now = time.time()
-        count = 0
-        output = []
-        while count < max and max != -1:
-            if self.site().hostname() in config.authenticate.keys():
-                predata["Content-type"] = "application/x-www-form-urlencoded"
-                predata["User-agent"] = useragent
-                data = self.site.urlEncode(predata)
-                response = urllib2.urlopen(urllib2.Request(self.site.protocol() + '://' + self.site.hostname() + address, data))
-                data = response.read().decode(self.site().encoding())
-            else:
-                response, data = self.site().postForm(address, predata)
-            get_throttle.setDelay(time.time() - now)
-            data = simplejson.loads(data)
-            page = data['query']['pages'].values()[0]
-            if 'missing' in page:
-                raise NoPage, 'Page %s not found' % self
-            revisions = page.get('revisions', ())
-            for revision in revisions:
-                if not comment:
-                    output.append((revision['timestamp'],
-                        revision['user'], revision.get('*', u'')))
-                else:
-                    output.append((revision['timestamp'], revision['user'],
-                        revision.get('*', u''), revision.get('comment', u'')))
-            count += len(revisions)
-            if max - count < RV_LIMIT:
-                predata['rvlimit'] = str(max - count)
-            if 'query-continue' in data:
-                predata['rvstartid'] = str(data['query-continue']['revisions']['rvstartid'])
-            else:
-                break
-        return output
-
-    fullRevisionHistory = fullVersionHistory
-
+        if self.site().hostname() in config.authenticate.keys():
+            predata["Content-type"] = "application/x-www-form-urlencoded"
+            predata["User-agent"] = useragent
+            data = self.site.urlEncode(predata)
+            response = urllib2.urlopen(urllib2.Request('http://' + self.site.hostname() + address, data))
+            data = response.read()
+        else:
+            response, data = self.site().postForm(address, predata)
+            data = data.encode(self.site().encoding())
+        get_throttle.setDelay(time.time() - now)
+        r = re.compile("\<revision\>.*?\<timestamp\>(.*?)\<\/timestamp\>.*?\<(?:ip|username)\>(.*?)\</(?:ip|username)\>.*?\<text.*?\>(.*?)\<\/text\>",re.DOTALL)
+        #r = re.compile("\<revision\>.*?\<timestamp\>(.*?)\<\/timestamp\>.*?\<(?:ip|username)\>(.*?)\<",re.DOTALL)
+        return [(match.group(1), unescape(match.group(2)), unescape(match.group(3))) for match in r.finditer(data)]

     def contributingUsers(self):
         """
         Returns a set of all user names (including anonymous IPs) of those who
Pywikipedia-l mailing list
Pywikipedia-l@lists.wikimedia.org
http://lists.wikimedia.org/mailman/listinfo/pywikipedia-l