Am I correct that the new method uses regex to parse XML??? Somebody,
please tell me it's not true!!! We have JSON that works perfectly for
all sorts of parsing in Python. We have XML libraries. Regex is just
plain silly - we risk incorrect XML parsing behavior.
What is unstable about the API? Revisions query has been available for
half a year without changes.
--Yurik
On 9/28/07, btongminh(a)svn.wikimedia.org <btongminh(a)svn.wikimedia.org> wrote:
Revision: 4375
Author: btongminh
Date: 2007-09-28 20:20:59 +0000 (Fri, 28 Sep 2007)
Log Message:
-----------
Page.fullVersionHistory uses Special:Export again, which is basically a modified revert
to r3659. The API is too unstable for revision fetching.
Modified Paths:
--------------
trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py 2007-09-28 18:51:41 UTC (rev 4374)
+++ trunk/pywikipedia/wikipedia.py 2007-09-28 20:20:59 UTC (rev 4375)
@@ -1694,64 +1694,33 @@
result += '|}\n'
return result
- def fullVersionHistory(self, max = 50, comment = False, since = None):
+ def fullVersionHistory(self):
"""
Returns all previous versions. Gives a list of tuples consisting of
edit date/time, user name and content
"""
- RV_LIMIT = 50
-
- address = self.site().api_address()
+ address = self.site().export_address()
predata = {
- 'action': 'query',
- 'prop': 'revisions',
- 'titles': self.title(),
- 'rvprop': 'timestamp|user|comment|content',
- 'rvlimit': str(RV_LIMIT),
- 'format': 'json'
+ 'action': 'submit',
+ 'pages': self.title()
}
- if max < RV_LIMIT: predata['rvlimit'] = str(max)
- if since: predata['rvend'] = since
-
get_throttle(requestsize = 10)
now = time.time()
-
- count = 0
+ if self.site().hostname() in config.authenticate.keys():
+ predata["Content-type"] =
"application/x-www-form-urlencoded"
+ predata["User-agent"] = useragent
+ data = self.site.urlEncode(predata)
+ response = urllib2.urlopen(urllib2.Request('http://' +
self.site.hostname() + address, data))
+ data = response.read()
+ else:
+ response, data = self.site().postForm(address, predata)
+ data = data.encode(self.site().encoding())
+ get_throttle.setDelay(time.time() - now)
output = []
+ r =
re.compile("\<revision\>.*?\<timestamp\>(.*?)\<\/timestamp\>.*?\<(?:ip|username)\>(.*?)\</(?:ip|username)\>.*?\<text.*?\>(.*?)\<\/text\>",re.DOTALL)
+ #r =
re.compile("\<revision\>.*?\<timestamp\>(.*?)\<\/timestamp\>.*?\<(?:ip|username)\>(.*?)\<",re.DOTALL)
+ return [(match.group(1), unescape(match.group(2)), unescape(match.group(3))) for
match in r.finditer(data)]
- while count < max and max != -1:
- if self.site().hostname() in config.authenticate.keys():
- predata["Content-type"] =
"application/x-www-form-urlencoded"
- predata["User-agent"] = useragent
- data = self.site.urlEncode(predata)
- response = urllib2.urlopen(urllib2.Request(self.site.protocol() +
'://' + self.site.hostname() + address, data))
- data = response.read().decode(self.site().encoding())
- else:
- response, data = self.site().postForm(address, predata)
-
- get_throttle.setDelay(time.time() - now)
- data = simplejson.loads(data)
- page = data['query']['pages'].values()[0]
- if 'missing' in page:
- raise NoPage, 'Page %s not found' % self
- revisions = page.get('revisions', ())
- for revision in revisions:
- if not comment:
- output.append((revision['timestamp'],
- revision['user'], revision.get('*',
u'')))
- else:
- output.append((revision['timestamp'],
revision['user'],
- revision.get('*', u''),
revision.get('comment', u'')))
- count += len(revisions)
- if max - count < RV_LIMIT:
- predata['rvlimit'] = str(max - count)
- if 'query-continue' in data:
- predata['rvstartid'] =
str(data['query-continue']['revisions']['rvstartid'])
- else:
- break
- return output
- fullRevisionHistory = fullVersionHistory
-
def contributingUsers(self):
"""
Returns a set of all user names (including anonymous IPs) of those who
_______________________________________________
Pywikipedia-l mailing list
Pywikipedia-l(a)lists.wikimedia.org
http://lists.wikimedia.org/mailman/listinfo/pywikipedia-l