Revision: 5196
Author:   filnik
Date:     2008-04-08 17:40:08 +0000 (Tue, 08 Apr 2008)

Log Message:
-----------
Another fix to the regex

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2008-04-08 16:56:41 UTC (rev 5195)
+++ trunk/pywikipedia/wikipedia.py	2008-04-08 17:40:08 UTC (rev 5196)
@@ -4641,13 +4641,12 @@
         """Yield ImagePages from Special:Log&type=upload"""
 
         seen = set()
-        regexp = re.compile(r'(?:<li[^>]*>|<div class="mw-log-entry"[^>]*>)(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+(.+?</a>).*?<a href=".*?"(?P<new> class="new")? title=".*?"\s*>(?P<image>.+?)</a>(?:.*?<span class="comment">(?P<comment>.*?)</span>)?', re.UNICODE)
-
+        regexp = re.compile(r'(?:<li[^>]*>|<div class="mw-log-entry">)(?P<date>.+?)\s+<a href=.*?>(?P<user>.+?)</a>\s+(.+?</a>).*?<a href=".*?"(?P<new> class="new")? title=".*?"\s*>(?P<image>.+?)</a>(?:.*?<span class="comment">((?P<comment>.*?))</span>)?', re.UNICODE)
         while True:
             path = self.log_address(number, mode = 'upload')
             get_throttle()
             html = self.getUrl(path)
-
+            print regexp.findall(html)
             for m in regexp.finditer(html):
                 image = m.group('image')
 
@@ -4661,7 +4660,6 @@
                 date = m.group('date')
                 user = m.group('user')
                 comment = m.group('comment') or ''
-
                 yield ImagePage(self, image), date, user, comment
             if not repeat:
                 break