Revision: 5530 Author: cosoleto Date: 2008-06-08 15:35:32 +0000 (Sun, 08 Jun 2008)
Log Message: ----------- Update the longpages() regex as well, plus more fixes to avoid errors when converting number strings that contain separators to int
Modified Paths: -------------- trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py =================================================================== --- trunk/pywikipedia/wikipedia.py 2008-06-08 15:06:43 UTC (rev 5529) +++ trunk/pywikipedia/wikipedia.py 2008-06-08 15:35:32 UTC (rev 5530) @@ -4673,14 +4673,15 @@ # implemented is fairly useless # this comment applies to all the XXXXpages methods following, as well seen = set() + path = self.longpages_address(n=number) + entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">.+?</a>) .<a href=".+?" title="(?P<title>.+?)">.+?</a> .[(?P<length>[\d.,]+).*?]</li>', re.UNICODE) + while True: - path = self.longpages_address(n=number) get_throttle() html = self.getUrl(path) - entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">hist</a>) .<a href=".+?" title="(?P<title>.+?)">.+?</a> .[(?P<length>\d+)(.+?)]</li>', re.UNICODE) for m in entryR.finditer(html): title = m.group('title') - length = int(m.group('length')) + length = int(re.sub('[.,]', '', m.group('length'))) if title not in seen: seen.add(title) page = Page(self, title) @@ -4692,15 +4693,16 @@ """Yield Pages and lengths from Special:Shortpages.""" throttle = True seen = set() + path = self.shortpages_address(n = number) + entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">.+?</a>) .<a href=".+?" title="(?P<title>.+?)">.+?</a> .[(?P<length>[\d.,]+).*?]</li>', re.UNICODE) + while True: - path = self.shortpages_address(n = number) get_throttle() html = self.getUrl(path) - entryR = re.compile(ur'<li>(<a href=".+?" title=".+?">.+?</a>) .<a href=".+?" title="(?P<title>.+?)">.+?</a> .[(?P<length>[\d.,]+).*?]</li>', re.UNICODE)
for m in entryR.finditer(html): title = m.group('title') - length = int(m.group('length')) + length = int(re.sub('[., ]', '', m.group('length')))
if title not in seen: seen.add(title)