Revision: 4063
Author: jitseniesen
Date: 2007-08-18 08:04:00 +0000 (Sat, 18 Aug 2007)
Log Message:
-----------
_ParseCategory(): Do not assume English interface
Modified Paths:
--------------
trunk/pywikipedia/catlib.py
Modified: trunk/pywikipedia/catlib.py
===================================================================
--- trunk/pywikipedia/catlib.py 2007-08-18 07:35:12 UTC (rev 4062)
+++ trunk/pywikipedia/catlib.py 2007-08-18 08:04:00 UTC (rev 4063)
@@ -177,7 +177,7 @@
                 '<div class\s?=\s?"thumb"\sstyle="[^"]*"><a href=".*?"\s?title\s?=\s?"([^"]*)"')
         ns = self.site().category_namespaces()
         # regular expression matching the "(next 200)" link
-        RLinkToNextPage = re.compile('&from=(.*?)" title="[^"]*">next 200</a>');
+        RLinkToNextPage = re.compile('&from=(.*?)" title="');

         currentPageOffset = startFrom
         while True:
@@ -197,12 +197,15 @@
             # index where subcategory listing begins
             try:
                 ibegin = txt.index('<div id="mw-subcategories">')
+                skippedCategoryDescription = True
             except ValueError:
                 try:
                     ibegin = txt.index('<div id="mw-pages">')
+                    skippedCategoryDescription = True
                 except ValueError:
                     try:
                         ibegin = txt.index('<!-- start content -->') # does not work for cats without text
+                        skippedCategoryDescription = False
                     except ValueError:
                         wikipedia.output("\nCategory page detection is not bug free. Please report this error!")
                         raise
@@ -237,13 +240,19 @@
                 for title in Rimage.findall(txt):
                     yield ARTICLE, wikipedia.Page(self.site(), title)
             # try to find a link to the next list page
-            matchObj = RLinkToNextPage.search(txt)
-            if matchObj:
-                currentPageOffset = matchObj.group(1)
-                wikipedia.output('There are more articles in %s.'
-                                 % self.title())
+            # If skippedCategoryDescription is False, then there are no pages
+            # or subcategories, so there cannot be a next list page
+            if skippedCategoryDescription:
+                matchObj = RLinkToNextPage.search(txt)
+                if matchObj:
+                    currentPageOffset = matchObj.group(1)
+                    wikipedia.output('There are more articles in %s.'
+                                     % self.title())
+                else:
+                    break
             else:
                 break
+
         # get supercategories
         try:
             ibegin = self_txt.index('<div id="catlinks">')