[Pywikipedia-l] SVN: [4063] trunk/pywikipedia/catlib.py

18 Aug 2007

Revision: 4063
Author:   jitseniesen
Date:     2007-08-18 08:04:00 +0000 (Sat, 18 Aug 2007)
Log Message:
-----------
_ParseCategory(): Do not assume English interface
Modified Paths:
--------------
    trunk/pywikipedia/catlib.py
Modified: trunk/pywikipedia/catlib.py
===================================================================

--- trunk/pywikipedia/catlib.py	2007-08-18 07:35:12 UTC (rev 4062)
+++ trunk/pywikipedia/catlib.py	2007-08-18 08:04:00 UTC (rev 4063)
@@ -177,7 +177,7 @@
                 '<div class\s?=\s?"thumb"\sstyle="[^"]*"><a href=".*?"\s?title\s?=\s?"([^"]*)"')
         ns = self.site().category_namespaces()
         # regular expression matching the "(next 200)" link
-        RLinkToNextPage = re.compile('&amp;from=(.*?)" title="[^"]*">next 200</a>');
+        RLinkToNextPage = re.compile('&amp;from=(.*?)" title="');
         currentPageOffset = startFrom
         while True:
@@ -197,12 +197,15 @@
             # index where subcategory listing begins
             try:
                 ibegin = txt.index('<div id="mw-subcategories">')
+                skippedCategoryDescription = True
             except ValueError:
                 try:
                     ibegin = txt.index('<div id="mw-pages">')
+                    skippedCategoryDescription = True
                 except ValueError:
                     try:
                         ibegin = txt.index('<!-- start content -->') # does not work for cats without text
+                        skippedCategoryDescription = False
                     except ValueError:
                         wikipedia.output("\nCategory page detection is not bug free. Please report this error!")
                         raise
@@ -237,13 +240,19 @@
                 for title in Rimage.findall(txt):
                     yield ARTICLE, wikipedia.Page(self.site(), title)
             # try to find a link to the next list page
-            matchObj = RLinkToNextPage.search(txt)
-            if matchObj:
-                currentPageOffset = matchObj.group(1)
-                wikipedia.output('There are more articles in %s.'
-                                 % self.title())
+            # If skippedCategoryDescription is False, then there are no pages
+            # or subcategories, so there cannot be a next list page
+            if skippedCategoryDescription:
+                matchObj = RLinkToNextPage.search(txt)
+                if matchObj:
+                    currentPageOffset = matchObj.group(1)
+                    wikipedia.output('There are more articles in %s.'
+                                     % self.title())
+                else:
+                    break
             else:
                 break
+            
         # get supercategories
         try:
             ibegin = self_txt.index('<div id="catlinks">')

    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

2008

2007

[Pywikipedia-l] SVN: [4063] trunk/pywikipedia/catlib.py