[Pywikipedia-l] SVN: [4770] trunk/pywikipedia/catlib.py

rotem at svn.wikimedia.org rotem at svn.wikimedia.org
Fri Dec 28 17:12:54 UTC 2007


Revision: 4770
Author:   rotem
Date:     2007-12-28 17:12:53 +0000 (Fri, 28 Dec 2007)

Log Message:
-----------
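This seems to partially fix the problems with category page parsing: the start of the listing is now chosen by MediaWiki version (the mw-subcategories / mw-pages / mw-category-media div IDs were introduced in 1.9) instead of by nested try/except fallbacks.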

Modified Paths:
--------------
    trunk/pywikipedia/catlib.py

Modified: trunk/pywikipedia/catlib.py
===================================================================
--- trunk/pywikipedia/catlib.py	2007-12-28 16:47:34 UTC (rev 4769)
+++ trunk/pywikipedia/catlib.py	2007-12-28 17:12:53 UTC (rev 4770)
@@ -159,6 +159,7 @@
         if self.site().versionnumber() < 4:
             Rtitle = re.compile('title\s?=\s?\"([^\"]*)\"')
         elif self.site().versionnumber() < 8:
+            # FIXME seems to parse all links
             Rtitle = re.compile('/\S*(?: title\s?=\s?)?\"([^\"]*)\"')
         else:
             Rtitle = re.compile(
@@ -189,35 +190,28 @@
             wikipedia.get_throttle()
             txt = self.site().getUrl(path)
             # index where subcategory listing begins
-            try:
-                ibegin = txt.index('<div id="mw-subcategories">')
-                skippedCategoryDescription = True
-            except ValueError:
-                try:
+            if self.site().versionnumber() >= 9:
+                # These IDs were introduced in 1.9
+                if '<div id="mw-subcategories">' in txt:
+                    ibegin = txt.index('<div id="mw-subcategories">')
+                elif '<div id="mw-pages">' in txt:
                     ibegin = txt.index('<div id="mw-pages">')
-                    skippedCategoryDescription = True
-                except ValueError:
-                    if self.site().has_mediawiki_message('category-empty') and self.site().mediawiki_message('category-empty') in txt:
-                        # No articles or subcategories
-                        return
-                    else:
-                        try:
-                            ibegin = txt.index('<!-- start content -->') # does not work for cats without text
-                            # TODO: This parses category text and may think they are
-                            # pages in category! Check for versions without the message
-                            # "category-empty".
-                            skippedCategoryDescription = False
-                        except ValueError:
-                            wikipedia.output("\nCategory page detection is not bug free. Please report this error!")
-                            raise
+                elif '<div id="mw-category-media">' in txt:
+                    ibegin = txt.index('<div id="mw-category-media">')
+                else:
+                    # No pages
+                    return
+            else:
+                ibegin = txt.index('<!-- start content -->') # does not work for cats without text
+                # TODO: This parses category text and may think they are
+                # pages in category! Check for versions before 1.9
             # index where article listing ends
-            try:
+            if '<div class="printfooter">' in txt:
                 iend = txt.index('<div class="printfooter">')
-            except ValueError:
-                try:
-                    iend = txt.index('<div id="catlinks">')
-                except ValueError:
-                    iend = txt.index('<!-- end content -->')
+            elif '<div class="catlinks">' in txt:
+                iend = txt.index('<div class="catlinks">')
+            else:
+                iend = txt.index('<!-- end content -->')
             txt = txt[ibegin:iend]
             for title in Rtitle.findall(txt):
                 if title == self.title():
@@ -244,16 +238,10 @@
                     # defaultNamespace feature to get everything correctly.
                     yield ARTICLE, wikipedia.ImagePage(self.site(), title)
             # try to find a link to the next list page
-            # If skippedCategoryDescription is False, then there are no pages
-            # or subcategories, so there cannot be a next list page
-            if skippedCategoryDescription:
-                matchObj = RLinkToNextPage.search(txt)
-                if matchObj:
-                    currentPageOffset = matchObj.group(1)
-                    wikipedia.output('There are more articles in %s.'
-                                     % self.title())
-                else:
-                    break
+            matchObj = RLinkToNextPage.search(txt)
+            if matchObj:
+                currentPageOffset = matchObj.group(1)
+                wikipedia.output('There are more articles in %s.' % self.title())
             else:
                 break
 





More information about the Pywikipedia-l mailing list