[Pywikipedia-l] SVN: [4770] trunk/pywikipedia/catlib.py
rotem at svn.wikimedia.org
rotem at svn.wikimedia.org
Fri Dec 28 17:12:54 UTC 2007
Revision: 4770
Author: rotem
Date: 2007-12-28 17:12:53 +0000 (Fri, 28 Dec 2007)
Log Message:
-----------
This seems to partially fix the problems.
Modified Paths:
--------------
trunk/pywikipedia/catlib.py
Modified: trunk/pywikipedia/catlib.py
===================================================================
--- trunk/pywikipedia/catlib.py 2007-12-28 16:47:34 UTC (rev 4769)
+++ trunk/pywikipedia/catlib.py 2007-12-28 17:12:53 UTC (rev 4770)
@@ -159,6 +159,7 @@
if self.site().versionnumber() < 4:
Rtitle = re.compile('title\s?=\s?\"([^\"]*)\"')
elif self.site().versionnumber() < 8:
+ # FIXME seems to parse all links
Rtitle = re.compile('/\S*(?: title\s?=\s?)?\"([^\"]*)\"')
else:
Rtitle = re.compile(
@@ -189,35 +190,28 @@
wikipedia.get_throttle()
txt = self.site().getUrl(path)
# index where subcategory listing begins
- try:
- ibegin = txt.index('<div id="mw-subcategories">')
- skippedCategoryDescription = True
- except ValueError:
- try:
+ if self.site().versionnumber() >= 9:
+ # These IDs were introduced in 1.9
+ if '<div id="mw-subcategories">' in txt:
+ ibegin = txt.index('<div id="mw-subcategories">')
+ elif '<div id="mw-pages">' in txt:
ibegin = txt.index('<div id="mw-pages">')
- skippedCategoryDescription = True
- except ValueError:
- if self.site().has_mediawiki_message('category-empty') and self.site().mediawiki_message('category-empty') in txt:
- # No articles or subcategories
- return
- else:
- try:
- ibegin = txt.index('<!-- start content -->') # does not work for cats without text
- # TODO: This parses category text and may think they are
- # pages in category! Check for versions without the message
- # "category-empty".
- skippedCategoryDescription = False
- except ValueError:
- wikipedia.output("\nCategory page detection is not bug free. Please report this error!")
- raise
+ elif '<div id="mw-category-media">' in txt:
+ ibegin = txt.index('<div id="mw-category-media">')
+ else:
+ # No pages
+ return
+ else:
+ ibegin = txt.index('<!-- start content -->') # does not work for cats without text
+ # TODO: This parses category text and may think they are
+ # pages in category! Check for versions before 1.9
# index where article listing ends
- try:
+ if '<div class="printfooter">' in txt:
iend = txt.index('<div class="printfooter">')
- except ValueError:
- try:
- iend = txt.index('<div id="catlinks">')
- except ValueError:
- iend = txt.index('<!-- end content -->')
+ elif '<div class="catlinks">' in txt:
+ iend = txt.index('<div class="catlinks">')
+ else:
+ iend = txt.index('<!-- end content -->')
txt = txt[ibegin:iend]
for title in Rtitle.findall(txt):
if title == self.title():
@@ -244,16 +238,10 @@
# defaultNamespace feature to get everything correctly.
yield ARTICLE, wikipedia.ImagePage(self.site(), title)
# try to find a link to the next list page
- # If skippedCategoryDescription is False, then there are no pages
- # or subcategories, so there cannot be a next list page
- if skippedCategoryDescription:
- matchObj = RLinkToNextPage.search(txt)
- if matchObj:
- currentPageOffset = matchObj.group(1)
- wikipedia.output('There are more articles in %s.'
- % self.title())
- else:
- break
+ matchObj = RLinkToNextPage.search(txt)
+ if matchObj:
+ currentPageOffset = matchObj.group(1)
+ wikipedia.output('There are more articles in %s.' % self.title())
else:
break
More information about the Pywikipedia-l mailing list