[Pywikipedia-l] SVN: [4741] trunk/pywikipedia/redirect.py

russblau at svn.wikimedia.org russblau at svn.wikimedia.org
Thu Dec 20 18:38:39 UTC 2007


Revision: 4741
Author:   russblau
Date:     2007-12-20 18:38:39 +0000 (Thu, 20 Dec 2007)

Log Message:
-----------
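Fix bugs 1855044 and 1855071 (the dump reader now filters entries against a collection of namespaces derived from each page title, and redirect targets with an invalid title are caught and reported)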

Modified Paths:
--------------
    trunk/pywikipedia/redirect.py

Modified: trunk/pywikipedia/redirect.py
===================================================================
--- trunk/pywikipedia/redirect.py	2007-12-20 17:09:58 UTC (rev 4740)
+++ trunk/pywikipedia/redirect.py	2007-12-20 18:38:39 UTC (rev 4741)
@@ -95,9 +95,9 @@
 }
 
 class RedirectGenerator:
-    def __init__(self, xmlFilename = None, namespace = None, offset = -1):
+    def __init__(self, xmlFilename = None, namespaces = None, offset = -1):
         self.xmlFilename = xmlFilename
-        self.namespace = namespace
+        self.namespaces = namespaces
         self.offset = offset
 
     def get_redirects_from_dump(self, alsoGetPageTitles = False):
@@ -111,7 +111,8 @@
         dict = {}
         # open xml dump and read page titles out of it
         dump = xmlreader.XmlDump(xmlFilename)
-        redirR = wikipedia.getSite().redirectRegex()
+        site = wikipedia.getSite()
+        redirR = site.redirectRegex()
         readPagesCount = 0
         if alsoGetPageTitles:
             pageTitles = set()
@@ -120,8 +121,9 @@
             # always print status message after 10000 pages
             if readPagesCount % 10000 == 0:
                 wikipedia.output(u'%i pages read...' % readPagesCount)
-            if self.namespace and self.namespace != entry.namespace:
-                continue
+            if self.namespaces is not None:
+                if wikipedia.Page(site, entry.title).namespace() not in self.namespaces:
+                    continue
             if alsoGetPageTitles:
                 pageTitles.add(entry.title.replace(' ', '_'))
 
@@ -129,7 +131,7 @@
             if m:
                 target = m.group(1)
                 # There might be redirects to another wiki. Ignore these.
-                for code in wikipedia.getSite().family.langs.keys():
+                for code in site.family.langs.keys():
                     if target.startswith('%s:' % code) or target.startswith(':%s:' % code):
                         wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code))
                         target = None
@@ -279,6 +281,11 @@
                     wikipedia.output(
                         u'Redirect target %s is not a redirect.'
                           % secondRedir.aslink())
+                except wikipedia.BadTitle, e:
+                    # str(e) is in the format 'BadTitle: [[Foo]]'
+                    wikipedia.output(
+                        u'Redirect target %s is not a valid page title.'
+                          % str(e)[10:])
                 except wikipedia.NoPage:
                     wikipedia.output(
                         u'Redirect target %s doesn\'t exist.'





More information about the Pywikipedia-l mailing list