Revision: 4208
Author: wikipedian
Date: 2007-09-07 13:22:08 +0000 (Fri, 07 Sep 2007)
Log Message:
-----------
improved handling of multiple sites
Modified Paths:
--------------
trunk/pywikipedia/pagegenerators.py
trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py 2007-09-07 13:08:19 UTC (rev 4207)
+++ trunk/pywikipedia/pagegenerators.py 2007-09-07 13:22:08 UTC (rev 4208)
@@ -499,10 +499,16 @@
def preload(self, pages):
try:
- site = pages[0].site()
- # filter out pages that are on other sites
- pages = filter(lambda p: p.site() == site, pages)
- wikipedia.getall(site, pages, throttle=False)
+ while len(pages) > 0:
+ # It might be that the pages are on different sites,
+ # e.g. because the -interwiki parameter was used.
+ # Query the sites one by one.
+ site = pages[0].site()
+ pagesThisSite = [page for page in pages if page.site() == site]
+ pages = [page for page in pages if page.site() != site]
+ wikipedia.getall(site, pagesThisSite, throttle=False)
+ for page in pagesThisSite:
+ yield page
except IndexError:
# Can happen if the pages list is empty. Don't care.
pass
@@ -520,14 +526,12 @@
# We don't want to load too many pages at once using XML export.
# We only get a maximum number at a time.
if len(somePages) >= self.pageNumber:
- self.preload(somePages)
- for refpage in somePages:
+ for refpage in self.preload(somePages):
self.queue.put(refpage)
somePages = []
if somePages:
# preload remaining pages
- self.preload(somePages)
- for refpage in somePages:
+ for refpage in self.preload(somePages):
self.queue.put(refpage)
self.queue.put(None) # to signal end of list
except Exception, e:
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py 2007-09-07 13:08:19 UTC (rev 4207)
+++ trunk/pywikipedia/replace.py 2007-09-07 13:22:08 UTC (rev 4208)
@@ -223,21 +223,21 @@
# Load the page's text from the wiki
original_text = page.get()
if not page.canBeEdited():
- wikipedia.output(u'Skipping locked page %s' % page.title())
+ wikipedia.output(u'Skipping locked page %s' % page.aslink())
continue
except wikipedia.NoPage:
- wikipedia.output(u'Page %s not found' % page.title())
+ wikipedia.output(u'Page %s not found' % page.aslink())
continue
except wikipedia.IsRedirectPage:
original_text = page.get(get_redirect=True)
match = self.checkExceptions(original_text)
# skip all pages that contain certain texts
if match:
- wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match))
+ wikipedia.output(u'Skipping %s because it contains %s' % (page.aslink(), match))
else:
new_text = self.doReplacements(original_text)
if new_text == original_text:
- wikipedia.output('No changes were necessary in %s' % page.title())
+ wikipedia.output('No changes were necessary in %s' % page.aslink())
else:
if self.recursive:
newest_text = self.doReplacements(new_text)