Revision: 7697
Author:   alexsh
Date:     2009-11-26 09:33:45 +0000 (Thu, 26 Nov 2009)
Log Message:
-----------
Separate getall() into batches for huge retrievals (only applies when the getAll page count exceeds config.special_page_limit)
Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py
Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py	2009-11-26 09:29:40 UTC (rev 7696)
+++ trunk/pywikipedia/wikipedia.py	2009-11-26 09:33:45 UTC (rev 7697)
@@ -3848,7 +3848,25 @@
     # TODO: why isn't this a Site method?
     pages = list(pages) # if pages is an iterator, we need to make it a list
     output(u'Getting %d pages from %s...' % (len(pages), site))
-    _GetAll(site, pages, throttle, force).run()
+    limit = config.special_page_limit / 4 # default is 500/4, but It might have good point for server.
+
+    if len(pages) > limit:
+        # separate export pages for bulk-retrieve
+
+        for pagg in range(0, len(pages), limit):
+            if pagg == range(0, len(pages), limit)[-1]: #latest retrieve
+                k = pages[pagg:]
+                output(u'Getting pages %d - %d of %d...' % (pagg + 1, len(pages), len(pages)))
+                _GetAll(site, k, throttle, force).run()
+                pages[pagg:] = k
+            else:
+                k = pages[pagg:pagg + limit]
+                output(u'Getting pages %d - %d of %d...' % (pagg + 1, pagg + limit, len(pages)))
+                _GetAll(site, k, throttle, force).run()
+                pages[pagg:pagg + limit] = k
+            get_throttle(requestsize = len(pages) / 10) # one time to retrieve is 7.7 sec.
+    else:
+        _GetAll(site, pages, throttle, force).run()
# Library functions
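For readers skimming the change: the new code walks the page list in fixed-size slices, runs _GetAll on each slice, and throttles between slices. Below is a minimal, self-contained sketch of that batching pattern in plain Python 3, not the committed code itself: fetch_batch and throttle are hypothetical stand-ins for _GetAll(...).run() and get_throttle(requestsize=...), and the batch size is an arbitrary example rather than config.special_page_limit / 4.

def batched(seq, size):
    """Yield (start_index, slice) pairs covering seq in chunks of at most `size` items."""
    for start in range(0, len(seq), size):
        yield start, seq[start:start + size]


def get_all_in_batches(pages, limit, fetch_batch, throttle):
    """Fetch `pages` in chunks of at most `limit` items, throttling between chunks.

    fetch_batch(chunk) and throttle(requestsize) are caller-supplied callbacks;
    in rev 7697 the real equivalents are _GetAll(site, chunk, ...).run() and
    get_throttle(requestsize=len(pages) / 10).
    """
    if len(pages) <= limit:
        # Small request: a single bulk retrieval is enough.
        fetch_batch(pages)
        return
    for start, chunk in batched(pages, limit):
        print('Getting pages %d - %d of %d...'
              % (start + 1, start + len(chunk), len(pages)))
        fetch_batch(chunk)
        # Back off between bulk requests so the server is not hammered.
        throttle(len(pages) // 10)


if __name__ == '__main__':
    # Toy demonstration: 11 "pages", batches of at most 4.
    demo_pages = ['Page%d' % i for i in range(11)]

    def fake_fetch(chunk):
        print('  fetched %s' % chunk)

    get_all_in_batches(demo_pages, limit=4,
                       fetch_batch=fake_fetch,
                       throttle=lambda requestsize: None)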