Revision: 5194 Author: russblau Date: 2008-04-08 14:19:22 +0000 (Tue, 08 Apr 2008)
Log Message: ----------- Add missing try/finally block, and reduce lookahead on XML dumps
Modified Paths: -------------- trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/replace.py =================================================================== --- trunk/pywikipedia/replace.py 2008-04-08 13:06:01 UTC (rev 5193) +++ trunk/pywikipedia/replace.py 2008-04-08 14:19:22 UTC (rev 5194) @@ -324,75 +324,78 @@ """ # Run the generator which will yield Pages which might need to be # changed. - for page in self.generator: - if self.isTitleExcepted(page.title()): - wikipedia.output( - u'Skipping %s because the title is on the exceptions list.' - % page.aslink()) - continue - try: - # Load the page's text from the wiki - original_text = page.get() - if not page.canBeEdited(): - wikipedia.output(u"You can't edit page %s" + try: + for page in self.generator: + if self.isTitleExcepted(page.title()): + wikipedia.output( + u'Skipping %s because the title is on the exceptions list.' + % page.aslink()) + continue + try: + # Load the page's text from the wiki + original_text = page.get() + if not page.canBeEdited(): + wikipedia.output(u"You can't edit page %s" + % page.aslink()) + continue + except wikipedia.NoPage: + wikipedia.output(u'Page %s not found' % page.aslink()) + continue + except wikipedia.IsRedirectPage: + original_text = page.get(get_redirect=True) + if self.isTextExcepted(original_text): + wikipedia.output( + u'Skipping %s because it contains text that is on the exceptions list.' + % page.aslink()) + continue + new_text = self.doReplacements(original_text) + if new_text == original_text: + wikipedia.output('No changes were necessary in %s' % page.aslink()) continue - except wikipedia.NoPage: - wikipedia.output(u'Page %s not found' % page.aslink()) - continue - except wikipedia.IsRedirectPage: - original_text = page.get(get_redirect=True) - if self.isTextExcepted(original_text): - wikipedia.output( -u'Skipping %s because it contains text that is on the exceptions list.' - % page.aslink()) - continue - new_text = self.doReplacements(original_text) - if new_text == original_text: - wikipedia.output('No changes were necessary in %s' - % page.aslink()) - continue - if self.recursive: - newest_text = self.doReplacements(new_text) - while (newest_text!=new_text): - new_text = newest_text + if self.recursive: newest_text = self.doReplacements(new_text) + while (newest_text!=new_text): + new_text = newest_text + newest_text = self.doReplacements(new_text)
- if self.addedCat: - cats = page.categories() - if self.addedCat not in cats: - cats.append(self.addedCat) - new_text = wikipedia.replaceCategoryLinks(new_text, - cats) - # Show the title of the page we're working on. - # Highlight the title in purple. - wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" - % page.title()) - wikipedia.showDiff(original_text, new_text) - if not self.acceptall: - choice = wikipedia.inputChoice( - u'Do you want to accept these changes?', - ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N') - if choice in ['a', 'A']: - self.acceptall = True - if choice in ['y', 'Y']: - page.put_async(new_text) - if self.acceptall: - try: - page.put(new_text) - except wikipedia.EditConflict: - wikipedia.output(u'Skipping %s because of edit conflict' - % (page.title(),)) - except wikipedia.SpamfilterError, e: - wikipedia.output( - u'Cannot change %s because of blacklist entry %s' - % (page.title(), e.url)) - except wikipedia.PageNotSaved, error: - wikipedia.output(u'Error putting page: %s' - % (error.args,)) - except wikipedia.LockedPage: - wikipedia.output(u'Skipping %s (locked page)' - % (page.title(),)) + if self.addedCat: + cats = page.categories() + if self.addedCat not in cats: + cats.append(self.addedCat) + new_text = wikipedia.replaceCategoryLinks(new_text, + cats) + # Show the title of the page we're working on. + # Highlight the title in purple. + wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" + % page.title()) + wikipedia.showDiff(original_text, new_text) + if not self.acceptall: + choice = wikipedia.inputChoice( + u'Do you want to accept these changes?', + ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N') + if choice in ['a', 'A']: + self.acceptall = True + if choice in ['y', 'Y']: + page.put_async(new_text) + if self.acceptall: + try: + page.put(new_text) + except wikipedia.EditConflict: + wikipedia.output(u'Skipping %s because of edit conflict' + % (page.title(),)) + except wikipedia.SpamfilterError, e: + wikipedia.output( + u'Cannot change %s because of blacklist entry %s' + % (page.title(), e.url)) + except wikipedia.PageNotSaved, error: + wikipedia.output(u'Error putting page: %s' + % (error.args,)) + except wikipedia.LockedPage: + wikipedia.output(u'Skipping %s (locked page)' + % (page.title(),)) + finally: + self.generator.stop()
def prepareRegexForMySQL(pattern): pattern = pattern.replace('\s', '[:space:]') @@ -648,7 +651,7 @@ # XML parsing can be quite slow, so use smaller batches and # longer lookahead. preloadingGen = pagegenerators.PreloadingGenerator(gen, - pageNumber=20, lookahead=1000) + pageNumber=20, lookahead=100) else: preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60) bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall,