[Pywikipedia-l] SVN: [4690] trunk/pywikipedia/replace.py
russblau at svn.wikimedia.org
russblau at svn.wikimedia.org
Mon Dec 10 16:59:07 UTC 2007
Revision: 4690
Author: russblau
Date: 2007-12-10 16:59:01 +0000 (Mon, 10 Dec 2007)
Log Message:
-----------
cleanup in XML generator code
Modified Paths:
--------------
trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py 2007-12-10 16:03:06 UTC (rev 4689)
+++ trunk/pywikipedia/replace.py 2007-12-10 16:59:01 UTC (rev 4690)
@@ -142,22 +142,20 @@
class XmlDumpReplacePageGenerator:
"""
- Generator which will yield Pages to pages that might contain text to
- replace. These pages will be retrieved from a local XML dump file
- (cur table).
+ Iterator that will yield Pages that might contain text to replace.
+
+ These pages will be retrieved from a local XML dump file.
+ Arguments:
+ * xmlFilename - The dump's path, either absolute or relative
+ * replacements - A list of 2-tuples of original text (as a
+ compiled regular expression) and replacement
+ text (as a string).
+ * exceptions - A dictionary which defines when to ignore an
+ occurrence. See the documentation of the ReplaceRobot
+ constructor below.
+
"""
def __init__(self, xmlFilename, replacements, exceptions):
- """
- Arguments:
- * xmlFilename - The dump's path, either absolute or relative
- * replacements - A list of 2-tuples of original text (as a
- compiled regular expression) and replacement
- text (as a string).
- * exceptions - A dictionary which defines when to ignore an
- occurence. See docu of the ReplaceRobot
- constructor below.
- """
-
self.xmlFilename = xmlFilename
self.replacements = replacements
self.exceptions = exceptions
@@ -167,26 +165,34 @@
self.excsInside += self.exceptions['inside-tags']
if self.exceptions.has_key('inside'):
self.excsInside += self.exceptions['inside']
+ import xmlreader
+ self.site = wikipedia.getSite()
+ dump = xmlreader.XmlDump(self.xmlFilename)
+ self.parser = dump.parse()
def __iter__(self):
- import xmlreader
- mysite = wikipedia.getSite()
- dump = xmlreader.XmlDump(self.xmlFilename)
- for entry in dump.parse():
- if not self.isTitleExcepted(entry.title) and not self.isTextExcepted(entry.text):
+ return self
+
+ def next(self):
+ while True:
+ try:
+ entry = self.parser.next()
+ except StopIteration:
+ raise
+ if not self.isTitleExcepted(entry.title) \
+ and not self.isTextExcepted(entry.text):
new_text = entry.text
for old, new in self.replacements:
new_text = wikipedia.replaceExcept(new_text, old, new, self.excsInside)
if new_text != entry.text:
- yield wikipedia.Page(mysite, entry.title)
- break
+ return wikipedia.Page(self.site, entry.title)
def isTitleExcepted(self, title):
if self.exceptions.has_key('title'):
for exc in self.exceptions['title']:
if exc.find(title) > -1:
return True
- False
+ return False
def isTextExcepted(self, text):
if self.exceptions.has_key('text-contains'):
@@ -400,7 +406,8 @@
regex = True
elif arg.startswith('-xml'):
if len(arg) == 4:
- xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
+ xmlFilename = wikipedia.input(
+ u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
elif arg =='-sql':
@@ -547,7 +554,11 @@
sys.exit()
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
- preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 50)
+ if xmlFilename:
+ # XML parsing is slow enough that preloading would make the bot even slower
+ preloadingGen = gen
+ else:
+ preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 50)
bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive)
bot.run()
More information about the Pywikipedia-l
mailing list