Revision: 6692
Author: nicdumz
Date: 2009-04-24 06:26:16 +0000 (Fri, 24 Apr 2009)
Log Message:
-----------
Adding a PageTree structure to manipulate Page sets instead of using lists.
This performs more efficiently when filtering these sets per Site.
Modified Paths:
--------------
trunk/pywikipedia/interwiki.py
Modified: trunk/pywikipedia/interwiki.py
===================================================================
--- trunk/pywikipedia/interwiki.py 2009-04-24 05:38:49 UTC (rev 6691)
+++ trunk/pywikipedia/interwiki.py 2009-04-24 06:26:16 UTC (rev 6692)
@@ -502,6 +502,51 @@
nobackonly = False
hintsareright = False
+class PageTree(object):
+    """
+    Structure to manipulate a set of pages.
+    Allows filtering efficiently by Site.
+
+    self.tree maps each Site to a dictionary whose keys are the pages
+    stored for that Site; self.size is the total number of stored pages.
+    """
+    def __init__(self):
+        self.tree = {}
+        self.size = 0
+
+    def filter(self, site):
+        """
+        Iterates over pages that are in Site site.
+        Yields nothing when no page of that Site is stored.
+        """
+        for page in self.tree.get(site, ()):
+            yield page
+
+    def __len__(self):
+        """Returns the number of pages stored, all Sites included."""
+        return self.size
+
+    def add(self, page):
+        """
+        Stores page under the Site returned by page.site().
+        Adding a page that is already stored changes nothing.
+        """
+        pages = self.tree.setdefault(page.site(), {})
+        # Only count pages that are really new, so that len() stays equal
+        # to the number of pages that iteration yields.
+        if page not in pages:
+            pages[page] = True
+            self.size += 1
+
+    def remove(self, page):
+        """
+        Removes page; raises KeyError if page is not stored.
+        """
+        site = page.site()
+        pages = self.tree[site]
+        del pages[page]
+        self.size -= 1
+        # Drop Sites left without pages: siteCounts() must not report them.
+        if not pages:
+            del self.tree[site]
+
+    def siteCounts(self):
+        """
+        Yields (Site, number of pages in site) pairs
+        """
+        for site, pages in self.tree.items():
+            yield site, len(pages)
+
+    def __iter__(self):
+        """Iterates over every stored page, Site after Site."""
+        for pages in self.tree.values():
+            for page in pages:
+                yield page
+
class Subject(object):
"""
Class to follow the progress of a single 'subject' (i.e. a page with
@@ -515,10 +560,12 @@
self.originPage = originPage
# todo is a list of all pages that still need to be analyzed.
# Mark the origin page as todo.
- self.todo = [originPage]
+ self.todo = PageTree()
+ self.todo.add(originPage)
+
# done is a list of all pages that have been analyzed and that
# are known to belong to this subject.
- self.done = []
+ self.done = PageTree()
# foundIn is a dictionary where pages are keys and lists of
# pages are values. It stores where we found each page.
# As we haven't yet found a page that links to the origin page, we
@@ -526,7 +573,7 @@
self.foundIn = {self.originPage:[]}
# This is a list of all pages that are currently scheduled for
# download.
- self.pending = []
+ self.pending = PageTree()
if globalvar.hintsareright:
# This is a set of sites that we got hits to
self.hintedsites = set()
@@ -544,8 +591,8 @@
first one will be returned.
Otherwise, None will be returned.
"""
- for page in self.done + self.pending:
- if page.site() == site:
+ for tree in [self.done, self.pending]:
+ for page in tree.filter(site):
if page.exists() and page.isDisambig():
return page
return None
@@ -557,8 +604,8 @@
first one will be returned.
Otherwise, None will be returned.
"""
- for page in self.done + self.pending:
- if page.site() == site:
+ for tree in [self.done, self.pending]:
+ for page in tree.filter(site):
if page.exists() and not page.isDisambig() and not page.isRedirectPage():
return page
return None
@@ -570,8 +617,8 @@
have been found, the first one will be returned.
Otherwise, None will be returned.
"""
- for page in self.done + self.pending + self.todo:
- if page.site() == site:
+ for tree in [self.done, self.pending, self.todo]:
+ for page in tree.filter(site):
if page.namespace() == self.originPage.namespace():
if page.exists() and not page.isRedirectPage():
return page
@@ -590,7 +637,7 @@
pages = titletranslate.translate(self.originPage, hints = hints, auto = globalvar.auto, removebrackets
= globalvar.hintnobracket)
for page in pages:
- self.todo.append(page)
+ self.todo.add(page)
self.foundIn[page] = [None]
if keephintedsites:
self.hintedsites.add(page.site)
@@ -603,12 +650,8 @@
"""
siteCount = {}
- for page in self.todo:
- site = page.site()
- try:
- siteCount[site] += 1
- except KeyError:
- siteCount[site] = 1
+ for site, count in self.todo.siteCounts():
+ siteCount[site] = count
return siteCount
def willWorkOn(self, site):
@@ -619,24 +662,25 @@
"""
# Bug-check: Isn't there any work still in progress? We can't work on
# different sites at a time!
- if self.pending != []:
+ if len(self.pending) > 0:
raise 'BUG: Can\'t start to work on %s; still working on %s' % (site, self.pending)
# Prepare a list of suitable pages
- for page in self.todo:
- if page.site() == site:
- self.pending.append(page)
- for page in self.pending:
+ result = []
+ for page in self.todo.filter(site):
+ self.pending.add(page)
+ result.append(page)
+ for page in self.pending.filter(site):
self.todo.remove(page)
# If there are any, return them. Otherwise, nothing is in progress.
- return self.pending
+ return result
def makeForcedStop(self,counter):
"""
Ends work on the page before the normal end.
"""
- for page in self.todo:
- counter.minus(page.site())
- self.todo = []
+ for site, count in self.todo.siteCounts():
+ counter.minus(site, count)
+ self.todo = PageTree()
self.forcedStop = True
def addIfNew(self, page, counter, linkingPage):
@@ -662,7 +706,7 @@
return False
else:
self.foundIn[page] = [linkingPage]
- self.todo.append(page)
+ self.todo.add(page)
counter.plus(page.site())
return True
@@ -828,7 +872,7 @@
# Loop over all the pages that should have been taken care of
for page in self.pending:
# Mark the page as done
- self.done.append(page)
+ self.done.add(page)
# make sure that none of the linked items is an auto item
if globalvar.skipauto:
@@ -852,15 +896,21 @@
if page == self.originPage:
if globalvar.initialredirect:
self.originPage = redirectTargetPage
- self.pending.append(redirectTargetPage)
+ #XXX might not work if page.site != redirTar.site:
+ # We are appending an item to
+ # self.pending[redirTar.site]
+ # but we are iterating on self.pending at the same
+ # time.
+ # On the other hand... crosslanguage redirects?
+ self.pending.add(redirectTargetPage)
counter.plus(redirectTargetPage.site)
else:
# This is a redirect page to the origin. We don't need to
# follow the redirection.
# In this case we can also stop all hints!
- for page2 in self.todo:
- counter.minus(page2.site())
- self.todo = []
+ for site, count in self.todo.siteCounts():
+ counter.minus(site, count)
+ self.todo = PageTree()
elif not globalvar.followredirect:
wikipedia.output(u"NOTE: not following redirects.")
else:
@@ -873,10 +923,11 @@
if page == self.originPage:
# The page we are working on is the page that does not exist.
# No use in doing any work on it in that case.
- for page2 in self.todo:
- counter.minus(page2.site())
- self.todo = []
- self.done = [] # In some rare cases it might be we already did check some 'automatic' links
+ for site, count in self.todo.siteCounts():
+ counter.minus(site, count)
+ self.todo = PageTree()
+ # In some rare cases it might be we already did check some 'automatic' links
+ self.done = PageTree()
pass
except wikipedia.NoSuchSite:
wikipedia.output(u"NOTE: site %s does not exist" % page.site())
@@ -886,8 +937,7 @@
(skip, alternativePage) = self.disambigMismatch(page, counter)
if skip:
wikipedia.output(u"NOTE: ignoring %s and its interwiki links" % page.aslink(True))
- if page in self.done: #XXX: Ugly bugfix - the following line has reportedly thrown "ValueError: list.remove(x): x not in list"
- self.done.remove(page)
+ self.done = PageTree()
iw = ()
if alternativePage:
# add the page that was entered by the user
@@ -898,11 +948,14 @@
if globalvar.untranslatedonly:
# Ignore the interwiki links.
iw = ()
- elif globalvar.autonomous and page.site() in [p.site() for p in self.done if p != page and p.exists() and not p.isRedirectPage()]:
+ # FIXME: the filtered list generated in the condition is
+ # re-generated on the line after.
+ # And we only use the first item of that list.
+ elif globalvar.autonomous and [p for p in self.done.filter(page.site()) if p != page and p.exists() and not p.isRedirectPage()]:
- for p in self.done:
- if p.site() == page.site() and p != page \
- and p.exists() and not p.isRedirectPage():
+ for p in self.done.filter(page.site()):
+ if p != page and p.exists() and \
+ not p.isRedirectPage():
otherpage = p
break
wikipedia.output(u"Stopping work on %s because duplicate pages %s and %s are found"%(self.originPage.aslink(),otherpage.aslink(True),page.aslink(True)))
@@ -925,8 +978,7 @@
elif page.isEmpty() and not page.isCategory():
wikipedia.output(u"NOTE: %s is empty; ignoring it and its interwiki links" % page.aslink(True))
# Ignore the interwiki links
- if page in self.done: #XXX: Ugly bugfix - the following line has reportedly thrown "ValueError: list.remove(x): x not in list"
- self.done.remove(page)
+ self.done = PageTree()
iw = ()
for linkedPage in iw:
if globalvar.hintsareright:
@@ -950,7 +1002,7 @@
wikipedia.output(u"%s: %s gives new interwiki %s"% (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True)))
# These pages are no longer 'in progress'
- self.pending = []
+ self.pending = PageTree()
# Check whether we need hints and the user offered to give them
if self.untranslated and not self.hintsAsked:
self.reportInterwikilessPage(page)
@@ -1581,9 +1633,9 @@
except KeyError:
self.counts[site] = count
- def minus(self, site):
+ def minus(self, site, count=1):
"""This is a routine that the Subject class expects in a counter"""
- self.counts[site] -= 1
+ self.counts[site] -= count
def run(self):
"""Start the process until finished"""