Revision: 6810
Author:   nicdumz
Date:     2009-05-04 01:14:36 +0000 (Mon, 04 May 2009)

Log Message:
-----------
Merging changes from trunk

Modified Paths:
--------------
    branches/rewrite/pywikibot/config2.py
    branches/rewrite/scripts/interwiki.py

Property Changed:
----------------
    branches/rewrite/pywikibot/config2.py
    branches/rewrite/scripts/interwiki.py
Modified: branches/rewrite/pywikibot/config2.py
===================================================================
--- branches/rewrite/pywikibot/config2.py	2009-05-03 18:59:43 UTC (rev 6809)
+++ branches/rewrite/pywikibot/config2.py	2009-05-04 01:14:36 UTC (rev 6810)
@@ -271,6 +271,11 @@
 # Save file with local articles without interwikis.
 without_interwiki = False
 
+# Experimental feature:
+# Store the page contents on disk (/cache/ directory) instead of loading
+# them in RAM.
+interwiki_contents_on_disk = False
+
 ############## SOLVE_DISAMBIGUATION SETTINGS ############
 #
 # Set disambiguation_comment[FAMILY][LANG] to a non-empty string to override
@@ -482,22 +487,18 @@
                                      % {'fn' :_filename}
 
 # Test for obsoleted and/or unknown variables.
-for _key in globals().keys():
-    if _key[0]=='_':
+for _key, _val in globals().items():
+    if _key.startswith('_'):
         pass
     elif _key in _gl:
-        nt=type(globals()[_key])
-        ot=_tp[_key]
-        if nt==ot or nt==type(None) or ot==type(None):
+        nt = type(_val)
+        ot = _tp[_key]
+        if nt == ot or _val is None or ot == type(None):
             pass
-        elif nt==type(1) and ot==type(1.0):
+        elif nt is int and (ot is float or ot is bool):
            pass
-        elif ot==type(1) and nt==type(1.0):
+        elif ot is int and (nt is float or nt is bool):
            pass
-        elif nt==type(1) and ot==type(True):
-           pass
-        elif ot==type(1) and nt==type(True):
-           pass
         else:
            print "WARNING: Type of '%(_key)s' changed" % locals()
            print "         %(was)s: %(old)s" % {'was': "Was", 'old': ot}
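[Editor's note: the compatibility rule implemented by the rewritten check above can be summarized as a small standalone helper. This is only an illustration; the name _is_compatible and its use as a function are not part of the patch.]

    def _is_compatible(old_type, new_value):
        """Return True when a config value's type change is harmless:
        either side is None/NoneType, or the change is only between
        int and float/bool."""
        new_type = type(new_value)
        if new_type is old_type or new_value is None or old_type is type(None):
            return True
        if new_type is int and old_type in (float, bool):
            return True
        if old_type is int and new_type in (float, bool):
            return True
        return False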
Property changes on: branches/rewrite/pywikibot/config2.py
___________________________________________________________________
Modified: svn:mergeinfo
   - 
   + /trunk/pywikipedia/config.py:6734-6801
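[Editor's note: to try the new experimental option, it should be enough to override the default in user-config.py like any other config variable (assuming the usual pywikipedia user-config.py mechanism); the shelve files then land under the cache/ data directory.]

    # user-config.py (sketch)
    interwiki_contents_on_disk = True   # keep fetched page texts in cache/ shelve files, not in RAM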
Modified: branches/rewrite/scripts/interwiki.py
===================================================================
--- branches/rewrite/scripts/interwiki.py	2009-05-03 18:59:43 UTC (rev 6809)
+++ branches/rewrite/scripts/interwiki.py	2009-05-04 01:14:36 UTC (rev 6810)
@@ -211,7 +211,7 @@
     -localonly     only work on the local wiki, not on other wikis in the
                    family I have a login at. (note: without ending colon)
 
-    -limittwo      only update two pages - one in the local wiki (if loged-in),
+    -limittwo      only update two pages - one in the local wiki (if logged-in)
                    and one in the top available one.
                    For example, if the local page has links to de and fr,
                    this option will make sure that only local and de: (larger)
@@ -270,13 +270,14 @@
 # (C) Rob W.W. Hooft, 2003
 # (C) Daniel Herding, 2004
 # (C) Yuri Astrakhan, 2005-2006
+# (C) Pywikipedia bot team, 2007-2009
 #
 # Distributed under the terms of the MIT license.
 #
 __version__ = '$Id$'
 #
 
-import sys, copy, re
+import sys, copy, re, os
 import time
 import codecs
 import socket
@@ -309,6 +310,7 @@
 
 import wikipedia, config, pagegenerators, catlib
 import titletranslate, interwiki_graph
+import webbrowser
 
 docuReplacements = {
     '&pagegenerators_help;': pagegenerators.parameterHelp
@@ -461,6 +463,7 @@
     'uz': (u'Bot', u'Qoʻshdi', u'Tuzatdi', u'Oʻchirdi'),
     'vi': (u'robot ', u'Thêm', u'Dời', u'Thay'),
     'vo': (u'bot ', u'läükon', u'moükon', u'votükon'),
+    'war':(u'robot ', u'Gindugngan', u'Gintanggal', u'Ginliwat'),
     'yi': (u'באט ', u'צוגעלייגט', u'אראפגענומען', u'געענדערט'),
     'yue': (u'機械人 ', u'加', u'減', u'改'),
     'zh': (u'機器人 ', u'新增', u'移除', u'修改'),
@@ -495,19 +498,107 @@
     strictlimittwo = False
     needlimit = 0
     ignore = []
-    bracketonly = False
+    parenthesesonly = False
     rememberno = False
     followinterwiki = True
     minsubjects = config.interwiki_min_subjects
     nobackonly = False
     hintsareright = False
+    contentsondisk = config.interwiki_contents_on_disk
 
+class StoredPage(wikipedia.Page):
+    """
+    Store the Page contents on disk to avoid sucking too much
+    memory when a big number of Page objects will be loaded
+    at the same time.
+    """
+
+    # Please prefix the class members names by SP
+    # to avoid possible name clashes with wikipedia.Page
+
+    # path to the shelve
+    SPpath = None
+    # shelve
+    SPstore = None
+
+    # attributes created by wikipedia.Page.__init__
+    SPcopy = [ '_editrestriction',
+               '_site',
+               '_namespace',
+               '_section',
+               '_title',
+               'editRestriction',
+               'moveRestriction',
+               '_permalink',
+               '_userName',
+               '_ipedit',
+               '_editTime',
+               '_startTime',
+               '_revisionId',
+               '_deletedRevs' ]
+
+    def SPdeleteStore():
+        if StoredPage.SPpath:
+            del StoredPage.SPstore
+            os.unlink(StoredPage.SPpath)
+    SPdeleteStore = staticmethod(SPdeleteStore)
+
+    def __init__(self, page):
+        for attr in StoredPage.SPcopy:
+            setattr(self, attr, getattr(page, attr))
+
+        if not StoredPage.SPpath:
+            import shelve
+            index = 1
+            while True:
+                path = config.datafilepath('cache', 'pagestore' + str(index))
+                if not os.path.exists(path): break
+                index += 1
+            StoredPage.SPpath = path
+            StoredPage.SPstore = shelve.open(path)
+
+        self.SPkey = self.aslink().encode('utf-8')
+        self.SPcontentSet = False
+
+    def SPgetContents(self):
+        return StoredPage.SPstore[self.SPkey]
+
+    def SPsetContents(self, contents):
+        self.SPcontentSet = True
+        StoredPage.SPstore[self.SPkey] = contents
+
+    def SPdelContents(self):
+        if self.SPcontentSet:
+            del StoredPage.SPstore[self.SPkey]
+
+    _contents = property(SPgetContents, SPsetContents, SPdelContents)
+
 class PageTree(object):
     """
     Structure to manipulate a set of pages.
     Allows filtering efficiently by Site.
     """
     def __init__(self):
+        # self.tree :
+        #   Dictionary:
+        #    keys: Site
+        #    values: list of pages
+        #    All pages found within Site are kept in
+        #    self.tree[site]
+
+        # While using dict values would be faster for
+        # the remove() operation,
+        # keeping list values is important, because
+        # the order in which the pages were found matters:
+        # the earlier a page is found, the closer it is to the
+        # Subject.originPage. Chances are that pages found within
+        # 2 interwiki distance from the originPage are more related
+        # to the original topic than pages found later on, after
+        # 3, 4, 5 or more interwiki hops.
+
+        # Keeping this order is hence important to display ordered
+        # list of pages to the user when he'll be asked to resolve
+        # conflicts.
         self.tree = {}
         self.size = 0
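[Editor's note: the core trick of StoredPage is to turn the _contents attribute into a property backed by a shelve file instead of an in-memory attribute. A minimal, self-contained illustration of that pattern follows; the class and file names here are invented for the example and are not taken from the patch.]

    import os, shelve, tempfile

    class DiskBackedText(object):
        """Keep a potentially large 'contents' attribute in a shelve file
        instead of RAM, keyed by a unique string per object."""
        _store = shelve.open(os.path.join(tempfile.gettempdir(), 'pagestore-demo'))

        def __init__(self, key):
            self._key = key

        def _get(self):
            return DiskBackedText._store[self._key]

        def _set(self, value):
            DiskBackedText._store[self._key] = value

        def _del(self):
            del DiskBackedText._store[self._key]

        contents = property(_get, _set, _del)

    page = DiskBackedText('en:Example')
    page.contents = u'Some very large wikitext ...'   # written to disk, not kept in RAM
    print page.contents                               # read back transparently
    del page.contents                                 # removes the disk record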
@@ -527,15 +618,15 @@
     def add(self, page):
         site = page.site()
         if not site in self.tree:
-            self.tree[site] = {}
-        self.tree[site][page] = True
+            self.tree[site] = []
+        self.tree[site].append(page)
         self.size += 1
 
     def remove(self, page):
         try:
-            del self.tree[page.site()][page]
+            self.tree[page.site()].remove(page)
             self.size -= 1
-        except KeyError:
+        except ValueError:
             pass
 
     def removeSite(self, site):
@@ -556,19 +647,76 @@
             yield site, len(d)
 
     def __iter__(self):
-        for site, d in self.tree.iteritems():
-            for page in d:
+        for site, plist in self.tree.iteritems():
+            for page in plist:
                 yield page
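[Editor's note: the dict-of-lists layout that replaces the old dict-of-dicts can be illustrated in isolation; this is only a sketch, the real PageTree also tracks self.size and takes the site from page.site().]

    class SiteTree(object):
        """Pages grouped by site; insertion order inside each site is kept,
        so pages found closer to the origin page stay first."""
        def __init__(self):
            self.tree = {}

        def add(self, page, site):
            self.tree.setdefault(site, []).append(page)

        def remove(self, page, site):
            try:
                self.tree[site].remove(page)   # list.remove raises ValueError, not KeyError
            except (KeyError, ValueError):
                pass

        def __iter__(self):
            for site, pages in self.tree.iteritems():
                for page in pages:
                    yield page

Using lists makes remove() an O(n) scan, but it preserves the discovery order that the new comments above rely on when conflicts are later shown to the user.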
 class Subject(object):
     """
     Class to follow the progress of a single 'subject' (i.e. a page with
     all its translations)
+
+
+    Subject is a transitive closure of the binary relation on Page:
+    "has_a_langlink_pointing_to".
+
+    A formal way to compute that closure would be:
+
+    With P a set of pages, NL ('NextLevel') a function on sets defined as:
+        NL(P) = { target | ∃ source ∈ P, target ∈ source.langlinks() }
+    pseudocode:
+        todo <- [originPage]
+        done <- []
+        while todo != []:
+            pending <- todo
+            todo <- NL(pending) / done
+            done <- NL(pending) U done
+        return done
+
+
+    There is, however, one limitation that is induced by implementation:
+    to compute efficiently NL(P), one has to load the page contents of
+    pages in P.
+    (Not only the langlinks have to be parsed from each Page, but we also
+    want to know if the Page is a redirect, a disambiguation, etc...)
+
+    Because of this, the pages in pending have to be preloaded.
+    However, because the pages in pending are likely to be in several sites
+    we cannot "just" preload them as a batch.
+
+    Instead of doing "pending <- todo" at each iteration, we have to elect a
+    Site, and we put in pending all the pages from todo that belong to that
+    Site:
+
+    Code becomes:
+        todo <- {originPage.site():[originPage]}
+        done <- []
+        while todo != {}:
+            site <- electSite()
+            pending <- todo[site]
+
+            preloadpages(site, pending)
+
+            todo[site] <- NL(pending) / done
+            done <- NL(pending) U done
+        return done
+
+
+    Subject objects only operate on pages that should have been preloaded
+    before. In fact, at any time:
+    * todo contains new Pages that have not been loaded yet
+    * done contains Pages that have been loaded, and that have been treated.
+    * If batch preloadings are successful, Page._get() is never called from
+      this Object.
     """
     def __init__(self, originPage, hints = None):
         """Constructor. Takes as arguments the Page on the home wiki
            plus optionally a list of hints for translation"""
+
+        if globalvar.contentsondisk:
+            originPage = StoredPage(originPage)
+
         # Remember the "origin page"
         self.originPage = originPage
         # todo is a list of all pages that still need to be analyzed.
@@ -650,6 +798,8 @@
             pages = titletranslate.translate(self.originPage, hints = hints,
                        auto = globalvar.auto, removebrackets = globalvar.hintnobracket)
             for page in pages:
+                if globalvar.contentsondisk:
+                    page = StoredPage(page)
                 self.todo.add(page)
                 self.foundIn[page] = [None]
                 if keephintedsites:
@@ -663,11 +813,12 @@
         """
         return self.todo.siteCounts()
 
-    def willWorkOn(self, site):
+    def whatsNextPageBatch(self, site):
         """
         By calling this method, you 'promise' this instance that you will
-        work on any todo items for the wiki indicated by 'site'. This routine
-        will return a list of pages that can be treated.
+        preload all the 'site' Pages that are in the todo list.
+
+        This routine will return a list of pages that can be treated.
         """
         # Bug-check: Isn't there any work still in progress? We can't work on
         # different sites at a time!
@@ -680,6 +831,7 @@
                 result.append(page)
 
         self.todo.removeSite(site)
+        # If there are any, return them. Otherwise, nothing is in progress.
         return result
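[Editor's note: the site-batched closure that the new Subject docstring describes, and that whatsNextPageBatch()/batchLoaded() implement incrementally, can be sketched as a single plain function. Illustrative only; langlinks and preload stand in for the real pywikipedia calls.]

    def interwiki_closure(originPage, langlinks, preload):
        """Transitive closure of 'has a langlink pointing to', loading
        pages one site at a time so each batch can be preloaded at once."""
        todo = {originPage.site(): [originPage]}
        done = set()
        while todo:
            # elect the site with the most pending work
            site = max(todo, key=lambda s: len(todo[s]))
            pending = todo.pop(site)
            preload(site, pending)            # batch-load contents for this site
            done.update(pending)
            for source in pending:
                for target in langlinks(source):
                    if target in done:
                        continue
                    queue = todo.setdefault(target.site(), [])
                    if target not in queue:
                        queue.append(target)
        return done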
@@ -709,11 +861,15 @@
                 wikipedia.output("%s has a backlink from %s."%(page,linkingPage))
                 self.makeForcedStop(counter)
                 return False
+
+
         if page in self.foundIn:
             # not new
             self.foundIn[page].append(linkingPage)
             return False
         else:
+            if globalvar.contentsondisk:
+                page = StoredPage(page)
             self.foundIn[page] = [linkingPage]
             self.todo.add(page)
             counter.plus(page.site())
@@ -876,10 +1032,14 @@
                 if globalvar.hintsareright:
                     self.hintedsites.add(page.site)
 
-    def workDone(self, counter):
+    def batchLoaded(self, counter):
         """
-        This is called by a worker to tell us that the promised work
-        was completed as far as possible. The only argument is an instance
+        This is called by a worker to tell us that the promised batch of
+        pages was loaded.
+        In other words, all the pages in self.pending have already
+        been preloaded.
+
+        The only argument is an instance
         of a counter class, that has methods minus() and plus() to keep
         counts of the total work todo.
         """
@@ -913,10 +1073,18 @@
                     continue
 
             elif page.isRedirectPage():
-                redirectTargetPage = page.getRedirectTarget()
+                try:
+                    redirectTargetPage = page.getRedirectTarget()
+                except wikipedia.InvalidTitle:
+                    # MW considers #redirect [[en:#foo]] as a redirect page,
+                    # but we can't do anything useful with such pages
+                    wikipedia.output(u"NOTE: %s redirects to an invalid title" % page.aslink(True))
+                    continue
                 wikipedia.output(u"NOTE: %s is redirect to %s" % (page.aslink(True), redirectTargetPage.aslink(True)))
                 if page == self.originPage:
                     if globalvar.initialredirect:
+                        if globalvar.contentsondisk:
+                            redirectTargetPage = StoredPage(redirectTargetPage)
                         self.originPage = redirectTargetPage
                         self.todo.add(redirectTargetPage)
                         counter.plus(redirectTargetPage.site)
@@ -973,7 +1141,7 @@
             elif globalvar.autonomous and duplicate:
wikipedia.output(u"Stopping work on %s because duplicate pages"\ - " %s and %s are found" % (self.originPage.aslink(), + " %s and %s are found" % (self.originPage.aslink(True), duplicate.aslink(True), page.aslink(True))) self.makeForcedStop(counter) @@ -1015,7 +1183,7 @@ if prevPage != linkedPage and prevPage.site() == lpsite: # Still, this could be "no problem" as either may be a # redirect to the other. No way to find out quickly! - wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(), page.aslink(True), linkedPage.aslink(True))) + wikipedia.output(u"NOTE: %s: %s gives duplicate interwiki on same site %s" % (self.originPage.aslink(True), page.aslink(True), linkedPage.aslink(True))) break else: if config.interwiki_shownew: @@ -1190,14 +1358,7 @@ lclSiteDone = False frgnSiteDone = False
- # XXX Do we really need to make an union here? - # we should have sorted(languages_by_size) = sorted(langs) ?! - langBySize = set(lclSite.family.languages_by_size) - allLangs = set(lclSite.family.langs) - - langToCheck = (langBySize | allLangs).difference(lclSite.family.obsolete) - - for siteCode in langToCheck: + for siteCode in lclSite.family.languages_by_size: site = wikipedia.getSite(code = siteCode) if (not lclSiteDone and site == lclSite) or (not frgnSiteDone and site != lclSite and site in new): if site == lclSite: @@ -1254,6 +1415,27 @@ if config.interwiki_backlink: self.reportBacklinks(new, updatedSites)
 
+    def clean(self):
+        """
+        Delete the contents that are stored on disk for this Subject.
+
+        We cannot afford to define this in a StoredPage destructor because
+        StoredPage instances can get referenced cyclicly: that would stop the
+        garbage collector from destroying some of those objects.
+
+        It's also not necessary to set these lines as a Subject destructor:
+        deleting all stored content one entry by one entry when bailing out
+        after a KeyboardInterrupt for example is redundant, because the
+        whole storage file will be eventually removed.
+        """
+        if globalvar.contentsondisk:
+            for page in self.foundIn:
+                # foundIn can contain either Page or StoredPage objects
+                # calling the destructor on _contents will delete the
+                # disk records if necessary
+                if hasattr(page, '_contents'):
+                    del page._contents
+
     def replaceLinks(self, page, newPages, bot):
         """
         Returns True if saving was successful.
@@ -1312,12 +1494,8 @@
         # Put interwiki links into a map
         old={}
-        try:
-            for page2 in interwikis:
-                old[page2.site()] = page2
-        except wikipedia.NoPage:
-            wikipedia.output(u"BUG>>> %s no longer exists?" % page.aslink(True))
-            raise SaveError
+        for page2 in interwikis:
+            old[page2.site()] = page2
 
         # Check what needs to get done
         mods, adding, removing, modifying = compareLanguages(old, new, insite = page.site())
@@ -1368,7 +1546,16 @@
                 # If we cannot ask, deny permission
                 answer = 'n'
             else:
-                answer = wikipedia.inputChoice(u'Submit?', ['Yes', 'No', 'Give up'], ['y', 'n', 'g'])
+                answer = wikipedia.inputChoice(u'Submit?',
+                            ['Yes', 'No', 'open in Browser', 'Give up'],
+                            ['y', 'n', 'b', 'g'])
+                if answer == 'b':
+                    webbrowser.open("http://%s%s" % (
+                        page.site().hostname(),
+                        page.site().nice_get_address(page.title())
+                    ))
+                    wikipedia.input("Press Enter when finished in browser.")
+                    return True
         else:
             # If we do not need to ask, allow
             answer = 'y'
@@ -1530,8 +1717,9 @@
                 if dictName is not None:
                     wikipedia.output(u'Skipping: %s is an auto entry %s(%s)' % (page.title(),dictName,year))
                     continue
-            if globalvar.bracketonly:
-                if page.title().find("(") == -1:
+            if globalvar.parenthesesonly:
+                # Only yield pages that have ( ) in titles
+                if "(" not in page.title():
                     continue
             break
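[Editor's note: the new 'open in Browser' choice boils down to composing the page URL from the Site object and handing it to the standard webbrowser module. A stripped-down version of that flow, with raw_input replacing pywikipedia's wikipedia.input and the hostname/path arguments standing in for site().hostname() and nice_get_address() used in the hunk above.]

    import webbrowser

    def review_in_browser(hostname, page_path):
        """Open the page in the user's default browser and wait until
        they confirm they are done editing/reviewing it."""
        webbrowser.open("http://%s%s" % (hostname, page_path))
        raw_input("Press Enter when finished in browser.")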
@@ -1625,7 +1813,7 @@
         for subject in self.subjects:
             # Promise the subject that we will work on the site.
             # We will get a list of pages we can do.
-            pages = subject.willWorkOn(site)
+            pages = subject.whatsNextPageBatch(site)
             if pages:
                 pageGroup.extend(pages)
                 subjectGroup.append(subject)
@@ -1643,7 +1831,7 @@
             pass
         # Tell all of the subjects that the promised work is done
         for subject in subjectGroup:
-            subject.workDone(self)
+            subject.batchLoaded(self)
         return True
 
     def queryStep(self):
@@ -1653,6 +1841,7 @@
             subj = self.subjects[i]
             if subj.isDone():
                 subj.finish(self)
+                subj.clean()
                 del self.subjects[i]
 
     def isDone(self):
@@ -1870,7 +2059,7 @@
             # override configuration
             config.interwiki_graph = True
         elif arg == '-bracket':
-            globalvar.bracketonly = True
+            globalvar.parenthesesonly = True
         elif arg == '-localright':
            globalvar.followinterwiki = False
         elif arg == '-hintsareright':
@@ -1891,7 +2080,7 @@
             site = wikipedia.getSite()
             mainpagename = site.mediawiki_message('mainpage')
             globalvar.skip.add(wikipedia.Page(site, mainpagename))
-        except:
+        except wikipedia.Error:
             wikipedia.output(u'Missing main page name')
     if newPages is not None:
@@ -1958,6 +2147,9 @@
     except:
         bot.dump()
         raise
+    finally:
+        if globalvar.contentsondisk:
+            StoredPage.SPdeleteStore()
 
 finally:
     wikipedia.stopme()
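[Editor's note: the new inner finally guarantees the on-disk page store is removed however the run ends, while the existing outer finally keeps calling wikipedia.stopme(). A sketch of the intended control flow only; bot.run() stands in for the main loop, which the hunk does not show.]

    try:
        try:
            bot.run()
        except:
            bot.dump()                           # save progress so the run can be resumed
            raise
        finally:
            if globalvar.contentsondisk:
                StoredPage.SPdeleteStore()       # always unlink the cache/pagestore* shelve
    finally:
        wikipedia.stopme()                       # always unregister from the throttle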
Property changes on: branches/rewrite/scripts/interwiki.py
___________________________________________________________________
Modified: svn:mergeinfo
   - /trunk/pywikipedia/interwiki.py:6668-6733
   + /trunk/pywikipedia/interwiki.py:6668-6801