http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11580
Revision: 11580
Author: valhallasw
Date: 2013-05-25 14:03:33 +0000 (Sat, 25 May 2013)
Log Message:
-----------
New script to replicate wiki pages, contributed by Kasper Souren
This script allows you to replicate all pages in a certain namespace
to a second wiki. You can, for instance, use this to keep templates
or semantic forms synchronised from a master wiki.
Modified Paths:
--------------
trunk/pywikipedia/config.py
Added Paths:
-----------
trunk/pywikipedia/replicate_wiki.py
Modified: trunk/pywikipedia/config.py
===================================================================
--- trunk/pywikipedia/config.py 2013-05-25 09:26:38 UTC (rev 11579)
+++ trunk/pywikipedia/config.py 2013-05-25 14:03:33 UTC (rev 11580)
@@ -438,6 +438,18 @@
# number of results.
copyright_economize_query = True
+############## REPLICATION BOT ################
+# You can add replicate_replace to your user_config.py, which has the following format:
+#
+# replicate_replace = {
+# 'wikipedia:li': {'Hoofdpagina': 'Veurblaad'}
+# }
+#
+# to replace all occurences of 'Hoofdpagina' with 'Veurblaad' when
writing to liwiki. Note that this does
+# not take the origin wiki into account.
+replicate_replace = {}
+
+
############## HTTP SETTINGS ##############
# Default socket timeout. Set to None to disable timeouts.
socket_timeout = 120 # set a pretty long timeout just in case...
Added: trunk/pywikipedia/replicate_wiki.py
===================================================================
--- trunk/pywikipedia/replicate_wiki.py (rev 0)
+++ trunk/pywikipedia/replicate_wiki.py 2013-05-25 14:03:33 UTC (rev 11580)
@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+#
+# -*- coding: utf-8 -*-
+#
+# (C) Kasper Souren 2012-2013
+#
+# Distributed under the terms of the MIT license.
+#
+
+'''
+This bot replicates all pages (from specific namespaces) in a wiki to a second wiki
within one family.
+
+Example:
+python replicate_wiki.py [-r] -ns 10 -f wikipedia -o nl li fy
+
+to copy all templates from an nlwiki to liwiki and fywiki. It will show which pages have
to be changed
+if -r is not present, and will only actually write pages if -r /is/ present.
+
+You can add replicate_replace to your user_config.py, which has the following format:
+
+replicate_replace = {
+ 'wikipedia:li': {'Hoofdpagina': 'Veurblaad'}
+}
+
+to replace all occurences of 'Hoofdpagina' with 'Veurblaad' when writing
to liwiki. Note that this does
+not take the origin wiki into account.
+'''
+
+import sys
+import re
+from wikipedia import *
+from itertools import imap
+
+def namespaces(site):
+ '''dict from namespace number to prefix'''
+ ns = dict(map(lambda n: (site.getNamespaceIndex(n), n),
+ site.namespaces()))
+ ns[0] = ''
+ return ns
+
+
+def multiple_replace(text, word_dict):
+ '''Replace all occurrences in text of key value pairs in
word_dict'''
+ for key in word_dict:
+ text = text.replace(key, word_dict[key])
+ return text
+
+
+class SyncSites:
+ '''Work is done in here.'''
+
+ def __init__(self, options):
+ self.options = options
+
+ if options.original_wiki:
+ original_wiki = options.original_wiki
+ else:
+ original_wiki = config.mylang
+
+ print "Syncing from " + original_wiki
+
+ family = options.family or config.family
+
+ sites = options.destination_wiki
+
+ self.original = getSite(original_wiki, family)
+
+ if options.namespace and 'help' in options.namespace:
+ nsd = namespaces(self.original)
+ for k in nsd:
+ print k, nsd[k]
+ sys.exit()
+
+ self.sites = map(lambda s: getSite(s, family), sites)
+
+ self.differences = {}
+ self.user_diff = {}
+ print 'Syncing to',
+ for s in self.sites:
+ self.differences[s] = []
+ self.user_diff[s] = []
+ print s,
+ print
+
+ def check_sysops(self):
+ '''Check if sysops are the same
+
+ TODO: make optional
+ '''
+ def get_users(site):
+ userlist =
site.getUrl(site.get_address('Special:Userlist&group=sysop'))
+ # Hackery but working. At least on MW 1.15.0
+ # User namespace is number 2
+ return set(re.findall(site.namespace(2) + ':(\w+)["\&]',
userlist))
+
+ ref_users = get_users(self.original)
+ for site in self.sites:
+ users = get_users(site)
+ diff = list(ref_users.difference(users))
+ diff.sort()
+ self.user_diff[site] = diff
+
+ def check_namespaces(self):
+ '''Check all namespaces, to be ditched for clarity'''
+ namespaces = [
+ 0, # Main
+ 8, # MediaWiki
+ 152, # DPL
+ 102, # Eigenschap
+ 104, # Type
+ 106, # Formulier
+ 108, # Concept
+ 10, # Sjabloon
+ ]
+
+ if self.options.namespace:
+ print options.namespace
+ namespaces = [int(options.namespace)]
+ print "Checking these namespaces", namespaces, "\n"
+
+ for ns in namespaces:
+ self.check_namespace(ns)
+
+ def check_namespace(self, namespace):
+ '''Check an entire namespace'''
+
+ print "\nCHECKING NAMESPACE", namespace
+ pages = imap(lambda p: p.title(),
+ self.original.allpages('!', namespace))
+ for p in pages:
+ if not p in ['MediaWiki:Sidebar', 'MediaWiki:Mainpage',
+ 'MediaWiki:Sitenotice',
'MediaWiki:MenuSidebar']:
+ try:
+ self.check_page(p)
+ except pywikibot.exceptions.NoPage:
+ print 'Bizarre NoPage exception that we are just going to
ignore'
+ except pywikibot.exceptions.IsRedirectPage:
+ print 'error: Redirectpage - todo: handle gracefully'
+ print
+
+
+ def generate_overviews(self):
+ '''Create page on wikis with overview of bot results'''
+ for site in self.sites:
+ sync_overview_page = Page(site, 'User:' + site.loggedInAs() +
'/sync.py overview')
+ output = "== Pages that differ from original ==\n\n"
+ if self.differences[site]:
+ output += "".join(map(lambda l: '* [[:' + l +
"]]\n", self.differences[site]))
+ else:
+ output += "All important pages are the same"
+
+ output += "\n\n== Admins from original that are missing here
==\n\n"
+ if self.user_diff[site]:
+ output += "".join(map(lambda l: '* ' +
l.replace('_', ' ') + "\n", self.user_diff[site]))
+ else:
+ output += "All users from original are also present on this
wiki"
+
+ print output
+ sync_overview_page.put(output, self.put_message(site))
+
+
+ def put_message(self, site):
+ return site.loggedInAs() + ' sync.py synchronization from ' +
str(self.original)
+
+ def check_page(self, pagename):
+ '''Check one page'''
+
+ print "\nChecking", pagename,
+ sys.stdout.flush()
+ page1 = Page(self.original, pagename)
+ txt1 = page1.get()
+
+ for site in self.sites:
+ if options.dest_namespace:
+ prefix = namespaces(site)[int(options.dest_namespace)]
+ if prefix:
+ prefix += ':'
+ new_pagename = prefix + page1.titleWithoutNamespace()
+ print "\nCross namespace, new title: ", new_pagename
+ else:
+ new_pagename = pagename
+
+ page2 = Page(site, new_pagename)
+ if page2.exists():
+ txt2 = page2.get()
+
+ else:
+ txt2 = ''
+
+ if config.replicate_replace.has_key(str(site)):
+ txt_new = multiple_replace(txt1, config.replicate_replace[str(site)])
+ if txt1 != txt_new:
+ print 'NOTE: text replaced using config.sync_replace'
+ print txt1, txt_new, txt2
+ txt1 = txt_new
+
+ if txt1 != txt2:
+ print "\n", site, 'DIFFERS'
+ self.differences[site].append(pagename)
+
+ if self.options.replace:
+ page2.put(txt1, self.put_message(site))
+ else:
+ sys.stdout.write('.')
+ sys.stdout.flush()
+
+
+if __name__ == '__main__':
+ from argparse import ArgumentParser
+
+ parser = ArgumentParser()
+ parser.add_argument("-f", "--family", dest="family",
+ help="wiki family")
+
+ parser.add_argument("-r", "--replace",
action="store_true",
+ help="actually replace pages (without this option you will
only get an overview page)")
+ parser.add_argument("-o", "--original",
dest="original_wiki",
+ help="original wiki")
+ parser.add_argument('destination_wiki', metavar='N', type=str,
nargs='+',
+ help='destination wiki(s)')
+ parser.add_argument("-ns", "--namespace",
dest="namespace",
+ help="specify namespace")
+ parser.add_argument("-dns", "--dest-namespace",
dest="dest_namespace",
+ help="destination namespace (if different)")
+
+ (options, args) = parser.parse_known_args()
+
+ # sync is global for convenient IPython debugging
+ sync = SyncSites(options)
+ sync.check_sysops()
+ sync.check_namespaces()
+ sync.generate_overviews()
+