Revision: 8372 Author: valhallasw Date: 2010-07-29 14:04:50 +0000 (Thu, 29 Jul 2010)
Log Message: ----------- Moving generate_family_file.py to correct location.
Added Paths: ----------- trunk/pywikipedia/generate_family_file.py
Removed Paths: ------------- pywikipedia/
Added: trunk/pywikipedia/generate_family_file.py =================================================================== --- trunk/pywikipedia/generate_family_file.py (rev 0) +++ trunk/pywikipedia/generate_family_file.py 2010-07-29 14:04:50 UTC (rev 8372) @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +""" +This script generates a family file from a given URL. +Hackish, etc. Regexps, yes. Sorry, jwz. + +""" +__version__ = "$Id" + +# +# (C) Merlijn van Deen, 2010 +# (C) Pywikipedia bot team, 2010 +# +# Distributed under the terms of the MIT license +# + +from urllib2 import HTTPError +import urllib2 + +def urlopen(url): + req = urllib2.Request(url, headers = {'User-agent': 'Pywikipedia family generator 0.1 - pywikipediabot.sf.net'}) + return urllib2.urlopen(req) + +from urlparse import urlparse, ParseResult +import codecs +import sys +import re +try: + import json +except ImportError: + import simplejson as json + +# Monkey-patching wikipediatools to prevent questions about user_config.py +import wikipediatools +wikipediatools.get_base_dir = lambda: '.' +import family +STANDARDNAMESPACES = family.Family().namespaces + +def main(url=None, name=None): + if url == None: + url = raw_input("Please insert URL to wiki: ") + if name == None: + name = raw_input("Please insert a short name (eg: freeciv): ") + + wikis = {} + print "Generating family file from %s" % url + + w = Wiki(url) + wikis[w.iwpath] = w + print + print "==================================" + print "api url: %s" % w.api + print "MediaWiki version: %s" % w.version + print "==================================" + print + + print "Determining other languages...", + try: + iw = json.load(urlopen(w.api + "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")) + langs = [wiki for wiki in iw['query']['interwikimap'] if u'language' in wiki] + print u' '.join(sorted([wiki[u'prefix'] for wiki in langs])) + + if raw_input("\nThere are %i languages available.\nDo you want to generate interwiki links? This might take a long time. (y/N)" % len(langs)).lower() != "y": + langs = [wiki for wiki in langs if wiki[u'url'] == w.iwpath] + except HTTPError, e: + langs = [] + print e, "; continuing..." + + if langs == []: + print "Assuming English" + langs = [{u'language': u'English', + u'local': u'', + u'prefix': u'en', + u'url': w.iwpath}] + + print "Loading wikis... " + for lang in langs: + print " * %s... " % (lang[u'prefix']), + if lang[u'url'] not in wikis: + wikis[lang[u'url']] = Wiki(lang[u'url']) + print "downloaded" + else: + print "in cache" + + print "Retrieving namespaces... ", + namespaces = {} + for w in wikis.itervalues(): + print "%s " % w.lang, + ns = json.load(urlopen(w.api + "?action=query&meta=siteinfo&siprop=namespaces&format=json"))['query']['namespaces'] + for namespace in ns: + if namespace == '0': + continue + if int(namespace) not in namespaces: + namespaces[int(namespace)] = {} + + # Better method? You're very welcome. + try: + if STANDARDNAMESPACES[int(namespace)]['_default'] != ns[namespace][u'*'] and \ + STANDARDNAMESPACES[int(namespace)][w.lang] != ns[namespace][u'*']: + raise KeyError # if the namespace name is different, act if it is undefined + except KeyError: + namespaces[int(namespace)][w.lang] = ns[namespace][u'*'] + print + + fn = "families/%s_family.py" % name + print "Writing %s... " % fn + try: + open(fn) + if raw_input("%s already exists. Overwrite? (y/n)").lower() == 'n': + print "Terminating." + sys.exit(1) + except IOError: # file not found + pass + f = codecs.open('families/%s_family.py' % name, 'w', 'utf-8') + + f.write(""" +# -*- coding: utf-8 -*- +""" +This family file was auto-generated by $Id: generate_family_file.py 8371 2010-07-29 13:29:26Z valhallasw $ +Configuration parameters: + url = %(url)s + name = %(name)s + +Please do not commit this to the SVN repository! +""" + +import family + +class Family(family.Family): + def __init__(self): + family.Family.__init__(self) + self.name = '%(name)s' + self.langs = { +""".lstrip() % {'url': url, 'name': name}) + + for w in wikis.itervalues(): + f.write(" '%(lang)s': u'%(hostname)s',\n" % {'lang': w.lang, 'hostname': urlparse(w.server).netloc}) + + f.write(" }\n\n") + + for nsid, nslangs in namespaces.iteritems(): + for lang, nsname in nslangs.iteritems(): + f.write(" self.namespaces[%(nsid)i]['%(lang)s'] = u'%(nsname)s'\n" % {'nsid': nsid, 'lang': lang, 'nsname': nsname}) + f.write("\n\n") + + f.write(" def scriptpath(self, code):\n") + f.write(" return {\n") + + for w in wikis.itervalues(): + f.write(" '%(lang)s': u'%(path)s',\n" % {'lang': w.lang, 'path': w.scriptpath}) + f.write(" }[code]\n") + f.write("\n") + + f.write(" def version(self, code):\n") + f.write(" return {\n") + for w in wikis.itervalues(): + if w.version == None: + f.write(" '%(lang)s': None,\n" % {'lang': w.lang}) + else: + f.write(" '%(lang)s': u'%(ver)s',\n" % {'lang': w.lang, 'ver': w.version}) + f.write(" }[code]\n") + + +class Wiki(object): + REwgEnableApi = re.compile(ur'wgEnableAPI ?= ?true') + REwgServer = re.compile(ur'wgServer ?= ?"([^"]*)"') + REwgScriptPath = re.compile(ur'wgScriptPath ?= ?"([^"]*)"') + REwgArticlePath = re.compile(ur'wgArticlePath ?= ?"([^"]*)"') + REwgContentLanguage = re.compile(ur'wgContentLanguage ?= ?"([^"]*)"') + REwgVersion = re.compile(ur'wgVersion ?= ?"([^"]*)"') + + def __init__(self, fromurl): + if fromurl.endswith("$1"): + fromurl = fromurl[:-2] + try: + data = urlopen(fromurl).read() + except HTTPError, e: + if e.code != 404: + raise + data = e.read() + pass + + if not self.REwgEnableApi.search(data): + print "*** WARNING: Api does not seem to be enabled on %s" % fromurl + try: + self.version = self.REwgVersion.search(data).groups()[0] + except AttributeError: + self.version = None + self.server = self.REwgServer.search(data).groups()[0] + self.scriptpath = self.REwgScriptPath.search(data).groups()[0] + self.articlepath = self.REwgArticlePath.search(data).groups()[0] + self.lang = self.REwgContentLanguage.search(data).groups()[0] + + def __cmp__(self, other): + return (self.server + self.scriptpath == other.server + other.scriptpath) + + def __hash__(self): + return hash(self.server + self.scriptpath) + + @property + def api(self): + return self.server + self.scriptpath + "/api.php" + + @property + def iwpath(self): + return self.server + self.articlepath + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print "Usage: %s <url> <short name>" + print "Example: %s http://www.mywiki.bogus/wiki/Main_Page mywiki" + print "This will create the file families/mywiki_family.py" + main(sys.argv[1], sys.argv[2])