Revision: 8373 Author: valhallasw Date: 2010-07-29 14:05:49 +0000 (Thu, 29 Jul 2010)
Log Message: ----------- Major refactoring of generate_family_file.py: * changed def main() into an object * split off namespace logic to NamespaceStorage object
Modified Paths: -------------- trunk/pywikipedia/generate_family_file.py
Modified: trunk/pywikipedia/generate_family_file.py =================================================================== --- trunk/pywikipedia/generate_family_file.py 2010-07-29 14:04:50 UTC (rev 8372) +++ trunk/pywikipedia/generate_family_file.py 2010-07-29 14:05:49 UTC (rev 8373) @@ -1,213 +1,250 @@ -# -*- coding: utf-8 -*- -""" -This script generates a family file from a given URL. -Hackish, etc. Regexps, yes. Sorry, jwz. - -""" -__version__ = "$Id" - -# -# (C) Merlijn van Deen, 2010 -# (C) Pywikipedia bot team, 2010 -# -# Distributed under the terms of the MIT license -# - -from urllib2 import HTTPError -import urllib2 - -def urlopen(url): - req = urllib2.Request(url, headers = {'User-agent': 'Pywikipedia family generator 0.1 - pywikipediabot.sf.net'}) - return urllib2.urlopen(req) - -from urlparse import urlparse, ParseResult -import codecs -import sys -import re -try: - import json -except ImportError: - import simplejson as json - -# Monkey-patching wikipediatools to prevent questions about user_config.py -import wikipediatools -wikipediatools.get_base_dir = lambda: '.' -import family -STANDARDNAMESPACES = family.Family().namespaces - -def main(url=None, name=None): - if url == None: - url = raw_input("Please insert URL to wiki: ") - if name == None: - name = raw_input("Please insert a short name (eg: freeciv): ") - - wikis = {} - print "Generating family file from %s" % url - - w = Wiki(url) - wikis[w.iwpath] = w - print - print "==================================" - print "api url: %s" % w.api - print "MediaWiki version: %s" % w.version - print "==================================" - print - - print "Determining other languages...", - try: - iw = json.load(urlopen(w.api + "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")) - langs = [wiki for wiki in iw['query']['interwikimap'] if u'language' in wiki] - print u' '.join(sorted([wiki[u'prefix'] for wiki in langs])) - - if raw_input("\nThere are %i languages available.\nDo you want to generate interwiki links? This might take a long time. (y/N)" % len(langs)).lower() != "y": - langs = [wiki for wiki in langs if wiki[u'url'] == w.iwpath] - except HTTPError, e: - langs = [] - print e, "; continuing..." - - if langs == []: - print "Assuming English" - langs = [{u'language': u'English', - u'local': u'', - u'prefix': u'en', - u'url': w.iwpath}] - - print "Loading wikis... " - for lang in langs: - print " * %s... " % (lang[u'prefix']), - if lang[u'url'] not in wikis: - wikis[lang[u'url']] = Wiki(lang[u'url']) - print "downloaded" - else: - print "in cache" - - print "Retrieving namespaces... ", - namespaces = {} - for w in wikis.itervalues(): - print "%s " % w.lang, - ns = json.load(urlopen(w.api + "?action=query&meta=siteinfo&siprop=namespaces&format=json"))['query']['namespaces'] - for namespace in ns: - if namespace == '0': - continue - if int(namespace) not in namespaces: - namespaces[int(namespace)] = {} - - # Better method? You're very welcome. - try: - if STANDARDNAMESPACES[int(namespace)]['_default'] != ns[namespace][u'*'] and \ - STANDARDNAMESPACES[int(namespace)][w.lang] != ns[namespace][u'*']: - raise KeyError # if the namespace name is different, act if it is undefined - except KeyError: - namespaces[int(namespace)][w.lang] = ns[namespace][u'*'] - print - - fn = "families/%s_family.py" % name - print "Writing %s... " % fn - try: - open(fn) - if raw_input("%s already exists. Overwrite? (y/n)").lower() == 'n': - print "Terminating." - sys.exit(1) - except IOError: # file not found - pass - f = codecs.open('families/%s_family.py' % name, 'w', 'utf-8') - - f.write(""" -# -*- coding: utf-8 -*- -""" -This family file was auto-generated by $Id: generate_family_file.py 8371 2010-07-29 13:29:26Z valhallasw $ -Configuration parameters: - url = %(url)s - name = %(name)s - -Please do not commit this to the SVN repository! -""" - -import family - -class Family(family.Family): - def __init__(self): - family.Family.__init__(self) - self.name = '%(name)s' - self.langs = { -""".lstrip() % {'url': url, 'name': name}) - - for w in wikis.itervalues(): - f.write(" '%(lang)s': u'%(hostname)s',\n" % {'lang': w.lang, 'hostname': urlparse(w.server).netloc}) - - f.write(" }\n\n") - - for nsid, nslangs in namespaces.iteritems(): - for lang, nsname in nslangs.iteritems(): - f.write(" self.namespaces[%(nsid)i]['%(lang)s'] = u'%(nsname)s'\n" % {'nsid': nsid, 'lang': lang, 'nsname': nsname}) - f.write("\n\n") - - f.write(" def scriptpath(self, code):\n") - f.write(" return {\n") - - for w in wikis.itervalues(): - f.write(" '%(lang)s': u'%(path)s',\n" % {'lang': w.lang, 'path': w.scriptpath}) - f.write(" }[code]\n") - f.write("\n") - - f.write(" def version(self, code):\n") - f.write(" return {\n") - for w in wikis.itervalues(): - if w.version == None: - f.write(" '%(lang)s': None,\n" % {'lang': w.lang}) - else: - f.write(" '%(lang)s': u'%(ver)s',\n" % {'lang': w.lang, 'ver': w.version}) - f.write(" }[code]\n") - - -class Wiki(object): - REwgEnableApi = re.compile(ur'wgEnableAPI ?= ?true') - REwgServer = re.compile(ur'wgServer ?= ?"([^"]*)"') - REwgScriptPath = re.compile(ur'wgScriptPath ?= ?"([^"]*)"') - REwgArticlePath = re.compile(ur'wgArticlePath ?= ?"([^"]*)"') - REwgContentLanguage = re.compile(ur'wgContentLanguage ?= ?"([^"]*)"') - REwgVersion = re.compile(ur'wgVersion ?= ?"([^"]*)"') - - def __init__(self, fromurl): - if fromurl.endswith("$1"): - fromurl = fromurl[:-2] - try: - data = urlopen(fromurl).read() - except HTTPError, e: - if e.code != 404: - raise - data = e.read() - pass - - if not self.REwgEnableApi.search(data): - print "*** WARNING: Api does not seem to be enabled on %s" % fromurl - try: - self.version = self.REwgVersion.search(data).groups()[0] - except AttributeError: - self.version = None - self.server = self.REwgServer.search(data).groups()[0] - self.scriptpath = self.REwgScriptPath.search(data).groups()[0] - self.articlepath = self.REwgArticlePath.search(data).groups()[0] - self.lang = self.REwgContentLanguage.search(data).groups()[0] - - def __cmp__(self, other): - return (self.server + self.scriptpath == other.server + other.scriptpath) - - def __hash__(self): - return hash(self.server + self.scriptpath) - - @property - def api(self): - return self.server + self.scriptpath + "/api.php" - - @property - def iwpath(self): - return self.server + self.articlepath - - -if __name__ == "__main__": - if len(sys.argv) != 3: - print "Usage: %s <url> <short name>" - print "Example: %s http://www.mywiki.bogus/wiki/Main_Page mywiki" - print "This will create the file families/mywiki_family.py" - main(sys.argv[1], sys.argv[2]) +# -*- coding: utf-8 -*- +""" +This script generates a family file from a given URL. +Hackish, etc. Regexps, yes. Sorry, jwz. + +""" +__version__ = "$Id" + +# +# (C) Merlijn van Deen, 2010 +# (C) Pywikipedia bot team, 2010 +# +# Distributed under the terms of the MIT license +# + +from urllib2 import HTTPError +import urllib2 + +def urlopen(url): + req = urllib2.Request(url, headers = {'User-agent': 'Pywikipedia family generator 0.1 - pywikipediabot.sf.net'}) + return urllib2.urlopen(req) + +from urlparse import urlparse, ParseResult +import codecs +import sys +import re +try: + import json +except ImportError: + import simplejson as json + +# Monkey-patching wikipediatools to prevent questions about user_config.py +import wikipediatools +wikipediatools.get_base_dir = lambda: '.' +import family + +class FamilyFileGenerator(object): + def __init__(self, url=None, name=None): + if url == None: + url = raw_input("Please insert URL to wiki: ") + if name == None: + name = raw_input("Please insert a short name (eg: freeciv): ") + self.base_url = url + self.name = name + + self.wikis = {} # {'http://wiki/$1': Wiki('http://wiki/$1'), ...} + self.langs = [] # [Wiki('http://wiki/$1'), ...] + + self.namespaces = NamespaceStorage() + + def run(self): + print "Generating family file from %s" % self.base_url + + w = Wiki(self.base_url) + self.wikis[w.iwpath] = w + print + print "==================================" + print "api url: %s" % w.api + print "MediaWiki version: %s" % w.version + print "==================================" + print + + self.getlangs(w) + self.getapis() + self.getnamespaces() + self.writefile() + + def getlangs(self, w): + print "Determining other languages...", + try: + iw = json.load(urlopen(w.api + "?action=query&meta=siteinfo&siprop=interwikimap&sifilteriw=local&format=json")) + self.langs = [wiki for wiki in iw['query']['interwikimap'] if u'language' in wiki] + print u' '.join(sorted([wiki[u'prefix'] for wiki in self.langs])) + except HTTPError, e: + self.langs = [] + print e, "; continuing..." + + if len([lang for lang in self.langs if lang['url'] == w.iwpath]) == 0: + self.langs.append({u'language': w.lang, + u'local': u'', + u'prefix': w.lang, + u'url': w.iwpath}) + + if len(self.langs) > 1 and \ + raw_input("\nThere are %i languages available.\nDo you want to generate interwiki links? This might take a long time. (y/N)" % len(self.langs)).lower() != "y": + self.langs = [wiki for wiki in langs if wiki[u'url'] == w.iwpath] + + def getapis(self): + print "Loading wikis... " + for lang in self.langs: + print " * %s... " % (lang[u'prefix']), + if lang[u'url'] not in self.wikis: + self.wikis[lang[u'url']] = Wiki(lang[u'url']) + print "downloaded" + else: + print "in cache" + + def getnamespaces(self): + print "Retrieving namespaces... ", + for w in self.wikis.itervalues(): + print "%s " % w.lang, + self.namespaces.addfromwiki(w) + print + + def writefile(self): + fn = "families/%s_family.py" % self.name + print "Writing %s... " % fn + try: + open(fn) + if raw_input("%s already exists. Overwrite? (y/n)" % fn).lower() == 'n': + print "Terminating." + sys.exit(1) + except IOError: # file not found + pass + f = codecs.open(fn, 'w', 'utf-8') + + f.write(""" +# -*- coding: utf-8 -*- +""" +This family file was auto-generated by $Id$ +Configuration parameters: + url = %(url)s + name = %(name)s + +Please do not commit this to the SVN repository! +""" + +import family + +class Family(family.Family): + def __init__(self): + family.Family.__init__(self) + self.name = '%(name)s' + self.langs = { +""".lstrip() % {'url': self.base_url, 'name': self.name}) + + for w in self.wikis.itervalues(): + f.write(" '%(lang)s': u'%(hostname)s',\n" % {'lang': w.lang, 'hostname': urlparse(w.server).netloc}) + + f.write(" }\n\n") + + f.write(self.namespaces.output(8)) + f.write("\n\n") + + f.write(" def scriptpath(self, code):\n") + f.write(" return {\n") + + for w in self.wikis.itervalues(): + f.write(" '%(lang)s': u'%(path)s',\n" % {'lang': w.lang, 'path': w.scriptpath}) + f.write(" }[code]\n") + f.write("\n") + + f.write(" def version(self, code):\n") + f.write(" return {\n") + for w in self.wikis.itervalues(): + if w.version == None: + f.write(" '%(lang)s': None,\n" % {'lang': w.lang}) + else: + f.write(" '%(lang)s': u'%(ver)s',\n" % {'lang': w.lang, 'ver': w.version}) + f.write(" }[code]\n") + +class NamespaceStorage(object): + def __init__(self): + self.nsinfo = {} + self.f = family.Family() + + def addfromwiki(self, w): + data = json.load(urlopen(w.api + "?action=query&format=json&meta=siteinfo&siprop=namespaces|namespacealiases"))['query'] + for ns in data['namespaces'].itervalues(): + self.add(ns['id'], w.lang, ns['*']) + for ns in data['namespacealiases']: + self.add(ns['id'], w.lang, ns['*']) + + def add(self, ns, lang, translation): + """ Contains logic for determining whether to define a namespace or not """ + ns = int(ns) + if ns == 0: # never translate the article namespace + return + try: + if translation not in self.f.namespace(lang, ns, all=True): # self.f.namespace might return KeyError itself + raise KeyError + except KeyError: + self._store(ns, lang, translation) + + def _store(self, ns, lang, translation): + """ Contains logic on how to store a translation """ + self.nsinfo.setdefault(ns, {}).setdefault(lang, []).append(translation) + + def output(self, indent): + data = "" + for nsid, langs in self.nsinfo.iteritems(): + for lang, translations in langs.iteritems(): + data += " " * indent + data += "self.namespaces[%(nsid)i][%(lang)r] = %(translations)r" % locals() + data += "\n" + return data + + +class Wiki(object): + REwgEnableApi = re.compile(ur'wgEnableAPI ?= ?true') + REwgServer = re.compile(ur'wgServer ?= ?"([^"]*)"') + REwgScriptPath = re.compile(ur'wgScriptPath ?= ?"([^"]*)"') + REwgArticlePath = re.compile(ur'wgArticlePath ?= ?"([^"]*)"') + REwgContentLanguage = re.compile(ur'wgContentLanguage ?= ?"([^"]*)"') + REwgVersion = re.compile(ur'wgVersion ?= ?"([^"]*)"') + + def __init__(self, fromurl): + if fromurl.endswith("$1"): + fromurl = fromurl[:-2] + try: + data = urlopen(fromurl).read() + except HTTPError, e: + if e.code != 404: + raise + data = e.read() + pass + + if not self.REwgEnableApi.search(data): + print "*** WARNING: Api does not seem to be enabled on %s" % fromurl + try: + self.version = self.REwgVersion.search(data).groups()[0] + except AttributeError: + self.version = None + self.server = self.REwgServer.search(data).groups()[0] + self.scriptpath = self.REwgScriptPath.search(data).groups()[0] + self.articlepath = self.REwgArticlePath.search(data).groups()[0] + self.lang = self.REwgContentLanguage.search(data).groups()[0] + + def __cmp__(self, other): + return (self.server + self.scriptpath == other.server + other.scriptpath) + + def __hash__(self): + return hash(self.server + self.scriptpath) + + @property + def api(self): + return self.server + self.scriptpath + "/api.php" + + @property + def iwpath(self): + return self.server + self.articlepath + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print "Usage: %s <url> <short name>" + print "Example: %s http://www.mywiki.bogus/wiki/Main_Page mywiki" + print "This will create the file families/mywiki_family.py" + + FamilyFileGenerator(*sys.argv[1:]).run()
pywikipedia-svn@lists.wikimedia.org