http://www.mediawiki.org/wiki/Special:Code/pywikipedia/9472
Revision: 9472
Author:   xqt
Date:     2011-08-29 14:49:11 +0000 (Mon, 29 Aug 2011)

Log Message:
-----------
delete old file
Removed Paths:
-------------
    trunk/pywikiparser/Parser.py.old
Deleted: trunk/pywikiparser/Parser.py.old
===================================================================
--- trunk/pywikiparser/Parser.py.old	2011-08-29 05:13:04 UTC (rev 9471)
+++ trunk/pywikiparser/Parser.py.old	2011-08-29 14:49:11 UTC (rev 9472)
@@ -1,208 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Mediawiki wikitext parser """
-#
-# (C) 2007 Merlijn 'valhallasw' van Deen
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-
-#
-
-import re
-import warnings
-
-import ObjectTree as dom
-
-from Lexer import Lexer, Token
-
-
-# System loosely based on 'the dragon book':
-# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
-
-class ParseError(Exception):
-    """ booh """
-
-class parser:
-    def __init__(self, string):
-        self.wikipwn = [a for a in lexer(string).lexer()]
-        self.counter = 0
-
-    def expect(self, types, values=None):
-        #print 'Expect: %s %s' % (types, values)
-        token = self.wikipwn[self.counter]
-        if (token[0] not in types):
-            if values:
-                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
-            else:
-                raise ParseError("Expected one of (%r), got %r" % (types, token))
-        if values:
-            if (token[1] not in values):
-                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
-        self.counter += 1
-        return token
-
-    def parsetext(self):
-        data = ''
-        try:
-            while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
-        except ParseError, e: pass
-
-        k = dom.Element('parsetext')
-        k.append(data)
-        return k
-
-    def parseurl(self):
-        pre = self.expect([lexer.SQRE_OPEN])[1]-1
-        url = self.expect([lexer.TEXT])[1]
-        # checkurl, raise ParseError
-        ws = ''
-        try:
-            ws = self.expect([lexer.WHITESPACE])[1]
-        except ParseError: pass
-
-        if '\n' in ws:
-            raise ParseError('No newlines allowed in external links')
-
-        desc = ''
-        try:
-            while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
-        except ParseError, e: pass
-
-        aft = self.expect([lexer.SQRE_CLOSE])[1]-1
-
-        root = dom.Element('parseurl')
-        root.append('['*pre)
-        extlink = root.appendElement('externallink')
-        extlink.appendElement('url').append(url)
-        if len(desc) > 0:
-            extlink.appendElement('description').append(desc)
-        root.append(']'*aft)
-
-        return root
-
-    def parsewikilink(self):
-        pre = self.expect([lexer.SQRE_OPEN])[1]-2
-        if (pre < 0): raise ParseError('Not a wiki link')
-
-        page = ''
-        try:
-            while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
-        except ParseError,e: pass
-        # if not re.match(...): raise ParseError
-
-        root = dom.Element('parsewikilink')
-        root.append('['*pre)
-        pagelink = root.appendElement('pagelink')
-        pagelink.appendElement('title').append(page)
-        print 'wikilink: %s' % page
-        try:
-            while(True):
-                root.append(self.parseparameter(breaktokens=[lexer.SQRE_CLOSE]))
-        except ParseError, e: pass
-        print 'result: %r' % (root,)
-        aft = self.expect([lexer.SQRE_CLOSE])[1]-2
-        if (aft < 0):
-            raise ParseError('Not a wiki link')
-
-        root.append(']'*aft)
-        return root
-
-    def parseparameter(self, breaktokens=None):
-        if breaktokens:
-            breaktokens.append(lexer.PIPE)
-        else:
-            breaktokens = [lexer.PIPE]
-        try:
-            while(True): self.expect([lexer.WHITESPACE]) #eat whitespace
-        except ParseError: pass
-        self.expect([lexer.PIPE])
-        #now we can expect anything except a loose pipe.
-        data = self.parse(breaktokens=breaktokens)
-        return dom.Element('parameter', {}, data)
-
-    def parseone(self, breaktokens=[]):
-        token = self.wikipwn[self.counter]
-        if (token[0] == lexer.EOF) or (token[0] in breaktokens):
-            raise StopIteration
-
-        if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text
-            try: return self.parsetext();
-            except ParseError, e: pass
-
-        if (token[0] == lexer.SQRE_OPEN): #wikilink or external link
-            begin = self.counter
-            try: return self.parsewikilink();
-            except ParseError, e: pass
-            self.counter = begin
-            try: return self.parseurl();
-            except ParseError, e: pass
-            self.counter = begin
-            return ('[' * self.expect([lexer.SQRE_OPEN])[1])
-
-        if (token[0] == lexer.SQRE_CLOSE):
-            return ']'*self.expect([lexer.SQRE_CLOSE])[1]
-
-        if (token[0] == lexer.PIPE):
-            self.expect([lexer.PIPE])
-            return '|'
-
-        if (token[0] == lexer.CURL_OPEN):
-            #parse_template
-            warnings.warn("Not implemented yet. Returning string")
-            return '{'*self.expect([lexer.CURL_OPEN])[1]
-
-        if (token[0] == lexer.CURL_CLOSE):
-            return '}'*self.expect([lexer.CURL_CLOSE])[1]
-
-        if (token[0] == lexer.ANGL_OPEN):
-            #parse html
-            warnings.warn("Not implemented yet. Returning string")
-            return '<'*self.expect([lexer.ANGL_OPEN])[1]
-
-        if (token[0] == lexer.ANGL_CLOSE):
-            return '>'*self.expect([lexer.ANGL_CLOSE])[1]
-
-        if (token[0] == lexer.NEWPAR):
-            self.expect([lexer.NEWPAR])
-            return '\n\n'
-
-        if (token[0] == lexer.TAB_OPEN):
-            # parse wikitable
-            warnings.warn("Not implemented yet. Returning string")
-            self.expect([lexer.TAB_OPEN])
-            return '{|'
-
-        if (token[0] == lexer.TAB_NEWLINE):
-            self.expect([lexer.TAB_NEWLINE])
-            return '|-'
-
-        if (token[0] == lexer.TAB_CLOSE):
-            self.expect([lexer.TAB_CLOSE])
-            return '|}'
-
-        if (token[0] == lexer.WHITESPACE):
-            return self.expect([lexer.WHITESPACE])[1]
-
-        if (token[0] == lexer.EQUAL_SIGN):
-            return '='*self.expect([lexer.EQUAL_SIGN])[1]
-
-        if (token[0] == lexer.APOSTROPHE):
-            return '\''*self.expect([lexer.APOSTROPHE])[1]
-
-        else:
-            raise Exception, 'ZOMG THIS CANNOT HAPPEN'
-
-    def parseonegenerator(self, *args, **kwargs):
-        while(True):
-            yield self.parseone(*args, **kwargs)
-
-    def parse(self, *args, **kwargs):
-        root = dom.Element('wikipage')
-        for data in self.parseonegenerator(*args, **kwargs):
-            root.extend(data)
-        return root
-
-
-
-
\ No newline at end of file