Revision: 3919
Author: valhallasw
Date: 2007-07-29 20:06:39 +0000 (Sun, 29 Jul 2007)
Log Message:
-----------
Moved the old parser aside; small ObjectTree update and a large Lexer update. The new parser lands in the next revision.
Modified Paths:
--------------
    trunk/pywikiparser/Lexer.py
    trunk/pywikiparser/ObjectTree/Element.py

Added Paths:
-----------
    trunk/pywikiparser/Parser.py.old

Removed Paths:
-------------
    trunk/pywikiparser/Parser.py
Modified: trunk/pywikiparser/Lexer.py
===================================================================
--- trunk/pywikiparser/Lexer.py	2007-07-28 13:29:12 UTC (rev 3918)
+++ trunk/pywikiparser/Lexer.py	2007-07-29 20:06:39 UTC (rev 3919)
@@ -10,23 +10,35 @@
 import re
 
 class Token:
-    TEXT        = 258  # Text
-    SQRE_OPEN   = 259  # [ Square bracket open
-    SQRE_CLOSE  = 260  # ] Square bracket close
-    PIPE        = 261  # | Pipe symbol
-    EQUAL_SIGN  = 262  # = Equal sign
-    APOSTROPHE  = 263  # ' Apostrophe
-    CURL_OPEN   = 264  # { Curly bracket open
-    CURL_CLOSE  = 265  # } Curly bracket close
-    ANGL_OPEN   = 266  # < Angular bracket open
-    ANGL_CLOSE  = 267  # > Angular bracket close
-    NEWPAR      = 268  # \n\n New paragraph
-    TAB_OPEN    = 269  # {| Table opening symbol
-    TAB_NEWLINE = 270  # |- Table new row symbol
-    TAB_CLOSE   = 271  # |} Table closing symbol
-    WHITESPACE  = 272  # Whitespace with max 1 newline
-    EOF         = 273  # End of file
+    def __init__(self, name, description):
+        self.name = name
+        self.__doc__ = description
+
+    def __repr__(self):
+        return '<T_%s>' % (self.name,)
+
+class Tokens:
+    tokens = [
+        ('TEXT',        ' Text data'),
+        ('SQRE_OPEN',   '[ Square bracket open'),
+        ('SQRE_CLOSE',  '] Square bracket close'),
+        ('PIPE',        '| Pipe symbol'),
+        ('EQUAL_SIGN',  '= Equal sign'),
+        ('APOSTROPHE',  "' Apostrophe"),
+        ('CURL_OPEN',   '{ Curly bracket open'),
+        ('CURL_CLOSE',  '} Curly bracket close'),
+        ('ANGL_OPEN',   '< Angular bracket open'),
+        ('ANGL_CLOSE',  '> Angular bracket close'),
+        ('NEWPAR',      '\n\n New paragraph'),
+        ('TAB_OPEN',    '{| Table opening symbol'),
+        ('TAB_NEWLINE', '|- Table new row symbol'),
+        ('TAB_CLOSE',   '|} Table closing symbol'),
+        ('WHITESPACE',  ' Whitespace with max 1 newline'),
+        ('EOF',         ' End of file')
+    ]
+    for token in tokens:
+        exec("%s = Token(%r,%r)" % (token[0], token[0], token[1]), globals(), locals())
+
 class Lexer:
     """ Lexer class for mediawiki wikitext. Used by the Parser module
@@ -46,7 +58,7 @@
             while True:
                 if (c in ('[', ']', '{', '}', '<', '>', '=', "'")):
                     if text:
-                        yield (Token.TEXT, text)
+                        yield (Tokens.TEXT, text)
                         text = ''
                     num = 1
                     try:
@@ -56,37 +68,41 @@
                             t = self.getchar()
                     finally:
-                        if (c == '['): yield (Token.SQRE_OPEN, num)
-                        elif (c == ']'): yield (Token.SQRE_CLOSE, num)
-                        elif (c == '{'): yield (Token.CURL_OPEN, num)
-                        elif (c == '}'): yield (Token.CURL_CLOSE, num)
-                        elif (c == '<'): yield (Token.ANGL_OPEN, num)
-                        elif (c == '>'): yield (Token.ANGL_CLOSE, num)
-                        elif (c == '='): yield (Token.EQUAL_SIGN, num)
-                        elif (c == "'"): yield(Token.APOSTROPHE, num)
+                        if (c == '['): yield (Tokens.SQRE_OPEN, num)
+                        elif (c == ']'): yield (Tokens.SQRE_CLOSE, num)
+                        elif (c == '{'): yield (Tokens.CURL_OPEN, num)
+                        elif (c == '}'): yield (Tokens.CURL_CLOSE, num)
+                        elif (c == '<'): yield (Tokens.ANGL_OPEN, num)
+                        elif (c == '>'): yield (Tokens.ANGL_CLOSE, num)
+                        elif (c == '='): yield (Tokens.EQUAL_SIGN, num)
+                        elif (c == "'"): yield(Tokens.APOSTROPHE, num)
                     c = t
                 elif (c == '|'):
                     if text:
-                        yield (Token.TEXT, text)
+                        yield (Tokens.TEXT, text)
                         text = ''
                     try:
                         t = self.getchar()
                     except StopIteration:
-                        yield (Token.PIPE, None)
+                        yield (Tokens.PIPE, None)
                         raise
                     if (t == '-'):
-                        yield (Token.TAB_NEWLINE, None)
+                        yield (Tokens.TAB_NEWLINE, None)
                         c = self.getchar()
                     elif (t == '}'):
-                        yield (Token.TAB_CLOSE, None)
+                        yield (Tokens.TAB_CLOSE, None)
                         c = self.getchar()
-                    else:
-                        yield (Token.PIPE, None)
+                    else:
+                        num = 1
+                        while (t == c):
+                            num += 1
+                            t = self.getchar()
+                        yield (Tokens.PIPE, num)
                         c = t
                 elif re.match('\s', c): # whitespace eater pro (TM)
                     if text:
-                        yield (Token.TEXT, text)
+                        yield (Tokens.TEXT, text)
                         text = ''
                     ws = ''
                     try:
@@ -95,16 +111,16 @@
                             c = self.getchar() #eat up remaining whitespace
                     finally:
                         if (ws.count('\n') > 1):
-                            yield (Token.NEWPAR, ws)
+                            yield (Tokens.NEWPAR, ws)
                         else:
-                            yield (Token.WHITESPACE, ws)
+                            yield (Tokens.WHITESPACE, ws)
                 else:
                     text = text + c
                     c = self.getchar()
         except StopIteration:
             pass
         if text:
-            yield (Token.TEXT, text)
-        yield (Token.EOF, None)
+            yield (Tokens.TEXT, text)
+        yield (Tokens.EOF, None)
 
     def getchar(self):
         return self.data.next()
\ No newline at end of file
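
For context: after this change the lexer yields (token, value) pairs in which the token is a named Token instance (repr '<T_NAME>') instead of a bare integer, so lexer output is self-describing. A minimal usage sketch in the project's Python 2 idiom; the sample input is made up, and the Lexer constructor is assumed to store an iterator over the input as self.data (the constructor is not part of this diff):

    # Sketch only: Lexer construction is an assumption, not shown in this rev.
    from Lexer import Lexer, Tokens

    lex = Lexer(iter('[[Foo|bar]] baz'))   # getchar() reads via self.data.next()
    for token, value in lex.lexer():
        print token, value                 # e.g. <T_SQRE_OPEN> 2
        if token is Tokens.EOF:
            break
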
Modified: trunk/pywikiparser/ObjectTree/Element.py
===================================================================
--- trunk/pywikiparser/ObjectTree/Element.py	2007-07-28 13:29:12 UTC (rev 3918)
+++ trunk/pywikiparser/ObjectTree/Element.py	2007-07-29 20:06:39 UTC (rev 3919)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 """ Simple object tree system for python.
 
 This module contains the Element class
@@ -84,8 +84,11 @@
             arg.parent = self
         else:
             raise TypeError(u'Argument is of %r; expected <type \'BaseElement\'>.' % (type(arg),))
+
+    def extend(self, list):
+        for item in list:
+            self.append(item)
-
     def appendElement(self, *args, **kwargs):
         element = Element(*args, **kwargs)
         self.append(element)
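
The new Element.extend() is shorthand for repeated append(): it walks any iterable and appends each item, so it takes the same mix of strings and child elements that append() accepts. A quick sketch with made-up element names:

    import ObjectTree as dom

    root = dom.Element('wikipage')                        # hypothetical sample tree
    root.extend(['some text ', dom.Element('pagelink')])  # same as two append() calls
    # Caveat: a bare string is itself iterable, so extend('abc') would
    # append 'a', 'b' and 'c' as three separate children.
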
Deleted: trunk/pywikiparser/Parser.py
===================================================================
--- trunk/pywikiparser/Parser.py	2007-07-28 13:29:12 UTC (rev 3918)
+++ trunk/pywikiparser/Parser.py	2007-07-29 20:06:39 UTC (rev 3919)
@@ -1,208 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Mediawiki wikitext parser """
-#
-# (C) 2007 Merlijn 'valhallasw' van Deen
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-
-#
-
-import re
-import warnings
-
-import ObjectTree as dom
-
-from Lexer import Lexer, Token
-
-
-# System loosely based on 'the dragon book':
-# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
-
-class ParseError(Exception):
-    """ booh """
-
-class parser:
-    def __init__(self, string):
-        self.wikipwn = [a for a in lexer(string).lexer()]
-        self.counter = 0
-
-    def expect(self, types, values=None):
-        #print 'Expect: %s %s' % (types, values)
-        token = self.wikipwn[self.counter]
-        if (token[0] not in types):
-            if values:
-                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
-            else:
-                raise ParseError("Expected one of (%r), got %r" % (types, token))
-        if values:
-            if (token[1] not in values):
-                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
-        self.counter += 1
-        return token
-
-    def parsetext(self):
-        data = ''
-        try:
-            while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
-        except ParseError, e: pass
-
-        k = dom.Element('parsetext')
-        k.append(data)
-        return k
-
-    def parseurl(self):
-        pre = self.expect([lexer.SQRE_OPEN])[1]-1
-        url = self.expect([lexer.TEXT])[1]
-        # checkurl, raise ParseError
-        ws = ''
-        try:
-            ws = self.expect([lexer.WHITESPACE])[1]
-        except ParseError: pass
-
-        if '\n' in ws:
-            raise ParseError('No newlines allowed in external links')
-
-        desc = ''
-        try:
-            while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
-        except ParseError, e: pass
-
-        aft = self.expect([lexer.SQRE_CLOSE])[1]-1
-
-        root = dom.Element('parseurl')
-        root.append('['*pre)
-        extlink = root.appendElement('externallink')
-        extlink.appendElement('url').append(url)
-        if len(desc) > 0:
-            extlink.appendElement('description').append(desc)
-        root.append(']'*aft)
-
-        return root
-
-    def parsewikilink(self):
-        pre = self.expect([lexer.SQRE_OPEN])[1]-2
-        if (pre < 0): raise ParseError('Not a wiki link')
-
-        page = ''
-        try:
-            while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
-        except ParseError,e: pass
-        # if not re.match(...): raise ParseError
-
-        root = dom.Element('parsewikilink')
-        root.append('['*pre)
-        pagelink = root.appendElement('pagelink')
-        pagelink.appendElement('title').append(page)
-        print 'wikilink: %s' % page
-        try:
-            while(True):
-                root.append(self.parseparameter(breaktokens=[lexer.SQRE_CLOSE]))
-        except ParseError, e: pass
-        print 'result: %r' % (root,)
-        aft = self.expect([lexer.SQRE_CLOSE])[1]-2
-        if (aft < 0):
-            raise ParseError('Not a wiki link')
-
-        root.append(']'*aft)
-        return root
-
-    def parseparameter(self, breaktokens=None):
-        if breaktokens:
-            breaktokens.append(lexer.PIPE)
-        else:
-            breaktokens = [lexer.PIPE]
-        try:
-            while(True): self.expect([lexer.WHITESPACE]) #eat whitespace
-        except ParseError: pass
-        self.expect([lexer.PIPE])
-        #now we can expect anything except a loose pipe.
-        data = self.parse(breaktokens=breaktokens)
-        return dom.Element('parameter', {}, data)
-
-    def parseone(self, breaktokens=[]):
-        token = self.wikipwn[self.counter]
-        if (token[0] == lexer.EOF) or (token[0] in breaktokens):
-            raise StopIteration
-
-        if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text
-            try: return self.parsetext();
-            except ParseError, e: pass
-
-        if (token[0] == lexer.SQRE_OPEN): #wikilink or external link
-            begin = self.counter
-            try: return self.parsewikilink();
-            except ParseError, e: pass
-            self.counter = begin
-            try: return self.parseurl();
-            except ParseError, e: pass
-            self.counter = begin
-            return ('[' * self.expect([lexer.SQRE_OPEN])[1])
-
-        if (token[0] == lexer.SQRE_CLOSE):
-            return ']'*self.expect([lexer.SQRE_CLOSE])[1]
-
-        if (token[0] == lexer.PIPE):
-            self.expect([lexer.PIPE])
-            return '|'
-
-        if (token[0] == lexer.CURL_OPEN):
-            #parse_template
-            warnings.warn("Not implemented yet. Returning string")
-            return '{'*self.expect([lexer.CURL_OPEN])[1]
-
-        if (token[0] == lexer.CURL_CLOSE):
-            return '}'*self.expect([lexer.CURL_CLOSE])[1]
-
-        if (token[0] == lexer.ANGL_OPEN):
-            #parse html
-            warnings.warn("Not implemented yet. Returning string")
-            return '<'*self.expect([lexer.ANGL_OPEN])[1]
-
-        if (token[0] == lexer.ANGL_CLOSE):
-            return '>'*self.expect([lexer.ANGL_CLOSE])[1]
-
-        if (token[0] == lexer.NEWPAR):
-            self.expect([lexer.NEWPAR])
-            return '\n\n'
-
-        if (token[0] == lexer.TAB_OPEN):
-            # parse wikitable
-            warnings.warn("Not implemented yet. Returning string")
-            self.expect([lexer.TAB_OPEN])
-            return '(|'
-
-        if (token[0] == lexer.TAB_NEWLINE):
-            self.expect([lexer.TAB_NEWLINE])
-            return '|-'
-
-        if (token[0] == lexer.TAB_CLOSE):
-            self.expect([lexer.TAB_CLOSE])
-            return '|}'
-
-        if (token[0] == lexer.WHITESPACE):
-            return self.expect([lexer.WHITESPACE])[1]
-
-        if (token[0] == lexer.EQUAL_SIGN):
-            return '='*self.expect([lexer.EQUAL_SIGN])[1]
-
-        if (token[0] == lexer.APOSTROPHE):
-            return "'"*self.expect([lexer.APOSTROPHE])[1]
-
-        else:
-            raise Exception, 'ZOMG THIS CANNOT HAPPEN'
-
-    def parseonegenerator(self, *args, **kwargs):
-        while(True):
-            yield self.parseone(*args, **kwargs)
-
-    def parse(self, *args, **kwargs):
-        root = dom.Element('wikipage')
-        for data in self.parseonegenerator(*args, **kwargs):
-            root.extend(data)
-        return root
-
-
-
\ No newline at end of file
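
The parser moved aside here is a hand-written backtracking recursive-descent parser: expect() raises ParseError on a token mismatch, and parseone() saves self.counter, tries one alternative (wiki link, then external link), and rewinds the counter when a branch fails. A self-contained sketch of that save/try/rewind pattern, with illustrative names only (this is not project code):

    class ParseError(Exception):
        pass

    class MiniParser:
        def __init__(self, tokens):
            self.tokens = tokens          # list of (type, value) pairs
            self.counter = 0

        def expect(self, types):
            # Consume one token if its type matches, else raise.
            token = self.tokens[self.counter]
            if token[0] not in types:
                raise ParseError('Expected one of %r, got %r' % (types, token))
            self.counter += 1
            return token

        def first_of(self, *alternatives):
            # Try each parse function in turn, rewinding on failure.
            begin = self.counter
            for alt in alternatives:
                try:
                    return alt()
                except ParseError:
                    self.counter = begin
            raise ParseError('no alternative matched')

The cost of this style is re-scanning the same tokens whenever an alternative fails; that is cheap for short constructs like link prefixes but grows with nesting depth.
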
Copied: trunk/pywikiparser/Parser.py.old (from rev 3918, trunk/pywikiparser/Parser.py)
===================================================================