Revision: 3918 Author: valhallasw Date: 2007-07-28 13:29:12 +0000 (Sat, 28 Jul 2007)
Log Message: ----------- Split parser. XML parser now with $Id$ ;)
Modified Paths: -------------- trunk/pywikiparser/ObjectTree/XMLParse.py
Added Paths: ----------- trunk/pywikiparser/Lexer.py trunk/pywikiparser/Parser.py
Removed Paths: ------------- trunk/pywikiparser/parser.py
# -*- coding: utf-8 -*-
""" Mediawiki wikitext lexer """
#
# (C) 2007 Merlijn 'valhallasw' van Deen
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'

import re


class Token:
    """Numeric token-type constants emitted by Lexer.lexer()."""
    TEXT        = 258   # Text
    SQRE_OPEN   = 259   # [    Square bracket open
    SQRE_CLOSE  = 260   # ]    Square bracket close
    PIPE        = 261   # |    Pipe symbol
    EQUAL_SIGN  = 262   # =    Equal sign
    APOSTROPHE  = 263   # '    Apostrophe
    CURL_OPEN   = 264   # {    Curly bracket open
    CURL_CLOSE  = 265   # }    Curly bracket close
    ANGL_OPEN   = 266   # <    Angular bracket open
    ANGL_CLOSE  = 267   # >    Angular bracket close
    NEWPAR      = 268   # \n\n New paragraph
    TAB_OPEN    = 269   # {|   Table opening symbol (reserved; lexer itself never emits it)
    TAB_NEWLINE = 270   # |-   Table new row symbol
    TAB_CLOSE   = 271   # |}   Table closing symbol
    WHITESPACE  = 272   # Whitespace with max 1 newline
    EOF         = 273   # End of file


# Repeatable single-character markup: a run like '[[' lexes as (SQRE_OPEN, 2).
_REPEATED = {
    '[': Token.SQRE_OPEN,
    ']': Token.SQRE_CLOSE,
    '{': Token.CURL_OPEN,
    '}': Token.CURL_CLOSE,
    '<': Token.ANGL_OPEN,
    '>': Token.ANGL_CLOSE,
    '=': Token.EQUAL_SIGN,
    "'": Token.APOSTROPHE,
}

# Sentinel used by lexer() to mark exhausted input.
_END = object()


class Lexer:
    """ Lexer class for mediawiki wikitext. Used by the Parser module.

    lexer() yields (token type, data) tuples, where data is
      * the run length for repeatable markup characters ([ ] { } < > = '),
      * the collected characters for TEXT / WHITESPACE / NEWPAR tokens,
      * None for PIPE, TAB_NEWLINE, TAB_CLOSE and EOF.

    >>> [tok for tok in Lexer("ab [[link|desc]] cd").lexer()]
    [(258, 'ab'), (272, ' '), (259, 2), (258, 'link'), (261, None), (258, 'desc'), (260, 2), (272, ' '), (258, 'cd'), (273, None)]
    """

    def __init__(self, string):
        # Character source; consumed exactly once by lexer().
        self.data = iter(string)

    def lexer(self):
        """Generate (token, data) tuples; the stream always ends with (Token.EOF, None).

        Implementation note: letting getchar()'s StopIteration propagate out
        of this generator would become RuntimeError under PEP 479, so end of
        input is tracked with the _END sentinel instead.
        """
        def nextchar():
            # Next character, or the _END sentinel when input is exhausted.
            return next(self.data, _END)

        text = ''
        c = nextchar()
        while c is not _END:
            if c in _REPEATED:
                if text:
                    yield (Token.TEXT, text)
                    text = ''
                # Count the whole run of identical markup characters.
                num = 1
                t = nextchar()
                while t == c:
                    num += 1
                    t = nextchar()
                yield (_REPEATED[c], num)
                c = t
            elif c == '|':
                if text:
                    yield (Token.TEXT, text)
                    text = ''
                t = nextchar()
                if t is _END:
                    # Trailing '|' at end of input: emit the pipe, then fall
                    # through to the final EOF token.
                    yield (Token.PIPE, None)
                    break
                if t == '-':                      # '|-' starts a new table row
                    yield (Token.TAB_NEWLINE, None)
                    c = nextchar()
                elif t == '}':                    # '|}' closes a table
                    yield (Token.TAB_CLOSE, None)
                    c = nextchar()
                else:
                    yield (Token.PIPE, None)
                    c = t
            elif re.match(r'\s', c):              # whitespace eater pro (TM)
                if text:
                    yield (Token.TEXT, text)
                    text = ''
                ws = ''
                while c is not _END and re.match(r'\s', c):
                    ws += c
                    c = nextchar()                # eat up remaining whitespace
                if ws.count('\n') > 1:
                    # A blank line separates paragraphs.
                    yield (Token.NEWPAR, ws)
                else:
                    yield (Token.WHITESPACE, ws)
            else:
                text += c
                c = nextchar()
        if text:
            yield (Token.TEXT, text)
        yield (Token.EOF, None)

    def getchar(self):
        """Return the next input character; raises StopIteration when exhausted."""
        return next(self.data)
Property changes on: trunk/pywikiparser/Lexer.py ___________________________________________________________________ Name: svn:keywords + Id *.c = svn:eol-style=native *.cpp = svn:eol-style=native *.h = svn:eol-style=native *.dsp = svn:eol-style=CRLF *.dsw = svn:eol-style=CRLF *.sh = svn:eol-style=native Name: svn:executable *.txt + svn:eol-style=native *.png = svn:mime-type=image/png *.jpg = svn:mime-type=image/jpeg Makefile = svn:eol-style=native Name: svn:eol-style + native
Modified: trunk/pywikiparser/ObjectTree/XMLParse.py =================================================================== --- trunk/pywikiparser/ObjectTree/XMLParse.py 2007-07-28 13:09:37 UTC (rev 3917) +++ trunk/pywikiparser/ObjectTree/XMLParse.py 2007-07-28 13:29:12 UTC (rev 3918) @@ -8,6 +8,7 @@ # # Distributed under the terms of the MIT license. # +__version__ = u'$Id$'
import warnings import xml.sax
# -*- coding: utf-8 -*-
""" Mediawiki wikitext parser """
#
# (C) 2007 Merlijn 'valhallasw' van Deen
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'

#

import re
import warnings

import ObjectTree as dom

from Lexer import Lexer, Token


# System loosely based on 'the dragon book':
# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986

class ParseError(Exception):
    """ booh """


class parser:
    """Recursive-descent parser building an ObjectTree DOM from wikitext.

    Tokens come from Lexer.lexer(); self.counter is the index of the next
    unconsumed token, which makes backtracking a simple counter reset.
    """

    def __init__(self, string):
        # Materialize the whole token stream up front so we can backtrack.
        # (Fixed: this used to call the removed lowercase 'lexer' class
        # even though the file imports Lexer/Token from the Lexer module.)
        self.wikipwn = [a for a in Lexer(string).lexer()]
        self.counter = 0

    def expect(self, types, values=None):
        """Consume and return the next token.

        Raises ParseError if its type is not in `types` or, when `values`
        is given, its data is not in `values`. On failure the counter is
        left untouched so callers can backtrack.
        """
        token = self.wikipwn[self.counter]
        if token[0] not in types:
            if values:
                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
            else:
                raise ParseError("Expected one of (%r), got %r" % (types, token))
        if values and token[1] not in values:
            raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
        self.counter += 1
        return token

    def parsetext(self):
        """Consume a run of TEXT/WHITESPACE tokens into a 'parsetext' element."""
        data = ''
        try:
            while True:
                data += self.expect([Token.TEXT, Token.WHITESPACE])[1]
        except ParseError:
            pass

        k = dom.Element('parsetext')
        k.append(data)
        return k

    def parseurl(self):
        """Parse an external link: [url description].

        Raises ParseError when the bracket counts do not match a single
        external link or a newline appears inside the link.
        """
        pre = self.expect([Token.SQRE_OPEN])[1] - 1
        url = self.expect([Token.TEXT])[1]
        # TODO: validate the url syntax here and raise ParseError on failure.
        ws = ''
        try:
            ws = self.expect([Token.WHITESPACE])[1]
        except ParseError:
            pass

        if '\n' in ws:
            raise ParseError('No newlines allowed in external links')

        desc = ''
        try:
            while True:
                desc += self.expect([Token.TEXT, Token.WHITESPACE])[1]
        except ParseError:
            pass

        aft = self.expect([Token.SQRE_CLOSE])[1] - 1

        root = dom.Element('parseurl')
        # Surplus brackets around the link are re-emitted as literal text.
        root.append('[' * pre)
        extlink = root.appendElement('externallink')
        extlink.appendElement('url').append(url)
        if len(desc) > 0:
            extlink.appendElement('description').append(desc)
        root.append(']' * aft)

        return root

    def parsewikilink(self):
        """Parse [[page|parameter|...]]; raises ParseError if not a wikilink."""
        pre = self.expect([Token.SQRE_OPEN])[1] - 2
        if pre < 0:
            raise ParseError('Not a wiki link')

        page = ''
        try:
            while True:
                page += self.expect([Token.TEXT, Token.WHITESPACE])[1]
        except ParseError:
            pass
        # TODO: if not re.match(...): raise ParseError (validate the title)

        root = dom.Element('parsewikilink')
        root.append('[' * pre)
        pagelink = root.appendElement('pagelink')
        pagelink.appendElement('title').append(page)
        print('wikilink: %s' % page)      # debug output
        try:
            while True:
                root.append(self.parseparameter(breaktokens=[Token.SQRE_CLOSE]))
        except ParseError:
            pass
        print('result: %r' % (root,))     # debug output
        aft = self.expect([Token.SQRE_CLOSE])[1] - 2
        if aft < 0:
            raise ParseError('Not a wiki link')

        root.append(']' * aft)
        return root

    def parseparameter(self, breaktokens=None):
        """Parse one '|'-prefixed parameter, stopping at PIPE or breaktokens."""
        # Copy rather than append so the caller's list is never mutated.
        if breaktokens:
            breaktokens = list(breaktokens) + [Token.PIPE]
        else:
            breaktokens = [Token.PIPE]
        try:
            while True:
                self.expect([Token.WHITESPACE])   # eat whitespace
        except ParseError:
            pass
        self.expect([Token.PIPE])
        # now we can expect anything except a loose pipe.
        data = self.parse(breaktokens=breaktokens)
        return dom.Element('parameter', {}, data)

    def parseone(self, breaktokens=()):
        """Parse a single construct and return a dom element or literal string.

        Raises StopIteration at EOF or when the next token is in breaktokens.
        """
        token = self.wikipwn[self.counter]
        if token[0] == Token.EOF or token[0] in breaktokens:
            raise StopIteration

        if token[0] in (Token.TEXT, Token.WHITESPACE):  # text
            try:
                return self.parsetext()
            except ParseError:
                pass

        if token[0] == Token.SQRE_OPEN:  # wikilink or external link
            begin = self.counter
            try:
                return self.parsewikilink()
            except ParseError:
                pass
            self.counter = begin         # backtrack and retry as external link
            try:
                return self.parseurl()
            except ParseError:
                pass
            self.counter = begin         # backtrack and emit literal brackets
            return '[' * self.expect([Token.SQRE_OPEN])[1]

        if token[0] == Token.SQRE_CLOSE:
            return ']' * self.expect([Token.SQRE_CLOSE])[1]

        if token[0] == Token.PIPE:
            self.expect([Token.PIPE])
            return '|'

        if token[0] == Token.CURL_OPEN:
            # TODO: parse_template
            warnings.warn("Not implemented yet. Returning string")
            return '{' * self.expect([Token.CURL_OPEN])[1]

        if token[0] == Token.CURL_CLOSE:
            return '}' * self.expect([Token.CURL_CLOSE])[1]

        if token[0] == Token.ANGL_OPEN:
            # TODO: parse html
            warnings.warn("Not implemented yet. Returning string")
            return '<' * self.expect([Token.ANGL_OPEN])[1]

        if token[0] == Token.ANGL_CLOSE:
            return '>' * self.expect([Token.ANGL_CLOSE])[1]

        if token[0] == Token.NEWPAR:
            self.expect([Token.NEWPAR])
            return '\n\n'

        if token[0] == Token.TAB_OPEN:
            # TODO: parse wikitable
            warnings.warn("Not implemented yet. Returning string")
            self.expect([Token.TAB_OPEN])
            return '{|'   # fixed: was '(|', a typo for the table-open literal

        if token[0] == Token.TAB_NEWLINE:
            self.expect([Token.TAB_NEWLINE])
            return '|-'

        if token[0] == Token.TAB_CLOSE:
            self.expect([Token.TAB_CLOSE])
            return '|}'

        if token[0] == Token.EQUAL_SIGN:
            return '=' * self.expect([Token.EQUAL_SIGN])[1]

        if token[0] == Token.APOSTROPHE:
            return "'" * self.expect([Token.APOSTROPHE])[1]

        raise Exception('ZOMG THIS CANNOT HAPPEN')

    def parseonegenerator(self, *args, **kwargs):
        """Yield parsed nodes until parseone() signals the end of input."""
        while True:
            try:
                yield self.parseone(*args, **kwargs)
            except StopIteration:
                # PEP 479: StopIteration must not escape a generator body.
                return

    def parse(self, *args, **kwargs):
        """Parse the whole token stream into a 'wikipage' element."""
        root = dom.Element('wikipage')
        for data in self.parseonegenerator(*args, **kwargs):
            root.extend(data)
        return root
Property changes on: trunk/pywikiparser/Parser.py ___________________________________________________________________ Name: svn:keywords + Id *.c = svn:eol-style=native *.cpp = svn:eol-style=native *.h = svn:eol-style=native *.dsp = svn:eol-style=CRLF *.dsw = svn:eol-style=CRLF *.sh = svn:eol-style=native Name: svn:executable *.txt + svn:eol-style=native *.png = svn:mime-type=image/png *.jpg = svn:mime-type=image/jpeg Makefile = svn:eol-style=native Name: svn:eol-style + native
Deleted: trunk/pywikiparser/parser.py =================================================================== --- trunk/pywikiparser/parser.py 2007-07-28 13:09:37 UTC (rev 3917) +++ trunk/pywikiparser/parser.py 2007-07-28 13:29:12 UTC (rev 3918) @@ -1,217 +0,0 @@ -# -*- coding: utf-8 -*- -""" Mediawiki wikitext parser """ -# -# (C) 2007 Merlijn 'valhallasw' van Deen -# -# Distributed under the terms of the MIT license. -# -__version__ = '$Id$' - -# - -import re -import xml.dom.minidom as dom - -# System loosely based on 'the dragon book': -# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986 - -class ParseError(Exception): - """ booh """ - -class parser: - def __init__(self, string): - self.wikipwn = [a for a in lexer(string).lexer()] - self.counter = 0 - - def expect(self, types, values=None): - #print 'Expect: %s %s' % (types, values) - token = self.wikipwn[self.counter] - if (token[0] not in types): - if values: - raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token)) - else: - raise ParseError("Expected one of (%r), got %r" % (types, token)) - if values: - if (token[1] not in values): - raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token)) - self.counter += 1 - return token - - def parsetext(self): - data = '' - try: - while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1] - except ParseError, e: pass - return data - - def parseurl(self): - pre = self.expect([lexer.SQRE_OPEN])[1]-1 - url = self.expect([lexer.TEXT])[1] - # checkurl, raise ParseError - try: - ws = self.expect([lexer.WHITESPACE])[1] - except ParseError: - aft = self.expect([lexer.SQRE_CLOSE])[1]-1 - return ('['*pre, url, ']'*aft) - - if '\n' in ws: - raise ParseError('No newlines allowed in external links') - desc = '' - try: - while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1] - except ParseError, e: pass - aft = self.expect([lexer.SQRE_CLOSE])[1]-1 - return ('['*pre, url, desc, ']'*aft) - - 
def parsewikilink(self): - pre = self.expect([lexer.SQRE_OPEN])[1]-2 - if (pre < 0): raise ParseError('Not a wiki link') - page = '' - try: - while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1] - except ParseError,e: pass - # if not re.match(...): raise ParseError - try: - aft = self.expect([lexer.SQRE_CLOSE])[1]-2 - except ParseError, e: pass - else: - if (aft < 0): - raise ParseError('Not a wiki link') - return ('['*pre, page, ']'*aft) - print 'boom.' - return 0 - - def parseone(self): - token = self.wikipwn[self.counter] - if (token[0] == lexer.EOF): - raise StopIteration - if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text - try: return self.parsetext(); - except ParseError, e: pass - if (token[0] == lexer.SQRE_OPEN): #wikilink or external link - begin = self.counter - try: return self.parsewikilink(); - except ParseError, e: pass - self.counter = begin - try: return self.parseurl(); - except ParseError, e: pass - self.counter = begin - return ('[' * self.expect([lexer.SQRE_OPEN])[1]) - - try: self.expect([lexer.SQRE_OPEN], [1]) - except ParseError, e: pass - else: return self.parseurl() - - # Wikilink - try: self.expect([lexer.SQRE_OPEN], [2]) #wikilink - except ParseError, e: pass - -class lexer: - class TEXT: - """ Text """ - class SQRE_OPEN: - """ Square bracket open """ - class SQRE_CLOSE: - """ Square bracket close """ - class PIPE: - """ Pipe symbol """ - class CURL_OPEN: - """ Curly bracket open """ - class CURL_CLOSE: - """ Curly bracket close """ - class ANGL_OPEN: - """ Angular bracket open """ - class ANGL_CLOSE: - """ Angular bracket close """ - class NEWPAR: - """ New paragraph """ - class TAB_OPEN: - """ Table open """ - class TAB_NEWLINE: - """ Table new row """ - class TAB_CLOSE: - """ Table close """ - class WHITESPACE: - """ Whitespace """ - class EQUAL_SIGN: - """ Equal sign """ - class APOSTROPHE: - """ Apostrophe """ - class EOF: - """ End Of File """ - - def __init__(self, string): - self.wikipwn = (a for a 
in string) - - def lexer(self): - text = '' - try: - c = self.getchar() - while True: - if (c in ('[', ']', '{', '}', '<', '>', '=', ''')): - if text: - yield (lexer.TEXT, text) - text = '' - num = 1 - try: - t = self.getchar() - while (t == c): - t = self.getchar() - num += 1 - finally: - if (c == '['): yield (lexer.SQRE_OPEN, num) - elif (c == ']'): yield (lexer.SQRE_CLOSE, num) - elif (c == '{'): yield (lexer.CURL_OPEN, num) - elif (c == '}'): yield (lexer.CURL_CLOSE, num) - elif (c == '<'): yield (lexer.ANGL_OPEN, num) - elif (c == '>'): yield (lexer.ANGL_CLOSE, num) - elif (c == '='): yield (lexer.EQUAL_SIGN, num) - elif (c == '''): yield(lexer.APOSTROPHE, num) - c = t - elif (c == '|'): - if text: - yield (lexer.TEXT, text) - text = '' - try: - t = self.getchar() - except StopIteration: - yield (lexer.PIPE, None) - raise - - if (t == '-'): - yield (lexer.TAB_NEWLINE, None) - c = self.getchar() - elif (t == '}'): - yield (lexer.TAB_CLOSE, None) - c = self.getchar() - else: - yield (lexer.PIPE, None) - c = t - elif re.match('\s', c): # whitespace eater pro (TM) - if text: - yield (lexer.TEXT, text) - text = '' - ws = '' - try: - while re.match('\s', c): - ws += c - c = self.getchar() #eat up remaining whitespace - finally: - if (ws.count('\n') > 1): - yield (lexer.NEWPAR, ws) - else: - yield (lexer.WHITESPACE, ws) - else: - text = text + c - c = self.getchar() - except StopIteration: pass - if text: - yield (lexer.TEXT, text) - yield (lexer.EOF, None) - - def getchar(self): - return self.wikipwn.next() - - - - \ No newline at end of file