Revision: 3918
Author: valhallasw
Date: 2007-07-28 13:29:12 +0000 (Sat, 28 Jul 2007)
Log Message:
-----------
Split parser. XML parser now with $Id$ ;)
Modified Paths:
--------------
trunk/pywikiparser/ObjectTree/XMLParse.py
Added Paths:
-----------
trunk/pywikiparser/Lexer.py
trunk/pywikiparser/Parser.py
Removed Paths:
-------------
trunk/pywikiparser/parser.py
Added: trunk/pywikiparser/Lexer.py
===================================================================
--- trunk/pywikiparser/Lexer.py (rev 0)
+++ trunk/pywikiparser/Lexer.py 2007-07-28 13:29:12 UTC (rev 3918)
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+""" Mediawiki wikitext lexer """
+#
+# (C) 2007 Merlijn 'valhallasw' van Deen
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+import re
+
class Token:
    """ Token types emitted by the Lexer.

    Values start at 258 so they can never collide with single-character
    (ASCII) codes.
    """
    TEXT        = 258   # Text
    SQRE_OPEN   = 259   # [    Square bracket open
    SQRE_CLOSE  = 260   # ]    Square bracket close
    PIPE        = 261   # |    Pipe symbol
    EQUAL_SIGN  = 262   # =    Equal sign
    APOSTROPHE  = 263   # '    Apostrophe
    CURL_OPEN   = 264   # {    Curly bracket open
    CURL_CLOSE  = 265   # }    Curly bracket close
    ANGL_OPEN   = 266   # <    Angular bracket open
    ANGL_CLOSE  = 267   # >    Angular bracket close
    NEWPAR      = 268   # \n\n New paragraph
    TAB_OPEN    = 269   # {|   Table opening symbol
    TAB_NEWLINE = 270   # |-   Table new row symbol
    TAB_CLOSE   = 271   # |}   Table closing symbol
    WHITESPACE  = 272   #      Whitespace with max 1 newline
    EOF         = 273   # End of file


class Lexer:
    """ Lexer class for mediawiki wikitext. Used by the Parser module.

    lexer() is a generator yielding (token, data) tuples. For the
    repeatable symbols ([ ] { } < > = ') data is the repeat count; for
    TEXT, WHITESPACE and NEWPAR it is the matched string; otherwise None.

    >>> list(Lexer('ab [x|y]').lexer())
    [(258, 'ab'), (272, ' '), (259, 1), (258, 'x'), (261, None), (258, 'y'), (260, 1), (273, None)]
    """

    def __init__(self, string):
        self.data = iter(string)

    def lexer(self):
        """ Generate (token, data) tuples; always ends with (EOF, None). """
        text = ''
        try:
            c = self.getchar()
            while True:
                if c in ('[', ']', '{', '}', '<', '>', '=', "'"):
                    if text:
                        yield (Token.TEXT, text)
                        text = ''
                    num = 1
                    try:
                        t = self.getchar()
                        while t == c:
                            num += 1
                            t = self.getchar()
                    finally:
                        # finally: emit the run even when input ends inside it
                        if c == '[':
                            yield (Token.SQRE_OPEN, num)
                        elif c == ']':
                            yield (Token.SQRE_CLOSE, num)
                        elif c == '{':
                            yield (Token.CURL_OPEN, num)
                        elif c == '}':
                            yield (Token.CURL_CLOSE, num)
                        elif c == '<':
                            yield (Token.ANGL_OPEN, num)
                        elif c == '>':
                            yield (Token.ANGL_CLOSE, num)
                        elif c == '=':
                            yield (Token.EQUAL_SIGN, num)
                        elif c == "'":
                            yield (Token.APOSTROPHE, num)
                    c = t
                elif c == '|':
                    if text:
                        yield (Token.TEXT, text)
                        text = ''
                    try:
                        t = self.getchar()
                    except StopIteration:
                        # input ends directly after the pipe
                        yield (Token.PIPE, None)
                        raise

                    if t == '-':
                        yield (Token.TAB_NEWLINE, None)
                        c = self.getchar()
                    elif t == '}':
                        yield (Token.TAB_CLOSE, None)
                        c = self.getchar()
                    else:
                        yield (Token.PIPE, None)
                        c = t
                elif re.match(r'\s', c):  # whitespace eater pro (TM)
                    if text:
                        yield (Token.TEXT, text)
                        text = ''
                    ws = ''
                    try:
                        while re.match(r'\s', c):
                            ws += c
                            c = self.getchar()  # eat up remaining whitespace
                    finally:
                        # two or more newlines separate paragraphs
                        if ws.count('\n') > 1:
                            yield (Token.NEWPAR, ws)
                        else:
                            yield (Token.WHITESPACE, ws)
                else:
                    text += c
                    c = self.getchar()
        except StopIteration:
            pass
        if text:
            yield (Token.TEXT, text)
        yield (Token.EOF, None)

    def getchar(self):
        """ Return the next input character; raises StopIteration at end. """
        # next() builtin instead of .next(): works on Python 2.6+ and 3.x
        return next(self.data)
\ No newline at end of file
Property changes on: trunk/pywikiparser/Lexer.py
___________________________________________________________________
Name: svn:keywords
+ Id *.c = svn:eol-style=native *.cpp = svn:eol-style=native *.h = svn:eol-style=native *.dsp = svn:eol-style=CRLF *.dsw = svn:eol-style=CRLF *.sh = svn:eol-style=native
Name: svn:executable *.txt
+ svn:eol-style=native *.png = svn:mime-type=image/png *.jpg = svn:mime-type=image/jpeg Makefile = svn:eol-style=native
Name: svn:eol-style
+ native
Modified: trunk/pywikiparser/ObjectTree/XMLParse.py
===================================================================
--- trunk/pywikiparser/ObjectTree/XMLParse.py 2007-07-28 13:09:37 UTC (rev 3917)
+++ trunk/pywikiparser/ObjectTree/XMLParse.py 2007-07-28 13:29:12 UTC (rev 3918)
@@ -8,6 +8,7 @@
#
# Distributed under the terms of the MIT license.
#
+__version__ = u'$Id$'
import warnings
import xml.sax
Added: trunk/pywikiparser/Parser.py
===================================================================
--- trunk/pywikiparser/Parser.py (rev 0)
+++ trunk/pywikiparser/Parser.py 2007-07-28 13:29:12 UTC (rev 3918)
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+""" Mediawiki wikitext parser """
+#
+# (C) 2007 Merlijn 'valhallasw' van Deen
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+#
+
+import re
+import warnings
+
+import ObjectTree as dom
+
+from Lexer import Lexer, Token
+
+
+# System loosely based on 'the dragon book':
+# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
+
class ParseError(Exception):
    """ Raised when the token stream does not match the expected grammar. """
+
class parser:
    """ Recursive-descent wikitext parser.

    Consumes the (token, data) stream produced by Lexer and builds an
    ObjectTree document rooted at a 'wikipage' element. Parse methods
    signal failure with ParseError; callers backtrack by rewinding
    self.counter.
    """

    def __init__(self, string):
        # Materialize the token stream so backtracking (rewinding
        # self.counter) is possible. NOTE: the original referenced the
        # removed `lexer` class; the new module imports Lexer and Token.
        self.wikipwn = list(Lexer(string).lexer())
        self.counter = 0

    def expect(self, types, values=None):
        """ Return the current token and advance past it.

        Raises ParseError if the token's type is not in `types`, or,
        when `values` is given, if its data is not in `values`.
        """
        #print 'Expect: %s %s' % (types, values)
        token = self.wikipwn[self.counter]
        if token[0] not in types:
            if values:
                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
            else:
                raise ParseError("Expected one of (%r), got %r" % (types, token))
        if values and token[1] not in values:
            raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
        self.counter += 1
        return token

    def parsetext(self):
        """ Greedily concatenate TEXT and WHITESPACE tokens into one element. """
        data = ''
        try:
            while True:
                data += self.expect([Token.TEXT, Token.WHITESPACE])[1]
        except ParseError:
            pass

        k = dom.Element('parsetext')
        k.append(data)
        return k

    def parseurl(self):
        """ Parse an external link: [url description]. Raises ParseError. """
        pre = self.expect([Token.SQRE_OPEN])[1] - 1
        url = self.expect([Token.TEXT])[1]
        # TODO: checkurl, raise ParseError on malformed urls
        ws = ''
        try:
            ws = self.expect([Token.WHITESPACE])[1]
        except ParseError:
            pass

        if '\n' in ws:
            raise ParseError('No newlines allowed in external links')

        desc = ''
        try:
            while True:
                desc += self.expect([Token.TEXT, Token.WHITESPACE])[1]
        except ParseError:
            pass

        aft = self.expect([Token.SQRE_CLOSE])[1] - 1

        root = dom.Element('parseurl')
        root.append('[' * pre)       # surplus opening brackets become text
        extlink = root.appendElement('externallink')
        extlink.appendElement('url').append(url)
        if desc:
            extlink.appendElement('description').append(desc)
        root.append(']' * aft)       # surplus closing brackets become text

        return root

    def parsewikilink(self):
        """ Parse an internal link: [[page|parameter|...]]. Raises ParseError. """
        pre = self.expect([Token.SQRE_OPEN])[1] - 2
        if pre < 0:
            raise ParseError('Not a wiki link')

        page = ''
        try:
            while True:
                page += self.expect([Token.TEXT, Token.WHITESPACE])[1]
        except ParseError:
            pass
        # TODO: if not re.match(...): raise ParseError (validate page title)

        root = dom.Element('parsewikilink')
        root.append('[' * pre)
        pagelink = root.appendElement('pagelink')
        pagelink.appendElement('title').append(page)
        print('wikilink: %s' % page)   # debug output kept from original
        try:
            while True:
                root.append(self.parseparameter(breaktokens=[Token.SQRE_CLOSE]))
        except ParseError:
            pass
        print('result: %r' % (root,))  # debug output kept from original
        aft = self.expect([Token.SQRE_CLOSE])[1] - 2
        if aft < 0:
            raise ParseError('Not a wiki link')

        root.append(']' * aft)
        return root

    def parseparameter(self, breaktokens=None):
        """ Parse a |parameter, stopping at PIPE or any token in breaktokens. """
        # Copy instead of append: never mutate the caller's list in place.
        if breaktokens:
            breaktokens = list(breaktokens) + [Token.PIPE]
        else:
            breaktokens = [Token.PIPE]
        try:
            while True:
                self.expect([Token.WHITESPACE])  # eat whitespace
        except ParseError:
            pass
        self.expect([Token.PIPE])
        # now we can expect anything except a loose pipe.
        data = self.parse(breaktokens=breaktokens)
        return dom.Element('parameter', {}, data)

    def parseone(self, breaktokens=None):
        """ Parse one construct and return it (element or plain string).

        Raises StopIteration at EOF or when the current token is in
        breaktokens.
        """
        breaktokens = breaktokens or []  # no mutable default argument
        token = self.wikipwn[self.counter]
        if token[0] == Token.EOF or token[0] in breaktokens:
            raise StopIteration

        # text (parsetext swallows its own ParseErrors, so this returns)
        if token[0] in (Token.TEXT, Token.WHITESPACE):
            return self.parsetext()

        if token[0] == Token.SQRE_OPEN:  # wikilink or external link
            begin = self.counter
            try:
                return self.parsewikilink()
            except ParseError:
                pass
            self.counter = begin         # backtrack and retry as url
            try:
                return self.parseurl()
            except ParseError:
                pass
            self.counter = begin         # backtrack: plain brackets
            return '[' * self.expect([Token.SQRE_OPEN])[1]

        if token[0] == Token.SQRE_CLOSE:
            return ']' * self.expect([Token.SQRE_CLOSE])[1]

        if token[0] == Token.PIPE:
            self.expect([Token.PIPE])
            return '|'

        if token[0] == Token.CURL_OPEN:
            # TODO: parse_template
            warnings.warn("Not implemented yet. Returning string")
            return '{' * self.expect([Token.CURL_OPEN])[1]

        if token[0] == Token.CURL_CLOSE:
            return '}' * self.expect([Token.CURL_CLOSE])[1]

        if token[0] == Token.ANGL_OPEN:
            # TODO: parse html
            warnings.warn("Not implemented yet. Returning string")
            return '<' * self.expect([Token.ANGL_OPEN])[1]

        if token[0] == Token.ANGL_CLOSE:
            return '>' * self.expect([Token.ANGL_CLOSE])[1]

        if token[0] == Token.NEWPAR:
            self.expect([Token.NEWPAR])
            return '\n\n'

        if token[0] == Token.TAB_OPEN:
            # TODO: parse wikitable
            warnings.warn("Not implemented yet. Returning string")
            self.expect([Token.TAB_OPEN])
            return '{|'  # was '(|' — typo for the table-open symbol

        if token[0] == Token.TAB_NEWLINE:
            self.expect([Token.TAB_NEWLINE])
            return '|-'

        if token[0] == Token.TAB_CLOSE:
            self.expect([Token.TAB_CLOSE])
            return '|}'

        if token[0] == Token.EQUAL_SIGN:
            return '=' * self.expect([Token.EQUAL_SIGN])[1]

        if token[0] == Token.APOSTROPHE:
            return "'" * self.expect([Token.APOSTROPHE])[1]

        raise Exception('ZOMG THIS CANNOT HAPPEN')

    def parseonegenerator(self, *args, **kwargs):
        """ Yield parsed constructs until parseone signals the end. """
        while True:
            try:
                yield self.parseone(*args, **kwargs)
            except StopIteration:
                # Convert parseone's end-signal into a clean generator
                # return (PEP 479: StopIteration must not escape a generator).
                return

    def parse(self, *args, **kwargs):
        """ Parse the whole token stream into a 'wikipage' element. """
        root = dom.Element('wikipage')
        for data in self.parseonegenerator(*args, **kwargs):
            root.extend(data)
        return root
+
+
+
+
\ No newline at end of file
Property changes on: trunk/pywikiparser/Parser.py
___________________________________________________________________
Name: svn:keywords
+ Id *.c = svn:eol-style=native *.cpp = svn:eol-style=native *.h = svn:eol-style=native *.dsp = svn:eol-style=CRLF *.dsw = svn:eol-style=CRLF *.sh = svn:eol-style=native
Name: svn:executable *.txt
+ svn:eol-style=native *.png = svn:mime-type=image/png *.jpg = svn:mime-type=image/jpeg Makefile = svn:eol-style=native
Name: svn:eol-style
+ native
Deleted: trunk/pywikiparser/parser.py
===================================================================
--- trunk/pywikiparser/parser.py 2007-07-28 13:09:37 UTC (rev 3917)
+++ trunk/pywikiparser/parser.py 2007-07-28 13:29:12 UTC (rev 3918)
@@ -1,217 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Mediawiki wikitext parser """
-#
-# (C) 2007 Merlijn 'valhallasw' van Deen
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-
-#
-
-import re
-import xml.dom.minidom as dom
-
-# System loosely based on 'the dragon book':
-# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
-
-class ParseError(Exception):
- """ booh """
-
-class parser:
- def __init__(self, string):
- self.wikipwn = [a for a in lexer(string).lexer()]
- self.counter = 0
-
- def expect(self, types, values=None):
- #print 'Expect: %s %s' % (types, values)
- token = self.wikipwn[self.counter]
- if (token[0] not in types):
- if values:
- raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
- else:
- raise ParseError("Expected one of (%r), got %r" % (types, token))
- if values:
- if (token[1] not in values):
- raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
- self.counter += 1
- return token
-
- def parsetext(self):
- data = ''
- try:
- while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError, e: pass
- return data
-
- def parseurl(self):
- pre = self.expect([lexer.SQRE_OPEN])[1]-1
- url = self.expect([lexer.TEXT])[1]
- # checkurl, raise ParseError
- try:
- ws = self.expect([lexer.WHITESPACE])[1]
- except ParseError:
- aft = self.expect([lexer.SQRE_CLOSE])[1]-1
- return ('['*pre, url, ']'*aft)
-
- if '\n' in ws:
- raise ParseError('No newlines allowed in external links')
- desc = ''
- try:
- while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError, e: pass
- aft = self.expect([lexer.SQRE_CLOSE])[1]-1
- return ('['*pre, url, desc, ']'*aft)
-
- def parsewikilink(self):
- pre = self.expect([lexer.SQRE_OPEN])[1]-2
- if (pre < 0): raise ParseError('Not a wiki link')
- page = ''
- try:
- while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError,e: pass
- # if not re.match(...): raise ParseError
- try:
- aft = self.expect([lexer.SQRE_CLOSE])[1]-2
- except ParseError, e: pass
- else:
- if (aft < 0):
- raise ParseError('Not a wiki link')
- return ('['*pre, page, ']'*aft)
- print 'boom.'
- return 0
-
- def parseone(self):
- token = self.wikipwn[self.counter]
- if (token[0] == lexer.EOF):
- raise StopIteration
- if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text
- try: return self.parsetext();
- except ParseError, e: pass
- if (token[0] == lexer.SQRE_OPEN): #wikilink or external link
- begin = self.counter
- try: return self.parsewikilink();
- except ParseError, e: pass
- self.counter = begin
- try: return self.parseurl();
- except ParseError, e: pass
- self.counter = begin
- return ('[' * self.expect([lexer.SQRE_OPEN])[1])
-
- try: self.expect([lexer.SQRE_OPEN], [1])
- except ParseError, e: pass
- else: return self.parseurl()
-
- # Wikilink
- try: self.expect([lexer.SQRE_OPEN], [2]) #wikilink
- except ParseError, e: pass
-
-class lexer:
- class TEXT:
- """ Text """
- class SQRE_OPEN:
- """ Square bracket open """
- class SQRE_CLOSE:
- """ Square bracket close """
- class PIPE:
- """ Pipe symbol """
- class CURL_OPEN:
- """ Curly bracket open """
- class CURL_CLOSE:
- """ Curly bracket close """
- class ANGL_OPEN:
- """ Angular bracket open """
- class ANGL_CLOSE:
- """ Angular bracket close """
- class NEWPAR:
- """ New paragraph """
- class TAB_OPEN:
- """ Table open """
- class TAB_NEWLINE:
- """ Table new row """
- class TAB_CLOSE:
- """ Table close """
- class WHITESPACE:
- """ Whitespace """
- class EQUAL_SIGN:
- """ Equal sign """
- class APOSTROPHE:
- """ Apostrophe """
- class EOF:
- """ End Of File """
-
- def __init__(self, string):
- self.wikipwn = (a for a in string)
-
- def lexer(self):
- text = ''
- try:
- c = self.getchar()
- while True:
- if (c in ('[', ']', '{', '}', '<', '>', '=', '\'')):
- if text:
- yield (lexer.TEXT, text)
- text = ''
- num = 1
- try:
- t = self.getchar()
- while (t == c):
- t = self.getchar()
- num += 1
- finally:
- if (c == '['): yield (lexer.SQRE_OPEN, num)
- elif (c == ']'): yield (lexer.SQRE_CLOSE, num)
- elif (c == '{'): yield (lexer.CURL_OPEN, num)
- elif (c == '}'): yield (lexer.CURL_CLOSE, num)
- elif (c == '<'): yield (lexer.ANGL_OPEN, num)
- elif (c == '>'): yield (lexer.ANGL_CLOSE, num)
- elif (c == '='): yield (lexer.EQUAL_SIGN, num)
- elif (c == '\''): yield(lexer.APOSTROPHE, num)
- c = t
- elif (c == '|'):
- if text:
- yield (lexer.TEXT, text)
- text = ''
- try:
- t = self.getchar()
- except StopIteration:
- yield (lexer.PIPE, None)
- raise
-
- if (t == '-'):
- yield (lexer.TAB_NEWLINE, None)
- c = self.getchar()
- elif (t == '}'):
- yield (lexer.TAB_CLOSE, None)
- c = self.getchar()
- else:
- yield (lexer.PIPE, None)
- c = t
- elif re.match('\s', c): # whitespace eater pro (TM)
- if text:
- yield (lexer.TEXT, text)
- text = ''
- ws = ''
- try:
- while re.match('\s', c):
- ws += c
- c = self.getchar() #eat up remaining whitespace
- finally:
- if (ws.count('\n') > 1):
- yield (lexer.NEWPAR, ws)
- else:
- yield (lexer.WHITESPACE, ws)
- else:
- text = text + c
- c = self.getchar()
- except StopIteration: pass
- if text:
- yield (lexer.TEXT, text)
- yield (lexer.EOF, None)
-
- def getchar(self):
- return self.wikipwn.next()
-
-
-
-
\ No newline at end of file