Revision: 3919
Author: valhallasw
Date: 2007-07-29 20:06:39 +0000 (Sun, 29 Jul 2007)
Log Message:
-----------
Moved the old parser aside; small ObjectTree update; large Lexer update. The new parser follows in the next revision.
Modified Paths:
--------------
trunk/pywikiparser/Lexer.py
trunk/pywikiparser/ObjectTree/Element.py
Added Paths:
-----------
trunk/pywikiparser/Parser.py.old
Removed Paths:
-------------
trunk/pywikiparser/Parser.py
Modified: trunk/pywikiparser/Lexer.py
===================================================================
--- trunk/pywikiparser/Lexer.py 2007-07-28 13:29:12 UTC (rev 3918)
+++ trunk/pywikiparser/Lexer.py 2007-07-29 20:06:39 UTC (rev 3919)
@@ -10,23 +10,35 @@
import re
class Token:
- TEXT = 258 # Text
- SQRE_OPEN = 259 # [ Square bracket open
- SQRE_CLOSE = 260 # ] Square bracket close
- PIPE = 261 # | Pipe symbol
- EQUAL_SIGN = 262 # = Equal sign
- APOSTROPHE = 263 # ' Apostrophe
- CURL_OPEN = 264 # { Curly bracket open
- CURL_CLOSE = 265 # } Curly bracket close
- ANGL_OPEN = 266 # < Angular bracket open
- ANGL_CLOSE = 267 # > Angular bracket close
- NEWPAR = 268 # \n\n New paragraph
- TAB_OPEN = 269 # {| Table opening symbol
- TAB_NEWLINE = 270 # |- Table new row symbol
- TAB_CLOSE = 271 # |} Table closing symbol
- WHITESPACE = 272 # Whitespace with max 1 newline
- EOF = 273 # End of file
+ def __init__(self, name, description):
+ self.name = name
+ self.__doc__ = description
+
+ def __repr__(self):
+ return '<T_%s>' % (self.name,)
+class Tokens:
+ tokens = [
+ ('TEXT', ' Text data'),
+ ('SQRE_OPEN', '[ Square bracket open'),
+ ('SQRE_CLOSE', '] Square bracket close'),
+ ('PIPE', '| Pipe symbol'),
+ ('EQUAL_SIGN', '= Equal sign'),
+ ('APOSTROPHE', '\' Apostrophe'),
+ ('CURL_OPEN', '{ Curly bracket open'),
+ ('CURL_CLOSE', '} Curly bracket close'),
+ ('ANGL_OPEN', '< Angular bracket open'),
+ ('ANGL_CLOSE', '> Angular bracket close'),
+ ('NEWPAR', '\n\n New paragraph'),
+ ('TAB_OPEN', '{| Table opening symbol'),
+ ('TAB_NEWLINE', '|- Table new row symbol'),
+ ('TAB_CLOSE', '|} Table closing symbol'),
+ ('WHITESPACE', ' Whitespace with max 1 newline'),
+ ('EOF', ' End of file')
+ ]
+ for token in tokens:
+ exec("%s = Token(%r,%r)" % (token[0], token[0], token[1]), globals(), locals())
+
class Lexer:
""" Lexer class for mediawiki wikitext. Used by the Parser module
@@ -46,7 +58,7 @@
while True:
if (c in ('[', ']', '{', '}', '<', '>', '=', '\'')):
if text:
- yield (Token.TEXT, text)
+ yield (Tokens.TEXT, text)
text = ''
num = 1
try:
@@ -56,37 +68,41 @@
t = self.getchar()
finally:
- if (c == '['): yield (Token.SQRE_OPEN, num)
- elif (c == ']'): yield (Token.SQRE_CLOSE, num)
- elif (c == '{'): yield (Token.CURL_OPEN, num)
- elif (c == '}'): yield (Token.CURL_CLOSE, num)
- elif (c == '<'): yield (Token.ANGL_OPEN, num)
- elif (c == '>'): yield (Token.ANGL_CLOSE, num)
- elif (c == '='): yield (Token.EQUAL_SIGN, num)
- elif (c == '\''): yield(Token.APOSTROPHE, num)
+ if (c == '['): yield (Tokens.SQRE_OPEN, num)
+ elif (c == ']'): yield (Tokens.SQRE_CLOSE, num)
+ elif (c == '{'): yield (Tokens.CURL_OPEN, num)
+ elif (c == '}'): yield (Tokens.CURL_CLOSE, num)
+ elif (c == '<'): yield (Tokens.ANGL_OPEN, num)
+ elif (c == '>'): yield (Tokens.ANGL_CLOSE, num)
+ elif (c == '='): yield (Tokens.EQUAL_SIGN, num)
+ elif (c == '\''): yield(Tokens.APOSTROPHE, num)
c = t
elif (c == '|'):
if text:
- yield (Token.TEXT, text)
+ yield (Tokens.TEXT, text)
text = ''
try:
t = self.getchar()
except StopIteration:
- yield (Token.PIPE, None)
+ yield (Tokens.PIPE, None)
raise
if (t == '-'):
- yield (Token.TAB_NEWLINE, None)
+ yield (Tokens.TAB_NEWLINE, None)
c = self.getchar()
elif (t == '}'):
- yield (Token.TAB_CLOSE, None)
+ yield (Tokens.TAB_CLOSE, None)
c = self.getchar()
- else:
- yield (Token.PIPE, None)
+ else:
+ num = 1
+ while (t == c):
+ num += 1
+ t = self.getchar()
+ yield (Tokens.PIPE, num)
c = t
elif re.match('\s', c): # whitespace eater pro (TM)
if text:
- yield (Token.TEXT, text)
+ yield (Tokens.TEXT, text)
text = ''
ws = ''
try:
@@ -95,16 +111,16 @@
c = self.getchar() #eat up remaining whitespace
finally:
if (ws.count('\n') > 1):
- yield (Token.NEWPAR, ws)
+ yield (Tokens.NEWPAR, ws)
else:
- yield (Token.WHITESPACE, ws)
+ yield (Tokens.WHITESPACE, ws)
else:
text = text + c
c = self.getchar()
except StopIteration: pass
if text:
- yield (Token.TEXT, text)
- yield (Token.EOF, None)
+ yield (Tokens.TEXT, text)
+ yield (Tokens.EOF, None)
def getchar(self):
return self.data.next()
\ No newline at end of file
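One behavioural change worth noting in the pipe branch: a bare pipe used to be yielded as (Token.PIPE, None), while the new else-branch counts consecutive pipes and yields (Tokens.PIPE, num), matching how the bracket tokens already carry run lengths. A usage sketch, assuming Lexer(data) stores iter(data) as self.data and that the token generator is the lexer() method the old Parser.py calls — neither is shown in this diff:

lex = Lexer("[[Foo||Bar]]")   # hypothetical constructor
for token, value in lex.lexer():
    print(token, value)
# Expected with the hunks above:
#   <T_SQRE_OPEN> 2
#   <T_TEXT> Foo
#   <T_PIPE> 2
#   <T_TEXT> Bar
#   <T_SQRE_CLOSE> 2
#   <T_EOF> None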
Modified: trunk/pywikiparser/ObjectTree/Element.py
===================================================================
--- trunk/pywikiparser/ObjectTree/Element.py 2007-07-28 13:29:12 UTC (rev 3918)
+++ trunk/pywikiparser/ObjectTree/Element.py 2007-07-29 20:06:39 UTC (rev 3919)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
"""
Simple object tree system for python.
This module contains the Element class
@@ -84,8 +84,11 @@
arg.parent = self
else:
raise TypeError(u'Argument is of %r; expected <type \'BaseElement\'>.' % (type(arg),))
+
+ def extend(self, list):
+ for item in list:
+ self.append(item)
-
def appendElement(self, *args, **kwargs):
element = Element(*args, **kwargs)
self.append(element)
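The new Element.extend simply append()s each item, so the type check in append() is applied per element (note that the parameter name list shadows the builtin — harmless here, but easy to trip over). A hypothetical usage sketch:

root = Element('wikipage')   # constructor args passed through as in appendElement
root.extend([Element('pagelink'), Element('externallink')])
# Equivalent to two append() calls; a non-BaseElement item raises the
# TypeError shown in the hunk above.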
Deleted: trunk/pywikiparser/Parser.py
===================================================================
--- trunk/pywikiparser/Parser.py 2007-07-28 13:29:12 UTC (rev 3918)
+++ trunk/pywikiparser/Parser.py 2007-07-29 20:06:39 UTC (rev 3919)
@@ -1,208 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Mediawiki wikitext parser """
-#
-# (C) 2007 Merlijn 'valhallasw' van Deen
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-
-#
-
-import re
-import warnings
-
-import ObjectTree as dom
-
-from Lexer import Lexer, Token
-
-
-# System loosely based on 'the dragon book':
-# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
-
-class ParseError(Exception):
- """ booh """
-
-class parser:
- def __init__(self, string):
- self.wikipwn = [a for a in lexer(string).lexer()]
- self.counter = 0
-
- def expect(self, types, values=None):
- #print 'Expect: %s %s' % (types, values)
- token = self.wikipwn[self.counter]
- if (token[0] not in types):
- if values:
- raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
- else:
- raise ParseError("Expected one of (%r), got %r" % (types, token))
- if values:
- if (token[1] not in values):
- raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
- self.counter += 1
- return token
-
- def parsetext(self):
- data = ''
- try:
- while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError, e: pass
-
- k = dom.Element('parsetext')
- k.append(data)
- return k
-
- def parseurl(self):
- pre = self.expect([lexer.SQRE_OPEN])[1]-1
- url = self.expect([lexer.TEXT])[1]
- # checkurl, raise ParseError
- ws = ''
- try:
- ws = self.expect([lexer.WHITESPACE])[1]
- except ParseError: pass
-
- if '\n' in ws:
- raise ParseError('No newlines allowed in external links')
-
- desc = ''
- try:
- while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError, e: pass
-
- aft = self.expect([lexer.SQRE_CLOSE])[1]-1
-
- root = dom.Element('parseurl')
- root.append('['*pre)
- extlink = root.appendElement('externallink')
- extlink.appendElement('url').append(url)
- if len(desc) > 0:
- extlink.appendElement('description').append(desc)
- root.append(']'*aft)
-
- return root
-
- def parsewikilink(self):
- pre = self.expect([lexer.SQRE_OPEN])[1]-2
- if (pre < 0): raise ParseError('Not a wiki link')
-
- page = ''
- try:
- while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError,e: pass
- # if not re.match(...): raise ParseError
-
- root = dom.Element('parsewikilink')
- root.append('['*pre)
- pagelink = root.appendElement('pagelink')
- pagelink.appendElement('title').append(page)
- print 'wikilink: %s' % page
- try:
- while(True):
- root.append(self.parseparameter(breaktokens=[lexer.SQRE_CLOSE]))
- except ParseError, e: pass
- print 'result: %r' % (root,)
- aft = self.expect([lexer.SQRE_CLOSE])[1]-2
- if (aft < 0):
- raise ParseError('Not a wiki link')
-
- root.append(']'*aft)
- return root
-
- def parseparameter(self, breaktokens=None):
- if breaktokens:
- breaktokens.append(lexer.PIPE)
- else:
- breaktokens = [lexer.PIPE]
- try:
- while(True): self.expect([lexer.WHITESPACE]) #eat whitespace
- except ParseError: pass
- self.expect([lexer.PIPE])
- #now we can expect anything except a loose pipe.
- data = self.parse(breaktokens=breaktokens)
- return dom.Element('parameter', {}, data)
-
- def parseone(self, breaktokens=[]):
- token = self.wikipwn[self.counter]
- if (token[0] == lexer.EOF) or (token[0] in breaktokens):
- raise StopIteration
-
- if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text
- try: return self.parsetext();
- except ParseError, e: pass
-
- if (token[0] == lexer.SQRE_OPEN): #wikilink or external link
- begin = self.counter
- try: return self.parsewikilink();
- except ParseError, e: pass
- self.counter = begin
- try: return self.parseurl();
- except ParseError, e: pass
- self.counter = begin
- return ('[' * self.expect([lexer.SQRE_OPEN])[1])
-
- if (token[0] == lexer.SQRE_CLOSE):
- return ']'*self.expect([lexer.SQRE_CLOSE])[1]
-
- if (token[0] == lexer.PIPE):
- self.expect([lexer.PIPE])
- return '|'
-
- if (token[0] == lexer.CURL_OPEN):
- #parse_template
- warnings.warn("Not implemented yet. Returning string")
- return '{'*self.expect([lexer.CURL_OPEN])[1]
-
- if (token[0] == lexer.CURL_CLOSE):
- return '}'*self.expect([lexer.CURL_CLOSE])[1]
-
- if (token[0] == lexer.ANGL_OPEN):
- #parse html
- warnings.warn("Not implemented yet. Returning string")
- return '<'*self.expect([lexer.ANGL_OPEN])[1]
-
- if (token[0] == lexer.ANGL_CLOSE):
- return '>'*self.expect([lexer.ANGL_CLOSE])[1]
-
- if (token[0] == lexer.NEWPAR):
- self.expect([lexer.NEWPAR])
- return '\n\n'
-
- if (token[0] == lexer.TAB_OPEN):
- # parse wikitable
- warnings.warn("Not implemented yet. Returning string")
- self.expect([lexer.TAB_OPEN])
- return '(|'
-
- if (token[0] == lexer.TAB_NEWLINE):
- self.expect([lexer.TAB_NEWLINE])
- return '|-'
-
- if (token[0] == lexer.TAB_CLOSE):
- self.expect([lexer.TAB_CLOSE])
- return '|}'
-
- if (token[0] == lexer.WHITESPACE):
- return self.expect([lexer.WHITESPACE])[1]
-
- if (token[0] == lexer.EQUAL_SIGN):
- return '='*self.expect([lexer.EQUAL_SIGN])[1]
-
- if (token[0] == lexer.APOSTROPHE):
- return '\''*self.expect([lexer.APOSTROPHE])[1]
-
- else:
- raise Exception, 'ZOMG THIS CANNOT HAPPEN'
-
- def parseonegenerator(self, *args, **kwargs):
- while(True):
- yield self.parseone(*args, **kwargs)
-
- def parse(self, *args, **kwargs):
- root = dom.Element('wikipage')
- for data in self.parseonegenerator(*args, **kwargs):
- root.extend(data)
- return root
-
-
-
-
\ No newline at end of file
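For reference, the parser removed here is a straightforward backtracking recursive-descent parser: the constructor materialises the whole token stream into self.wikipwn, and parseone saves the cursor (begin = self.counter), tries parsewikilink, rewinds on ParseError, then tries parseurl. Two latent bugs go with it: the file imports Lexer and Token but references lowercase lexer throughout, and the TAB_OPEN branch returns '(|' where '{|' was presumably intended. A distilled sketch of the backtracking pattern, with illustrative names not taken from the module:

class ParseError(Exception):
    pass

class BacktrackingParser:
    def __init__(self, tokens):
        self.tokens = list(tokens)  # whole stream kept in memory
        self.counter = 0            # one cursor; rewinding it is the backtracking

    def attempt(self, *alternatives):
        # Try each parse function in order, rewinding the cursor whenever
        # one fails, so the next alternative sees the same input.
        begin = self.counter
        for alt in alternatives:
            try:
                return alt()
            except ParseError:
                self.counter = begin
        raise ParseError('no alternative matched')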
Copied: trunk/pywikiparser/Parser.py.old (from rev 3918, trunk/pywikiparser/Parser.py)
===================================================================
--- trunk/pywikiparser/Parser.py.old (rev 0)
+++ trunk/pywikiparser/Parser.py.old 2007-07-29 20:06:39 UTC (rev 3919)
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+""" Mediawiki wikitext parser """
+#
+# (C) 2007 Merlijn 'valhallasw' van Deen
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+#
+
+import re
+import warnings
+
+import ObjectTree as dom
+
+from Lexer import Lexer, Token
+
+
+# System loosely based on 'the dragon book':
+# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
+
+class ParseError(Exception):
+ """ booh """
+
+class parser:
+ def __init__(self, string):
+ self.wikipwn = [a for a in lexer(string).lexer()]
+ self.counter = 0
+
+ def expect(self, types, values=None):
+ #print 'Expect: %s %s' % (types, values)
+ token = self.wikipwn[self.counter]
+ if (token[0] not in types):
+ if values:
+ raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
+ else:
+ raise ParseError("Expected one of (%r), got %r" % (types, token))
+ if values:
+ if (token[1] not in values):
+ raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
+ self.counter += 1
+ return token
+
+ def parsetext(self):
+ data = ''
+ try:
+ while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
+ except ParseError, e: pass
+
+ k = dom.Element('parsetext')
+ k.append(data)
+ return k
+
+ def parseurl(self):
+ pre = self.expect([lexer.SQRE_OPEN])[1]-1
+ url = self.expect([lexer.TEXT])[1]
+ # checkurl, raise ParseError
+ ws = ''
+ try:
+ ws = self.expect([lexer.WHITESPACE])[1]
+ except ParseError: pass
+
+ if '\n' in ws:
+ raise ParseError('No newlines allowed in external links')
+
+ desc = ''
+ try:
+ while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
+ except ParseError, e: pass
+
+ aft = self.expect([lexer.SQRE_CLOSE])[1]-1
+
+ root = dom.Element('parseurl')
+ root.append('['*pre)
+ extlink = root.appendElement('externallink')
+ extlink.appendElement('url').append(url)
+ if len(desc) > 0:
+ extlink.appendElement('description').append(desc)
+ root.append(']'*aft)
+
+ return root
+
+ def parsewikilink(self):
+ pre = self.expect([lexer.SQRE_OPEN])[1]-2
+ if (pre < 0): raise ParseError('Not a wiki link')
+
+ page = ''
+ try:
+ while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
+ except ParseError,e: pass
+ # if not re.match(...): raise ParseError
+
+ root = dom.Element('parsewikilink')
+ root.append('['*pre)
+ pagelink = root.appendElement('pagelink')
+ pagelink.appendElement('title').append(page)
+ print 'wikilink: %s' % page
+ try:
+ while(True):
+ root.append(self.parseparameter(breaktokens=[lexer.SQRE_CLOSE]))
+ except ParseError, e: pass
+ print 'result: %r' % (root,)
+ aft = self.expect([lexer.SQRE_CLOSE])[1]-2
+ if (aft < 0):
+ raise ParseError('Not a wiki link')
+
+ root.append(']'*aft)
+ return root
+
+ def parseparameter(self, breaktokens=None):
+ if breaktokens:
+ breaktokens.append(lexer.PIPE)
+ else:
+ breaktokens = [lexer.PIPE]
+ try:
+ while(True): self.expect([lexer.WHITESPACE]) #eat whitespace
+ except ParseError: pass
+ self.expect([lexer.PIPE])
+ #now we can expect anything except a loose pipe.
+ data = self.parse(breaktokens=breaktokens)
+ return dom.Element('parameter', {}, data)
+
+ def parseone(self, breaktokens=[]):
+ token = self.wikipwn[self.counter]
+ if (token[0] == lexer.EOF) or (token[0] in breaktokens):
+ raise StopIteration
+
+ if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text
+ try: return self.parsetext();
+ except ParseError, e: pass
+
+ if (token[0] == lexer.SQRE_OPEN): #wikilink or external link
+ begin = self.counter
+ try: return self.parsewikilink();
+ except ParseError, e: pass
+ self.counter = begin
+ try: return self.parseurl();
+ except ParseError, e: pass
+ self.counter = begin
+ return ('[' * self.expect([lexer.SQRE_OPEN])[1])
+
+ if (token[0] == lexer.SQRE_CLOSE):
+ return ']'*self.expect([lexer.SQRE_CLOSE])[1]
+
+ if (token[0] == lexer.PIPE):
+ self.expect([lexer.PIPE])
+ return '|'
+
+ if (token[0] == lexer.CURL_OPEN):
+ #parse_template
+ warnings.warn("Not implemented yet. Returning string")
+ return '{'*self.expect([lexer.CURL_OPEN])[1]
+
+ if (token[0] == lexer.CURL_CLOSE):
+ return '}'*self.expect([lexer.CURL_CLOSE])[1]
+
+ if (token[0] == lexer.ANGL_OPEN):
+ #parse html
+ warnings.warn("Not implemented yet. Returning string")
+ return '<'*self.expect([lexer.ANGL_OPEN])[1]
+
+ if (token[0] == lexer.ANGL_CLOSE):
+ return '>'*self.expect([lexer.ANGL_CLOSE])[1]
+
+ if (token[0] == lexer.NEWPAR):
+ self.expect([lexer.NEWPAR])
+ return '\n\n'
+
+ if (token[0] == lexer.TAB_OPEN):
+ # parse wikitable
+ warnings.warn("Not implemented yet. Returning string")
+ self.expect([lexer.TAB_OPEN])
+ return '(|'
+
+ if (token[0] == lexer.TAB_NEWLINE):
+ self.expect([lexer.TAB_NEWLINE])
+ return '|-'
+
+ if (token[0] == lexer.TAB_CLOSE):
+ self.expect([lexer.TAB_CLOSE])
+ return '|}'
+
+ if (token[0] == lexer.WHITESPACE):
+ return self.expect([lexer.WHITESPACE])[1]
+
+ if (token[0] == lexer.EQUAL_SIGN):
+ return '='*self.expect([lexer.EQUAL_SIGN])[1]
+
+ if (token[0] == lexer.APOSTROPHE):
+ return '\''*self.expect([lexer.APOSTROPHE])[1]
+
+ else:
+ raise Exception, 'ZOMG THIS CANNOT HAPPEN'
+
+ def parseonegenerator(self, *args, **kwargs):
+ while(True):
+ yield self.parseone(*args, **kwargs)
+
+ def parse(self, *args, **kwargs):
+ root = dom.Element('wikipage')
+ for data in self.parseonegenerator(*args, **kwargs):
+ root.extend(data)
+ return root
+
+
+
+
\ No newline at end of file
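One portability note on the loop at the end of this file: parseone signals end-of-input by raising StopIteration, which parseonegenerator lets escape in order to terminate itself. That idiom is Python 2-specific; under PEP 479 (the default from Python 3.7) a StopIteration raised inside a generator becomes a RuntimeError, so a modern rewrite would catch it and return, roughly:

def parseonegenerator(self, *args, **kwargs):
    while True:
        try:
            yield self.parseone(*args, **kwargs)
        except StopIteration:  # parseone's end-of-input signal
            return             # end the generator cleanly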