Revision: 3929
Author: valhallasw
Date: 2007-08-01 12:31:16 +0000 (Wed, 01 Aug 2007)
Log Message:
-----------
Lexer.py: special characters now yield a single token each; tokens carry their literal text representation.
Parser.py: updated to work with the new Lexer output.
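
For illustration, a minimal sketch of the new contract (not part of the commit; it assumes the Lexer/Tokens API shown in the diffs below). Because tokens now carry the literal text they were built from, a consumer can recover the source characters for most tokens directly; per the diff, PIPE and EOF still yield None as their text, so those are the only places the consumer must supply the characters itself:

    # Sketch only -- assumes the Lexer module API from the diffs below.
    from Lexer import Lexer, Tokens

    def roundtrip(source):
        out = []
        for token, text in Lexer(source).lexer():
            if text is not None:        # PIPE and EOF carry no text
                out.append(text)
        return ''.join(out)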
Modified Paths:
--------------
trunk/pywikiparser/Lexer.py
trunk/pywikiparser/Parser.py
Modified: trunk/pywikiparser/Lexer.py
===================================================================
--- trunk/pywikiparser/Lexer.py 2007-08-01 00:30:33 UTC (rev 3928)
+++ trunk/pywikiparser/Lexer.py 2007-08-01 12:31:16 UTC (rev 3929)
@@ -45,11 +45,16 @@
class Lexer:
""" Lexer class for mediawiki wikitext. Used by the Parser module
+ Lexer.lexer() returns a generator that yields (Token, text) pairs. The text is the literal source text; the token is its interpretation.
- >>> l = Lexer('Test with [[wikilink|description]], {{template|parameter\\'s|{{nested}}=booh}}, \n\n new paragraphs, <html>, {| tables |- |}')
+ >>> l = Lexer('Test with [[wikilink|description]], {{template|parameter\\'s|{{nested}}=booh}}, \\n\\n new paragraphs, <html>, {| tables |- |}')
>>> gen = l.lexer()
- >>> [token for token in gen]
- [(258, 'Test'), (272, ' '), (258, 'with'), (272, ' '), (259, 2), (258, 'wikilink'), (261, None), (258, 'description'), (260, 2), (258, ','), (272, ' '), (264, 2), (258, 'template'), (261, None), (258, 'parameter'), (263, 1), (258, 's'), (261, None), (264, 2), (258, 'nested'), (265, 2), (262, 1), (258, 'booh'), (265, 2), (258, ','), (268, ' \n\n '), (258, 'new'), (272, ' '), (258, 'paragraphs,'), (272, ' '), (266, 1), (258, 'html'), (267, 1), (258, ','), (272, ' '), (264, 1), (261, None), (272, ' '), (258, 'tables'), (272, ' '), (270, None), (258, '-'), (271, None), (273, None)]
+ >>> gen.next()
+ (<T_TEXT>, 'Test')
+ >>> gen.next()
+ (<T_WHITESPACE>, ' ')
+ >>> [token for token in gen][:10]
+ [(<T_TEXT>, 'with'), (<T_WHITESPACE>, ' '), (<T_SQRE_OPEN>, '['), (<T_SQRE_OPEN>, '['), (<T_TEXT>, 'wikilink'), (<T_PIPE>, None), (<T_TEXT>, 'description'), (<T_SQRE_CLOSE>, ']'), (<T_SQRE_CLOSE>, ']'), (<T_TEXT>, ',')]
"""
def __init__(self, string):
@@ -60,54 +65,50 @@
try:
c = self.getchar()
while True:
- if (c in ('[', ']', '{', '}', '<', '>', '=', '\'', '*', ':', ';', '#')):
+ if (c in ('[', ']', '}', '<', '>', '=', '\'', '*', ':', ';', '#')):
if text:
yield (Tokens.TEXT, text)
text = ''
- num = 1
- try:
- t = self.getchar()
- while (t == c):
- num += 1
- t = self.getchar()
-
- finally:
- if (c == '['): yield (Tokens.SQRE_OPEN, num)
- elif (c == ']'): yield (Tokens.SQRE_CLOSE, num)
- elif (c == '{'): yield (Tokens.CURL_OPEN, num)
- elif (c == '}'): yield (Tokens.CURL_CLOSE, num)
- elif (c == '<'): yield (Tokens.ANGL_OPEN, num)
- elif (c == '>'): yield (Tokens.ANGL_CLOSE, num)
- elif (c == '='): yield (Tokens.EQUAL_SIGN, num)
- elif (c == '\''): yield(Tokens.APOSTROPHE, num)
- elif (c == '*'): yield (Tokens.ASTERISK, num)
- elif (c == ':'): yield (Tokens.COLON, num)
- elif (c == ';'): yield (Tokens.SEMICOLON, num)
- elif (c == '#'): yield (Tokens.HASH, num)
+
+ if (c == '['): yield (Tokens.SQRE_OPEN, c)
+ elif (c == ']'): yield (Tokens.SQRE_CLOSE, c)
+ elif (c == '}'): yield (Tokens.CURL_CLOSE, c)
+ elif (c == '<'): yield (Tokens.ANGL_OPEN, c)
+ elif (c == '>'): yield (Tokens.ANGL_CLOSE, c)
+ elif (c == '='): yield (Tokens.EQUAL_SIGN, c)
+ elif (c == '\''): yield (Tokens.APOSTROPHE, c)
+ elif (c == '*'): yield (Tokens.ASTERISK, c)
+ elif (c == ':'): yield (Tokens.COLON, c)
+ elif (c == ';'): yield (Tokens.SEMICOLON, c)
+ elif (c == '#'): yield (Tokens.HASH, c)
+ c = self.getchar()
+ elif (c == '{'):
+ if text:
+ yield (Tokens.TEXT, text)
+ text = ''
+ t = self.getchar()
+ if (t == '|'):
+ yield (Tokens.TAB_OPEN, '{|')
+ c = self.getchar()
+ else:
+ yield (Tokens.CURL_OPEN, '{')
+
c = t
elif (c == '|'):
if text:
yield (Tokens.TEXT, text)
text = ''
- try:
- t = self.getchar()
- except StopIteration:
- yield (Tokens.PIPE, None)
- raise
+ t = self.getchar()
if (t == '-'):
- yield (Tokens.TAB_NEWLINE, None)
+ yield (Tokens.TAB_NEWLINE, '|-')
c = self.getchar()
elif (t == '}'):
- yield (Tokens.TAB_CLOSE, None)
+ yield (Tokens.TAB_CLOSE, '|}')
c = self.getchar()
else:
- num = 1
- while (t == c):
- num += 1
- t = self.getchar()
- yield (Tokens.PIPE, num)
- c = t
+ yield (Tokens.PIPE, None)
+ c = t
elif re.match('\s', c): # whitespace eater pro (TM)
if text:
yield (Tokens.TEXT, text)
@@ -131,4 +132,8 @@
yield (Tokens.EOF, None)
def getchar(self):
- return self.data.next()
\ No newline at end of file
+ return self.data.next()
+
+if __name__ == "__main__":
+ import doctest
+ doctest.testmod()
\ No newline at end of file
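
A note on the central Lexer change above: '{' was dropped from the generic special-character branch because the lexer now needs one character of lookahead to tell a table opener '{|' from a plain '{', mirroring the existing '|' handling for '|-' and '|}'. A self-contained sketch of that lookahead pattern (simplified token names standing in for the real Tokens members):

    def lex_braces(chars):
        # Simplified sketch of the one-character lookahead in Lexer.lexer().
        it = iter(chars)
        c = next(it, None)
        while c is not None:
            if c == '{':
                t = next(it, None)
                if t == '|':
                    yield ('TAB_OPEN', '{|')    # table start: two chars, one token
                    c = next(it, None)
                else:
                    yield ('CURL_OPEN', '{')    # plain brace; reuse lookahead char
                    c = t
            elif c == '|':
                t = next(it, None)
                if t == '-':
                    yield ('TAB_NEWLINE', '|-')
                    c = next(it, None)
                elif t == '}':
                    yield ('TAB_CLOSE', '|}')
                    c = next(it, None)
                else:
                    yield ('PIPE', None)        # as in the diff, PIPE carries no text
                    c = t
            else:
                yield ('TEXT', c)
                c = next(it, None)

    # list(lex_braces('{| |- |}')) ->
    # [('TAB_OPEN', '{|'), ('TEXT', ' '), ('TAB_NEWLINE', '|-'),
    #  ('TEXT', ' '), ('TAB_CLOSE', '|}')]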
Modified: trunk/pywikiparser/Parser.py
===================================================================
--- trunk/pywikiparser/Parser.py 2007-08-01 00:30:33 UTC (rev 3928)
+++ trunk/pywikiparser/Parser.py 2007-08-01 12:31:16 UTC (rev 3929)
@@ -28,26 +28,24 @@
data = self.lex.peek()
if data[0] in tokens:
- return self.lex.next()
+ return self.lex.next()[1]
else:
raise ParseError('%r is not one of %r' % (data[0], tokens))
-
- def expecttext(self):
- data = self.lex.peek()
- if data[0] in [Tokens.TEXT, Tokens.WHITESPACE]:
- return self.lex.next()
- elif data[0] in [Tokens.EQUAL_SIGN, Tokens.APOSTROPHE, Tokens.ASTERISK,
- Tokens.COLON, Tokens.SEMICOLON, Tokens.HASH]:
- data = self.lex.next()
- return (data[0], data[0].__doc__[0]*data[1])
- else:
- raise ParseError('%r is not parsable as text data' % (data[0],))
+ def eat(self, tokens):
+ data = ''
+ try:
+ while(True):
+ data += self.expect(tokens)
+ except ParseError:
+ return data
+
def parse(self, breaktoken=[]):
self.root = dom.Element('wikipage')
self.par = self.root.appendElement('p')
self.italic = False
self.bold = False
+
try:
while(True):
token = self.lex.peek()
@@ -61,7 +59,7 @@
except StopIteration: pass
return self.root
-
+
def parsetoken(self, token):
# The function to call is parser<token>
exec("data = self.parse%s()" % token[0].name, globals(), locals())
@@ -81,8 +79,7 @@
return []
def parseAPOSTROPHE(self):
- token = self.expect(Tokens.APOSTROPHE)
- num = token[1]
+ num = len(self.eat(Tokens.APOSTROPHE))
#prepare length
if (num == 1):
@@ -130,51 +127,38 @@
# Functions that return the input directly
def parseSQRE_CLOSE(self):
- token = self.expect(Tokens.SQRE_CLOSE)
- return [']'*token[1]]
+ return self.expect(Tokens.SQRE_CLOSE)
def parsePIPE(self):
- token = self.expect(Tokens.PIPE)
- return ['|'*token[1]]
+ return self.expect(Tokens.PIPE)
def parseEQUAL_SIGN(self):
- token = self.expect(Tokens.EQUAL_SIGN)
- return ['='*token[1]]
+ return self.expect(Tokens.EQUAL_SIGN)
def parseCURL_CLOSE(self):
- token = self.expect(Tokens.CURL_CLOSE)
- return ['}'*token[1]]
+ return self.expect(Tokens.CURL_CLOSE)
def parseANGL_CLOSE(self):
- token = self.expect(Tokens.ANGL_CLOSE)
- return ['>'*token[1]]
+ return self.expect(Tokens.ANGL_CLOSE)
def parseASTERISK(self):
- token = self.expect(Tokens.ASTERISK)
- return ['*'*token[1]]
+ return self.expect(Tokens.ASTERISK)
def parseCOLON(self):
- token = self.expect(Tokens.COLON)
- return [':'*token[1]]
+ return self.expect(Tokens.COLON)
def parseSEMICOLON(self):
- token = self.expect(Tokens.SEMICOLON)
- return [';'*token[1]]
+ return self.expect(Tokens.SEMICOLON)
def parseHASH(self):
- token = self.expect(Tokens.HASH)
- return ['#'*token[1]]
+ return self.expect(Tokens.HASH)
def parseTAB_NEWLINE(self):
- token = self.expect(Tokens.TAB_NEWLINE)
- return ['|-']
+ return self.expect(Tokens.TAB_NEWLINE)
def parseTAB_CLOSE(self):
- token = self.expect(Tokens.TAB_CLOSE)
- return ['|}']
-
-
-
+ return self.expect(Tokens.TAB_CLOSE)
+
# True parser callers
def parseWHITESPACE(self):
@@ -182,11 +166,7 @@
return self.parseTEXT()
def parseTEXT(self):
- text = ''
- while(True):
- try:
- text += self.expect([Tokens.TEXT, Tokens.WHITESPACE])[1]
- except ParseError: break
+ text = self.eat([Tokens.TEXT, Tokens.WHITESPACE])
if text:
return [text]
@@ -204,8 +184,7 @@
except ParseError: pass
self.lex.undo()
- token = self.expect(Tokens.SQRE_OPEN)
- return ['['*token[1]]
+ return self.expect(Tokens.SQRE_OPEN)
def parseCURL_OPEN(self):
try:
@@ -218,8 +197,7 @@
except ParseError: pass
self.lex.undo()
- token = self.expect(Tokens.CURL_OPEN)
- return ['{'*token[1]]
+ return self.expect(Tokens.CURL_OPEN)
def parseANGL_OPEN(self):
try:
@@ -227,8 +205,7 @@
except ParseError: pass
self.lex.undo()
- token = self.expect(Tokens.ANGL_OPEN)
- return ['<'*token[1]]
+ return self.expect(Tokens.ANGL_OPEN)
def parseTAB_OPEN(self):
try:
@@ -236,39 +213,25 @@
except ParseError: pass
self.lex.undo()
- token = self.expect(Tokens.TAB_OPEN)
- return ['{|']
+ return self.expect(Tokens.TAB_OPEN)
titlere = re.compile(r"[^\^\]#<>\[\|\{\}\n]*$")
def parseWikilink(self):
retval = dom.Element('')
- pre = self.expect(Tokens.SQRE_OPEN)[1]-2
+ self.expect(Tokens.SQRE_OPEN)
+ self.expect(Tokens.SQRE_OPEN)
- if pre < 0:
- raise ParseError("Not enough opening brackets")
- elif pre > 0:
- retval.append('['*pre)
+ pre = self.eat(Tokens.SQRE_OPEN)
+ if pre:
+ retval.append(pre)
- title = ''
- while(True):
- try:
- data = self.expecttext()[1]
- print data
- except ParseError: break
- if not self.titlere.match(data):
- raise ParseError("Illegal page title")
- else:
- title += data
+ title = self.eat(Tokens.TEXT) # temp. needs to allow templates etc.
link = retval.appendElement('wikilink')
link.appendElement('url').append(title)
-
- aft = self.expect(Tokens.SQRE_CLOSE)[1]-2
- if aft < 0:
- raise ParseError("Not enough closing brackets")
- elif aft > 0:
- self.lex.push((Tokens.SQRE_CLOSE, aft))
-
+
+ self.expect(Tokens.SQRE_CLOSE)
+ self.expect(Tokens.SQRE_CLOSE)
return retval
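
On the Parser side, the repeated-character logic that used to live in each parse* method collapses into the new eat() helper, which greedily concatenates the text of consecutive matching tokens and returns on the first ParseError; parseAPOSTROPHE then just counts characters with len(). A sketch of the pattern against a hypothetical pre-buffered token list (the real class wraps the lexer with peek/next/undo, which is assumed here):

    class ParseError(Exception):
        pass

    class MiniParser:
        # Hypothetical stand-in for the real lexer wrapper with peek()/next().
        def __init__(self, pairs):
            self.pairs = list(pairs)
            self.pos = 0

        def expect(self, tokens):
            if self.pos >= len(self.pairs):
                raise ParseError('end of stream')
            token, text = self.pairs[self.pos]
            if token not in tokens:
                raise ParseError('%r is not one of %r' % (token, tokens))
            self.pos += 1
            return text

        def eat(self, tokens):
            # Concatenate text of consecutive matching tokens; stop on mismatch.
            data = ''
            try:
                while True:
                    data += self.expect(tokens)
            except ParseError:
                return data

    # Usage, mirroring how parseAPOSTROPHE now counts quote characters:
    p = MiniParser([('APOSTROPHE', "'")] * 3 + [('TEXT', 'bold')])
    assert len(p.eat(['APOSTROPHE'])) == 3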