Revision: 3929
Author:   valhallasw
Date:     2007-08-01 12:31:16 +0000 (Wed, 01 Aug 2007)

Log Message:
-----------
Lexer.py: special characters now return one token; all tokens have text representation attached.
Parser.py: updated to allow for new Lexer

Modified Paths:
--------------
    trunk/pywikiparser/Lexer.py
    trunk/pywikiparser/Parser.py
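For context on the Lexer change: where rev 3928 collapsed a run like '[[' into a single (SQRE_OPEN, 2) pair, this revision yields two (SQRE_OPEN, '[') tokens, each carrying its own text. A minimal sketch of what that buys a consumer, assuming this revision's Lexer.py is importable as shown below (the detokenize helper is hypothetical, not part of the module):

# Hypothetical helper, sketched against the new (Token, text) stream:
# rebuilding source text is now plain concatenation instead of
# multiplying a character by a count. Not an exact round trip: the
# lexer collapses whitespace runs, and PIPE still carries None.
from Lexer import Lexer, Tokens

def detokenize(wikitext):
    parts = []
    for token, text in Lexer(wikitext).lexer():
        if token == Tokens.PIPE:
            parts.append('|')      # PIPE yields None as its text
        elif text is not None:
            parts.append(text)     # all other tokens carry their text
    return ''.join(parts)

print(detokenize('[[wikilink|description]]'))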
Modified: trunk/pywikiparser/Lexer.py
===================================================================
--- trunk/pywikiparser/Lexer.py	2007-08-01 00:30:33 UTC (rev 3928)
+++ trunk/pywikiparser/Lexer.py	2007-08-01 12:31:16 UTC (rev 3929)
@@ -45,11 +45,16 @@
 class Lexer:
     """ Lexer class for mediawiki wikitext. Used by the Parser module
+        Lexer.lexer() returns a generator that returns (Token, text) pairs. The text represents the actual text data, the token the interpreted data.
 
-        >>> l = Lexer('Test with [[wikilink|description]], {{template|parameter\'s|{{nested}}=booh}}, \n\n new paragraphs, <html>, {| tables |- |}')
+        >>> l = Lexer('Test with [[wikilink|description]], {{template|parameter\'s|{{nested}}=booh}}, \n\n new paragraphs, <html>, {| tables |- |}')
         >>> gen = l.lexer()
-        >>> [token for token in gen]
-        [(258, 'Test'), (272, ' '), (258, 'with'), (272, ' '), (259, 2), (258, 'wikilink'), (261, None), (258, 'description'), (260, 2), (258, ','), (272, ' '), (264, 2), (258, 'template'), (261, None), (258, 'parameter'), (263, 1), (258, 's'), (261, None), (264, 2), (258, 'nested'), (265, 2), (262, 1), (258, 'booh'), (265, 2), (258, ','), (268, ' \n\n '), (258, 'new'), (272, ' '), (258, 'paragraphs,'), (272, ' '), (266, 1), (258, 'html'), (267, 1), (258, ','), (272, ' '), (264, 1), (261, None), (272, ' '), (258, 'tables'), (272, ' '), (270, None), (258, '-'), (271, None), (273, None)]
+        >>> gen.next()
+        (<T_TEXT>, 'Test')
+        >>> gen.next()
+        (<T_WHITESPACE>, ' ')
+        >>> [token for token in gen][:10]
+        [(<T_TEXT>, 'with'), (<T_WHITESPACE>, ' '), (<T_SQRE_OPEN>, '['), (<T_SQRE_OPEN>, '['), (<T_TEXT>, 'wikilink'), (<T_PIPE>, None), (<T_TEXT>, 'description'), (<T_SQRE_CLOSE>, ']'), (<T_SQRE_CLOSE>, ']'), (<T_TEXT>, ',')]
     """
 
     def __init__(self, string):
@@ -60,54 +65,50 @@
         try:
             c = self.getchar()
             while True:
-                if (c in ('[', ']', '{', '}', '<', '>', '=', "'", '*', ':', ';', '#')):
+                if (c in ('[', ']', '}', '<', '>', '=', "'", '*', ':', ';', '#')):
                     if text:
                         yield (Tokens.TEXT, text)
                         text = ''
-                    num = 1
-                    try:
-                        t = self.getchar()
-                        while (t == c):
-                            num += 1
-                            t = self.getchar()
-
-                    finally:
-                        if (c == '['): yield (Tokens.SQRE_OPEN, num)
-                        elif (c == ']'): yield (Tokens.SQRE_CLOSE, num)
-                        elif (c == '{'): yield (Tokens.CURL_OPEN, num)
-                        elif (c == '}'): yield (Tokens.CURL_CLOSE, num)
-                        elif (c == '<'): yield (Tokens.ANGL_OPEN, num)
-                        elif (c == '>'): yield (Tokens.ANGL_CLOSE, num)
-                        elif (c == '='): yield (Tokens.EQUAL_SIGN, num)
-                        elif (c == "'"): yield(Tokens.APOSTROPHE, num)
-                        elif (c == '*'): yield (Tokens.ASTERISK, num)
-                        elif (c == ':'): yield (Tokens.COLON, num)
-                        elif (c == ';'): yield (Tokens.SEMICOLON, num)
-                        elif (c == '#'): yield (Tokens.HASH, num)
+
+                    if (c == '['): yield (Tokens.SQRE_OPEN, c)
+                    elif (c == ']'): yield (Tokens.SQRE_CLOSE, c)
+                    elif (c == '}'): yield (Tokens.CURL_CLOSE, c)
+                    elif (c == '<'): yield (Tokens.ANGL_OPEN, c)
+                    elif (c == '>'): yield (Tokens.ANGL_CLOSE, c)
+                    elif (c == '='): yield (Tokens.EQUAL_SIGN, c)
+                    elif (c == "'"): yield(Tokens.APOSTROPHE, c)
+                    elif (c == '*'): yield (Tokens.ASTERISK, c)
+                    elif (c == ':'): yield (Tokens.COLON, c)
+                    elif (c == ';'): yield (Tokens.SEMICOLON, c)
+                    elif (c == '#'): yield (Tokens.HASH, c)
+                    c = self.getchar()
+                elif (c == '{'):
+                    if text:
+                        yield (Tokens.TEXT, text)
+                        text = ''
+                    t = self.getchar()
+                    if (t == '|'):
+                        yield (Tokens.TAB_OPEN, '{|')
+                        c = self.getchar()
+                    else:
+                        yield (Tokens.CURL_OPEN, '{')
+                        c = t
                 elif (c == '|'):
                     if text:
                         yield (Tokens.TEXT, text)
                         text = ''
-                    try:
-                        t = self.getchar()
-                    except StopIteration:
-                        yield (Tokens.PIPE, None)
-                        raise
+                    t = self.getchar()
 
                     if (t == '-'):
-                        yield (Tokens.TAB_NEWLINE, None)
+                        yield (Tokens.TAB_NEWLINE, '|-')
                         c = self.getchar()
                     elif (t == '}'):
-                        yield (Tokens.TAB_CLOSE, None)
+                        yield (Tokens.TAB_CLOSE, '|}')
                         c = self.getchar()
                     else:
-                        num = 1
-                        while (t == c):
-                            num += 1
-                            t = self.getchar()
-                        yield (Tokens.PIPE, num)
-                        c = t
+                        yield (Tokens.PIPE, None)
+                        c = t
                 elif re.match('\s', c): # whitespace eater pro (TM)
                     if text:
                         yield (Tokens.TEXT, text)
@@ -131,4 +132,8 @@
             yield (Tokens.EOF, None)
 
     def getchar(self):
-        return self.data.next()
\ No newline at end of file
+        return self.data.next()
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
\ No newline at end of file
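One detail worth calling out in the hunk above: '{' no longer sits in the generic special-character tuple; it gets one character of lookahead so that '{|' lexes as a single TAB_OPEN token, while '{{' lexes as two one-character CURL_OPEN tokens. A smoke-test sketch of that behavior, assuming the import path shown and that token members compare the way the doctest reprs suggest:

# Sketch of a smoke test for the new one-character lookahead on '{'.
# The import path and token equality semantics are assumptions.
from Lexer import Lexer, Tokens

def lex(s):
    return list(Lexer(s).lexer())

assert lex('{| x')[0] == (Tokens.TAB_OPEN, '{|')    # one token for '{|'
assert lex('{{x')[0] == (Tokens.CURL_OPEN, '{')     # '{{' is now two
assert lex('{{x')[1] == (Tokens.CURL_OPEN, '{')     # one-char tokens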
Modified: trunk/pywikiparser/Parser.py
===================================================================
--- trunk/pywikiparser/Parser.py	2007-08-01 00:30:33 UTC (rev 3928)
+++ trunk/pywikiparser/Parser.py	2007-08-01 12:31:16 UTC (rev 3929)
@@ -28,26 +28,24 @@
         data = self.lex.peek()
         if data[0] in tokens:
-            return self.lex.next()
+            return self.lex.next()[1]
         else:
             raise ParseError('%r is not one of %r' % (data[0], tokens))
-
-    def expecttext(self):
-        data = self.lex.peek()
-        if data[0] in [Tokens.TEXT, Tokens.WHITESPACE]:
-            return self.lex.next()
-        elif data[0] in [Tokens.EQUAL_SIGN, Tokens.APOSTROPHE, Tokens.ASTERISK,
-                         Tokens.COLON, Tokens.SEMICOLON, Tokens.HASH]:
-            data = self.lex.next()
-            return (data[0], data[0].__doc__[0]*data[1])
-        else:
-            raise ParseError('%r is not parsable as text data' % (data[0],))
+    def eat(self, tokens):
+        data = ''
+        try:
+            while(True):
+                data += self.expect(tokens)
+        except ParseError:
+            return data
+
     def parse(self, breaktoken=[]):
         self.root = dom.Element('wikipage')
         self.par = self.root.appendElement('p')
         self.italic = False
         self.bold = False
+
         try:
             while(True):
                 token = self.lex.peek()
@@ -61,7 +59,7 @@
         except StopIteration:
             pass
         return self.root
-
+
     def parsetoken(self, token):
         # The function to call is parser<token>
         exec("data = self.parse%s()" % token[0].name, globals(), locals())
@@ -81,8 +79,7 @@
         return []
 
     def parseAPOSTROPHE(self):
-        token = self.expect(Tokens.APOSTROPHE)
-        num = token[1]
+        num = len(self.eat(Tokens.APOSTROPHE))
 
         #prepare length
         if (num == 1):
@@ -130,51 +127,38 @@
     # Functions that return the input directly
 
     def parseSQRE_CLOSE(self):
-        token = self.expect(Tokens.SQRE_CLOSE)
-        return [']'*token[1]]
+        return self.expect(Tokens.SQRE_CLOSE)
 
     def parsePIPE(self):
-        token = self.expect(Tokens.PIPE)
-        return ['|'*token[1]]
+        return self.expect(Tokens.PIPE)
 
     def parseEQUAL_SIGN(self):
-        token = self.expect(Tokens.EQUAL_SIGN)
-        return ['='*token[1]]
+        return self.expect(Tokens.EQUAL_SIGN)
 
     def parseCURL_CLOSE(self):
-        token = self.expect(Tokens.CURL_CLOSE)
-        return ['}'*token[1]]
+        return self.expect(Tokens.CURL_CLOSE)
 
     def parseANGL_CLOSE(self):
-        token = self.expect(Tokens.ANGL_CLOSE)
-        return ['>'*token[1]]
+        return self.expect(Tokens.ANGL_CLOSE)
 
     def parseASTERISK(self):
-        token = self.expect(Tokens.ASTERISK)
-        return ['*'*token[1]]
+        return self.expect(Tokens.ASTERISK)
 
     def parseCOLON(self):
-        token = self.expect(Tokens.COLON)
-        return [':'*token[1]]
+        return self.expect(Tokens.COLON)
 
     def parseSEMICOLON(self):
-        token = self.expect(Tokens.SEMICOLON)
-        return [';'*token[1]]
+        return self.expect(Tokens.SEMICOLON)
 
     def parseHASH(self):
-        token = self.expect(Tokens.HASH)
-        return ['#'*token[1]]
+        return self.expect(Tokens.HASH)
 
     def parseTAB_NEWLINE(self):
-        token = self.expect(Tokens.TAB_NEWLINE)
-        return ['|-']
+        return self.expect(Tokens.TAB_NEWLINE)
 
     def parseTAB_CLOSE(self):
-        token = self.expect(Tokens.TAB_CLOSE)
-        return ['|}']
-
-
-
+        return self.expect(Tokens.TAB_CLOSE)
+
     # True parser callers
 
     def parseWHITESPACE(self):
@@ -182,11 +166,7 @@
         return self.parseTEXT()
 
     def parseTEXT(self):
-        text = ''
-        while(True):
-            try:
-                text += self.expect([Tokens.TEXT, Tokens.WHITESPACE])[1]
-            except ParseError: break
+        text = self.eat([Tokens.TEXT, Tokens.WHITESPACE])
 
         if text:
             return [text]
@@ -204,8 +184,7 @@
             except ParseError: pass
         self.lex.undo()
-        token = self.expect(Tokens.SQRE_OPEN)
-        return ['['*token[1]]
+        return self.expect(Tokens.SQRE_OPEN)
 
     def parseCURL_OPEN(self):
         try:
@@ -218,8 +197,7 @@
         except ParseError: pass
         self.lex.undo()
-        token = self.expect(Tokens.CURL_OPEN)
-        return ['{'*token[1]]
+        return self.expect(Tokens.CURL_OPEN)
 
     def parseANGL_OPEN(self):
         try:
@@ -227,8 +205,7 @@
         except ParseError: pass
         self.lex.undo()
-        token = self.expect(Tokens.ANGL_OPEN)
-        return ['<'*token[1]]
+        return self.expect(Tokens.ANGL_OPEN)
 
     def parseTAB_OPEN(self):
         try:
@@ -236,39 +213,25 @@
         except ParseError: pass
         self.lex.undo()
-        token = self.expect(Tokens.TAB_OPEN)
-        return ['{|']
+        return self.expect(Tokens.TAB_OPEN)
 
     titlere = re.compile(r"[^\^#<>\[\]|{}\n]*$")
 
     def parseWikilink(self):
        retval = dom.Element('')
-        pre = self.expect(Tokens.SQRE_OPEN)[1]-2
+        self.expect(Tokens.SQRE_OPEN)
+        self.expect(Tokens.SQRE_OPEN)
 
-        if pre < 0:
-            raise ParseError("Not enough opening brackets")
-        elif pre > 0:
-            retval.append('['*pre)
+        pre = self.eat(Tokens.SQRE_OPEN)
+        if pre:
+            retval.append(pre)
 
-        title = ''
-        while(True):
-            try:
-                data = self.expecttext()[1]
-                print data
-            except ParseError: break
-            if not self.titlere.match(data):
-                raise ParseError("Illegal page title")
-            else:
-                title += data
+        title = self.eat(Tokens.TEXT) # temp. needs to allow templates etc.
 
         link = retval.appendElement('wikilink')
         link.appendElement('url').append(title)
-
-        aft = self.expect(Tokens.SQRE_CLOSE)[1]-2
-        if aft < 0:
-            raise ParseError("Not enough closing brackets")
-        elif aft > 0:
-            self.lex.push((Tokens.SQRE_CLOSE, aft))
-
+
+        self.expect(Tokens.SQRE_CLOSE)
+        self.expect(Tokens.SQRE_CLOSE)
         return retval
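The recurring simplification across Parser.py is the eat/expect pair: expect now returns the matched token's text, and eat concatenates matching tokens until the first mismatch, which is why parseAPOSTROPHE can recover its count as len(self.eat(Tokens.APOSTROPHE)). A self-contained sketch of that pattern, with a hypothetical token buffer standing in for the parser's lexer wrapper:

# Self-contained sketch of the eat/expect pattern introduced above.
# TokenBuffer is hypothetical; the real Parser uses its own peekable
# lexer wrapper with peek()/next().
class ParseError(Exception):
    pass

class TokenBuffer(object):
    def __init__(self, pairs):
        self.pairs = list(pairs)
        self.pos = 0
    def peek(self):
        return self.pairs[self.pos]
    def next(self):
        pair = self.pairs[self.pos]
        self.pos += 1
        return pair

def expect(lex, tokens):
    # Return only the text of the matched token, as the new expect does.
    if lex.peek()[0] in tokens:
        return lex.next()[1]
    raise ParseError('%r is not one of %r' % (lex.peek()[0], tokens))

def eat(lex, tokens):
    # Concatenate matching tokens' text; stop at the first mismatch.
    # (The real stream always ends in an EOF token, so peek() is safe.)
    data = ''
    try:
        while True:
            data += expect(lex, tokens)
    except ParseError:
        return data

buf = TokenBuffer([('APOSTROPHE', "'")] * 3 + [('TEXT', 'bold')])
assert eat(buf, ['APOSTROPHE']) == "'''"   # len 3, as parseAPOSTROPHE needs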