Revision: 3925 Author: valhallasw Date: 2007-07-29 22:18:58 +0000 (Sun, 29 Jul 2007)
Log Message: ----------- Late night commit: Parser.py: * parseTEXT now has try/except inside while(True) and uses break instead of pass * Added text parser function (accepts all text except []{}<>|) * Added simple wikilink parser (accepts wikilinks of the format [[link]] and checks wikilink legality)
Lexer.py: * Updated ASTERISK description
Modified Paths: -------------- trunk/pywikiparser/Lexer.py trunk/pywikiparser/Parser.py
Modified: trunk/pywikiparser/Lexer.py =================================================================== --- trunk/pywikiparser/Lexer.py 2007-07-29 21:17:39 UTC (rev 3924) +++ trunk/pywikiparser/Lexer.py 2007-07-29 22:18:58 UTC (rev 3925) @@ -25,7 +25,7 @@ ('PIPE', '| Pipe symbol'), ('EQUAL_SIGN', '= Equal sign'), ('APOSTROPHE', '' Apostrophe'), - ('ASTERISK', '* Star sign'), + ('ASTERISK', '* Asterisk'), ('COLON', ': Colon'), ('SEMICOLON', '; Semicolon'), ('HASH', '# Hash symbol'),
Modified: trunk/pywikiparser/Parser.py =================================================================== --- trunk/pywikiparser/Parser.py 2007-07-29 21:17:39 UTC (rev 3924) +++ trunk/pywikiparser/Parser.py 2007-07-29 22:18:58 UTC (rev 3925) @@ -8,6 +8,7 @@ __version__ = '$Id$'
import warnings +import re
import ObjectTree as dom from BufferedReader import BufferedReader @@ -30,6 +31,17 @@ return self.lex.next() else: raise ParseError('%r is not one of %r' % (data[0], tokens)) + + def expecttext(self): + data = self.lex.peek() + if data[0] in [Tokens.TEXT, Tokens.WHITESPACE]: + return self.lex.next() + elif data[0] in [Tokens.EQUAL_SIGN, Tokens.APOSTROPHE, Tokens.ASTERISK, + Tokens.COLON, Tokens.SEMICOLON, Tokens.HASH]: + data = self.lex.next() + return (data[0], data[0].__doc__[0]*data[1]) + else: + raise ParseError('%r is not parsable as text data' % (data[0],))
def parse(self, breaktoken=[]): self.root = dom.Element('wikipage') @@ -166,14 +178,16 @@ # True parser callers
def parseWHITESPACE(self): + # Todo: return self.parseTEXT()
def parseTEXT(self): text = '' - try: - while(True): - text += self.expect([Tokens.TEXT, Tokens.WHITESPACE])[1] - except ParseError: pass + while(True): + try: + text += self.expect([Tokens.TEXT, Tokens.WHITESPACE])[1] + except ParseError: break + if text: return [text] else: @@ -183,7 +197,7 @@ try: return self.parseWikilink() except ParseError: pass - + self.lex.undo() try: return self.parseExternallink() @@ -224,9 +238,39 @@ self.lex.undo() token = self.expect(Tokens.TAB_OPEN) return ['{|'] - + + titlere = re.compile(r"[^^]#<>[|{}\n]*$") def parseWikilink(self): - raise ParseError("Needs implementation") + retval = dom.Element('') + pre = self.expect(Tokens.SQRE_OPEN)[1]-2 + + if pre < 0: + raise ParseError("Not enough opening brackets") + elif pre > 0: + retval.append('['*pre) + + title = '' + while(True): + try: + data = self.expecttext()[1] + print data + except ParseError: break + if not self.titlere.match(data): + raise ParseError("Illegal page title") + else: + title += data + + link = retval.appendElement('wikilink') + link.appendElement('url').append(title) + + aft = self.expect(Tokens.SQRE_CLOSE)[1]-2 + if aft < 0: + raise ParseError("Not enough closing brackets") + elif aft > 0: + self.lex.push((Tokens.SQRE_CLOSE, aft)) + + return retval +
def parseExternallink(self): raise ParseError("Needs implementation")
pywikipedia-l@lists.wikimedia.org