Revision: 3930
Author: valhallasw
Date: 2007-08-01 16:05:20 +0000 (Wed, 01 Aug 2007)

Log Message:
-----------
Some BufferedReader hacks; Parser updated to understand both simple wikilinks [[blah]] and simple templates {{blah}}. [[boo{{bah}}]] is also understood \o/

Modified Paths:
--------------
    trunk/pywikiparser/BufferedReader.py
    trunk/pywikiparser/Parser.py

Modified: trunk/pywikiparser/BufferedReader.py =================================================================== --- trunk/pywikiparser/BufferedReader.py 2007-08-01 12:31:16 UTC (rev 3929) +++ trunk/pywikiparser/BufferedReader.py 2007-08-01 16:05:20 UTC (rev 3930) @@ -1,7 +1,41 @@ class BufferedReader(object): + """ Buffered reader. Usage: + + >>> reader = BufferedReader((i for i in range(10))) + >>> restore = reader.getrestore() + >>> restore + -1 + >>> reader.next() + 0 + >>> reader.next() + 1 + >>> reader.undo(-1) + >>> reader.next() + 0 + >>> restore = reader.commit(-1) + >>> restore + -1 + >>> reader.next() + 1 + >>> reader.getrestore() + 0 + >>> reader.next() + 2 + >>> reader.undo(0) + >>> reader.next() + 2 + >>> reader.commit(0) + 1 + >>> reader.undo(1) + >>> reader.next() + 3 + >>> reader.undo(-1) + >>> reader.next() + 1 + """ + def __init__(self, generator): - self.inbuffer = [] - self.outbuffer = [] + self.buffer = [] self.counter = -1 self.generator = generator self.gen = self._generator() @@ -31,33 +65,36 @@ def next(self, *args, **kwargs): return self.gen.next(*args, **kwargs)
-    def peek(self):
-        if len(self.outbuffer) <= self.counter+1:
+    def peek(self, num=1):
+        if len(self.buffer) <= self.counter+num:
             data = self.generator.next()
-            self.inbuffer.append(data)
-            self.outbuffer.append(data)
-        return self.outbuffer[self.counter+1]
+            self.buffer.append(data)
+        return self.buffer[self.counter+num]
     def _generator(self):
         while(True):
             self.counter += 1
-            if len(self.outbuffer) <= self.counter:
+            if len(self.buffer) <= self.counter:
                 data = self.generator.next()
-                self.inbuffer.append(data)
-                self.outbuffer.append(data)
-            yield self.outbuffer[self.counter]
+                self.buffer.append(data)
+            yield self.buffer[self.counter]
- def commit(self): - self.inbuffer = self.inbuffer[self.counter+1:] - self.outbuffer = self.outbuffer[self.counter+1:] - self.counter = -1 + def getrestore(self): + return self.counter + + def commit(self, counter): + if counter == -1: + # clear memory + self.buffer = self.buffer[self.counter+1:] + self.counter = -1 + self.gen = self._generator() + return self.counter
-    def undo(self):
-        self.outbuffer = self.inbuffer[:]
-        self.counter = -1
+    def undo(self, counter):
+        self.counter = counter
         self.gen = self._generator()
-
-    def push(self, data):
-        self.outbuffer.append(data)
-        self.gen = self._generator()
\ No newline at end of file
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
\ No newline at end of file
Modified: trunk/pywikiparser/Parser.py =================================================================== --- trunk/pywikiparser/Parser.py 2007-08-01 12:31:16 UTC (rev 3929) +++ trunk/pywikiparser/Parser.py 2007-08-01 16:05:20 UTC (rev 3930) @@ -46,39 +46,41 @@ self.italic = False self.bold = False
+ restore = self.lex.getrestore() + try: while(True): token = self.lex.peek() if token[0] in breaktoken: break
- node = self.parsetoken(token) + node = self.parsetoken(token, restore) print "Adding %r (was %r)" % (node,token) self.par.extend(node) - self.lex.commit() + restore = self.lex.commit(restore)
except StopIteration: pass return self.root
-    def parsetoken(self, token):
+    def parsetoken(self, token, restore):
         # The function to call is parser<token>
-        exec("data = self.parse%s()" % token[0].name, globals(), locals())
+        exec("data = self.parse%s(restore)" % token[0].name, globals(), locals())
         return data
-    def parseEOF(self):
+    def parseEOF(self, restore):
         token = self.expect(Tokens.EOF)
         raise StopIteration
# Special functions that directly access the storage tree
- def parseNEWPAR(self): + def parseNEWPAR(self, restore): token = self.expect(Tokens.NEWPAR) self.par = self.root.appendElement('p') self.bold = False self.italic = False return []
- def parseAPOSTROPHE(self): + def parseAPOSTROPHE(self, restore): num = len(self.eat(Tokens.APOSTROPHE))
#prepare length @@ -126,46 +128,46 @@
# Functions that return the input directly
- def parseSQRE_CLOSE(self): + def parseSQRE_CLOSE(self, restore): return self.expect(Tokens.SQRE_CLOSE)
- def parsePIPE(self): + def parsePIPE(self, restore): return self.expect(Tokens.PIPE)
- def parseEQUAL_SIGN(self): + def parseEQUAL_SIGN(self, restore): return self.expect(Tokens.EQUAL_SIGN)
- def parseCURL_CLOSE(self): + def parseCURL_CLOSE(self, restore): return self.expect(Tokens.CURL_CLOSE)
- def parseANGL_CLOSE(self): + def parseANGL_CLOSE(self, restore): return self.expect(Tokens.ANGL_CLOSE)
- def parseASTERISK(self): + def parseASTERISK(self, restore): return self.expect(Tokens.ASTERISK)
- def parseCOLON(self): + def parseCOLON(self, restore): return self.expect(Tokens.COLON)
- def parseSEMICOLON(self): + def parseSEMICOLON(self, restore): return self.expect(Tokens.SEMICOLON)
- def parseHASH(self): + def parseHASH(self, restore): return self.expect(Tokens.HASH)
- def parseTAB_NEWLINE(self): + def parseTAB_NEWLINE(self, restore): return self.expect(Tokens.TAB_NEWLINE)
- def parseTAB_CLOSE(self): + def parseTAB_CLOSE(self, restore): return self.expect(Tokens.TAB_CLOSE)
# True parser callers
- def parseWHITESPACE(self): + def parseWHITESPACE(self, restore): # Todo: - return self.parseTEXT() + return self.parseTEXT(restore)
- def parseTEXT(self): + def parseTEXT(self, restore): text = self.eat([Tokens.TEXT, Tokens.WHITESPACE])
if text: @@ -173,49 +175,48 @@ else: return []
- def parseSQRE_OPEN(self): + def parseSQRE_OPEN(self, restore): try: return self.parseWikilink() except ParseError: pass
- self.lex.undo() + self.lex.undo(restore) try: return self.parseExternallink() except ParseError: pass
- self.lex.undo() + self.lex.undo(restore) return self.expect(Tokens.SQRE_OPEN)
- def parseCURL_OPEN(self): + def parseCURL_OPEN(self, restore): try: return self.parseTemplateparam() except ParseError: pass
- self.lex.undo() + self.lex.undo(restore) try: return self.parseTemplate() except ParseError: pass
- self.lex.undo() + self.lex.undo(restore) return self.expect(Tokens.CURL_OPEN)
- def parseANGL_OPEN(self): + def parseANGL_OPEN(self, restore): try: return self.parseHTML() except ParseError: pass
- self.lex.undo() + self.lex.undo(restore) return self.expect(Tokens.ANGL_OPEN)
- def parseTAB_OPEN(self): + def parseTAB_OPEN(self, restore): try: return self.parseWikitable() except ParseError: pass
- self.lex.undo() + self.lex.undo(restore) return self.expect(Tokens.TAB_OPEN)
- titlere = re.compile(r"[^^]#<>[|{}\n]*$") def parseWikilink(self): retval = dom.Element('') self.expect(Tokens.SQRE_OPEN) @@ -224,17 +225,53 @@ pre = self.eat(Tokens.SQRE_OPEN) if pre: retval.append(pre) - - title = self.eat(Tokens.TEXT) # temp. needs to allow templates etc. - - link = retval.appendElement('wikilink') - link.appendElement('url').append(title)
- self.expect(Tokens.SQRE_CLOSE) + wikilink = retval.appendElement('wikilink') + # get page title + title = wikilink.appendElement('title') + + #parse title + title.extend(self.parseTitle(Tokens.SQRE_CLOSE)) + self.expect(Tokens.SQRE_CLOSE) - return retval - + self.expect(Tokens.SQRE_CLOSE)
+ return retval + + + +# while( titlere.match(next) ): +# title += next +# next = self.lex.peek() +# +# +# else: +# break +# while(True): +# param = .Element('parameter') +# parampiece = self.parse([Tokens.SQRE_CLOSE, Tokens.PIPE]) +# param.extend(parampiece) +# if (self.lex.peek( )[0] == Tokens.SQRE_CLOSE) and +# (self.lex.peek(2)[0] != Tokens.SQRE_CLOSE): # ][^]]: a single ] +# param.append('[') +# continue +# else: +# break +# +# +# +# breaktoken = self.lex.peek() +# if breaktoken[0] == Tokens.PIPE: +# break +# elif breaktoken[0] == Tokens.SQRE_CLOSE: +# next = self.lex.peek(2) +# if next[0] == Tokens.SQRE_CLOSE: +# +# self.expect(Tokens.SQRE_CLOSE) +# self.expect(Tokens.SQRE_CLOSE) +# return retval +# + def parseExternallink(self): raise ParseError("Needs implementation")
@@ -242,11 +279,52 @@ raise ParseError("Needs implementation")
def parseTemplate(self): - raise ParseError("Needs implementation") + retval = dom.Element('') + self.expect(Tokens.CURL_OPEN) + self.expect(Tokens.CURL_OPEN) + pre = self.eat(Tokens.CURL_OPEN) + print 'pre: ' + pre + if pre: + retval.append(pre) + + wikilink = retval.appendElement('template') + # get page title + title = wikilink.appendElement('title') + title.extend(self.parseTitle(Tokens.CURL_CLOSE)) + + self.expect(Tokens.CURL_CLOSE) + self.expect(Tokens.CURL_CLOSE) + + return retval +
def parseHTML(self): raise ParseError("Needs implementation")
def parseWikitable(self): raise ParseError("Needs implementation") + + titlere = re.compile(r"[^^]<>[|{}\n]*$") + def parseTitle(self, closetoken): + title = dom.Element('title') + while(True): + next = self.lex.peek() + if next[0] == closetoken or next[0] == Tokens.PIPE: + break + elif next[0] == Tokens.CURL_OPEN: # allow templates to expand + restore = self.lex.getrestore() + data = self.parseCURL_OPEN(restore) + print 'Parsed template: %r' % (data,) + for item in data: + if isinstance(item, basestring): + if not self.titlere.match(item): + raise ParseError('illegal wiki link') + title.extend(data) + else: + next = self.lex.next() + if not self.titlere.match(next[1]): + raise ParseError('illegal wiki link') + title.append(next[1]) + return title + \ No newline at end of file
pywikipedia-l@lists.wikimedia.org