http://www.mediawiki.org/wiki/Special:Code/pywikipedia/10014
Revision: 10014
Author:   xqt
Date:     2012-03-12 14:06:23 +0000 (Mon, 12 Mar 2012)
Log Message:
-----------
strip trailing whitespace
Modified Paths:
--------------
    trunk/pywikiparser/Lexer.py
    trunk/pywikiparser/Parser.py
Modified: trunk/pywikiparser/Lexer.py
===================================================================
--- trunk/pywikiparser/Lexer.py    2012-03-12 14:06:07 UTC (rev 10013)
+++ trunk/pywikiparser/Lexer.py    2012-03-12 14:06:23 UTC (rev 10014)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 """ Mediawiki wikitext lexer """
 #
 # (C) 2007 Merlijn 'valhallasw' van Deen
@@ -13,11 +13,11 @@
     def __init__(self, name, description):
         self.name = name
         self.__doc__ = description
-
+
     def __repr__(self):
         return '<T_%s>' % (self.name,)
-class Tokens:
+class Tokens:
     tokens = [ ('TEXT', ' Text data'),
                ('SQRE_OPEN', '[ Square bracket open'),
@@ -46,7 +46,7 @@
 class Lexer:
     """ Lexer class for mediawiki wikitext. Used by the Parser module
         Lexer.lexer() returns a generator that returns (Token, text) pairs. The text represents the actual text data, the token the interpreted data.
-
+
         >>> l = Lexer('Test with [[wikilink|description]], {{template|parameter\'s|{{nested}}=booh}}, \n\n new paragraphs, <html>, {| tables |- |}')
         >>> gen = l.lexer()
         >>> gen.next()
@@ -56,10 +56,10 @@
         >>> [token for token in gen][:10]
         [(<T_TEXT>, 'with'), (<T_WHITESPACE>, ' '), (<T_SQRE_OPEN>, '['), (<T_SQRE_OPEN>, '['), (<T_TEXT>, 'wikilink'), (<T_PIPE>, None), (<T_TEXT>, 'description'), (<T_SQRE_CLOSE>, ']'), (<T_SQRE_CLOSE>, ']'), (<T_TEXT>, ',')]
         """
-
+
     def __init__(self, string):
         self.data = (a for a in string)
-
+
     def lexer(self):
         text = ''
         try:
@@ -69,7 +69,7 @@
             if text:
                 yield (Tokens.TEXT, text)
                 text = ''
-
+
             if (c == '['):   yield (Tokens.SQRE_OPEN, c)
             elif (c == ']'): yield (Tokens.SQRE_CLOSE, c)
             elif (c == '}'): yield (Tokens.CURL_CLOSE, c)
@@ -92,14 +92,14 @@
                     c = self.getchar()
                 else:
                     yield (Tokens.CURL_OPEN, '{')
-
+
                 c = t
             elif (c == '|'):
                 if text:
                     yield (Tokens.TEXT, text)
                     text = ''
                 t = self.getchar()
-
+
                 if (t == '-'):
                     yield (Tokens.TAB_NEWLINE, '|-')
                     c = self.getchar()
@@ -131,9 +131,9 @@
             yield (Tokens.TEXT, text)
         yield (Tokens.EOF, None)
-    def getchar(self):
+    def getchar(self):
         return self.data.next()
-
+
 if __name__ == "__main__":
     import doctest
-    doctest.testmod()
\ No newline at end of file
+    doctest.testmod()
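For orientation, here is how the lexer above is typically driven. This is a minimal sketch, not part of the commit; it assumes the module is importable as Lexer and uses the Python 2 generator API shown in the doctest (gen.next()):

    # Tokenize a small piece of wikitext and print the (Token, text) pairs.
    from Lexer import Lexer, Tokens

    for token, text in Lexer('[[wikilink|description]]').lexer():
        print token, repr(text)      # e.g. <T_SQRE_OPEN> '['
        if token is Tokens.EOF:      # lexer() ends after yielding the EOF token
            break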
Modified: trunk/pywikiparser/Parser.py
===================================================================
--- trunk/pywikiparser/Parser.py    2012-03-12 14:06:07 UTC (rev 10013)
+++ trunk/pywikiparser/Parser.py    2012-03-12 14:06:23 UTC (rev 10014)
@@ -30,11 +30,11 @@
         _debug = debug
         self.lex = BufferedReader(Lexer(data).lexer())
-
+
     def expect(self, tokens):
         if not isinstance(tokens, list):
             tokens = [tokens,]
-
+
         data = self.lex.peek()
         if data[0] in tokens:
             return self.lex.next()[1]
@@ -48,15 +48,15 @@
                 data += self.expect(tokens)
         except ParseError:
             return data
-
+
     def parse(self, breaktoken=[]):
         self.root = dom.Element('wikipage')
         self.par = self.root.appendElement('p')
         self.italic = False
         self.bold = False
-
+
         restore = self.lex.getrestore()
-
+
         try:
             while(True):
                 token = self.lex.peek()
@@ -67,7 +67,7 @@
                 dbgmsg("Adding %r (was %r)" % (node,token))
                 self.par.extend(node)
                 restore = self.lex.commit(restore)
-
+
         except StopIteration: pass
         return self.root
@@ -75,23 +75,23 @@
         # The function to call is parser<token>
         exec("data = self.parse%s(restore)" % token[0].name, globals(), locals())
         return data
-
+
     def parseEOF(self, restore):
         token = self.expect(Tokens.EOF)
         raise StopIteration
-
+
     # Special functions that directly access the storage tree
-
+
    def parseNEWPAR(self, restore):
        token = self.expect(Tokens.NEWPAR)
        self.par = self.root.appendElement('p')
        self.bold = False
        self.italic = False
        return []
-
+
    def parseAPOSTROPHE(self, restore):
        num = len(self.eat(Tokens.APOSTROPHE))
-
+
        #prepare length
        if (num == 1):
            self.par.append('&apos;')
@@ -101,11 +101,11 @@
        elif (num > 5):
            self.par.append('&apos;' * (num-5))
            num = 5
-
+
        # determine changes
        newitalic = self.italic
        newbold = self.bold
-
+
        if num == 2:    #toggle italic
            newitalic = not self.italic
        elif num == 3:  #toggle bold
@@ -113,7 +113,7 @@
        elif num == 5:  #toggle both
            newitalic = not self.italic
            newbold = not self.bold
-
+
        dbgmsg('bold: %r>%r italic: %r>%r' % (self.bold, newbold, self.italic, newitalic))
        if self.italic and not newitalic:
            if self.par.name == 'i' or not newbold:
@@ -131,21 +131,21 @@
            self.par = self.par.appendElement('i')
            self.italic = True
        if not self.bold and newbold:
-            self.par = self.par.appendElement('b')
+            self.par = self.par.appendElement('b')
            self.bold = True
-        return []
-
+        return []
+
    # Functions that return the input directly
-
+
    def parseSQRE_CLOSE(self, restore):
        return self.expect(Tokens.SQRE_CLOSE)
-
+
    def parsePIPE(self, restore):
        return self.expect(Tokens.PIPE)
-
+
    def parseEQUAL_SIGN(self, restore):
        return self.expect(Tokens.EQUAL_SIGN)
-
+
    def parseCURL_CLOSE(self, restore):
        return self.expect(Tokens.CURL_CLOSE)
@@ -154,13 +154,13 @@
    def parseASTERISK(self, restore):
        return self.expect(Tokens.ASTERISK)
-
+
    def parseCOLON(self, restore):
        return self.expect(Tokens.COLON)
-
+
    def parseSEMICOLON(self, restore):
        return self.expect(Tokens.SEMICOLON)
-
+
    def parseHASH(self, restore):
        return self.expect(Tokens.HASH)
@@ -169,11 +169,11 @@
    def parseTAB_CLOSE(self, restore):
        return self.expect(Tokens.TAB_CLOSE)
-
+
    # True parser callers
    def parseWHITESPACE(self, restore):
-        # Todo:
+        # Todo:
        return self.parseTEXT(restore)
    def parseTEXT(self, restore):
@@ -193,15 +193,15 @@
        try:
            return self.parseExternallink()
        except ParseError: pass
-
+
        self.lex.undo(restore)
        return self.expect(Tokens.SQRE_OPEN)
-
+
    def parseCURL_OPEN(self, restore):
        try:
            return self.parseTemplateparam()
        except ParseError: pass
-
+
        self.lex.undo(restore)
        try:
            return self.parseTemplate()
@@ -209,12 +209,12 @@
        self.lex.undo(restore)
        return self.expect(Tokens.CURL_OPEN)
-
+
    def parseANGL_OPEN(self, restore):
        try:
            return self.parseHTML()
        except ParseError: pass
-
+
        self.lex.undo(restore)
        return self.expect(Tokens.ANGL_OPEN)
@@ -222,38 +222,38 @@
        try:
            return self.parseWikitable()
        except ParseError: pass
-
+
        self.lex.undo(restore)
        return self.expect(Tokens.TAB_OPEN)
-
+
    def parseWikilink(self):
        retval = dom.Element('')
        self.expect(Tokens.SQRE_OPEN)
        self.expect(Tokens.SQRE_OPEN)
-
+
        pre = self.eat(Tokens.SQRE_OPEN)
        if pre:
            retval.append(pre)
        wikilink = retval.appendElement('wikilink')
-        # get page title
+        # get page title
        title = wikilink.appendElement('title')
        #parse title
        title.extend(self.parseTitle(Tokens.SQRE_CLOSE))
-
+
        self.expect(Tokens.SQRE_CLOSE)
        self.expect(Tokens.SQRE_CLOSE)
-
+
        return retval
-
-
-
+
+
+
 #        while( titlere.match(next) ):
 #            title += next
 #            next = self.lex.peek()
-#            #
+#            #
 #        else:
 #            break
 #        while(True):
@@ -266,27 +266,27 @@
 #                continue
 #            else:
 #                break
-#
-#
-#
+#
+#
+#
 #            breaktoken = self.lex.peek()
 #            if breaktoken[0] == Tokens.PIPE:
 #                break
 #            elif breaktoken[0] == Tokens.SQRE_CLOSE:
 #                next = self.lex.peek(2)
 #                if next[0] == Tokens.SQRE_CLOSE:
-#
-#                    self.expect(Tokens.SQRE_CLOSE)
+#
+#                    self.expect(Tokens.SQRE_CLOSE)
 #                    self.expect(Tokens.SQRE_CLOSE)
 #                    return retval
-#
-
+#
+
    def parseExternallink(self):
        raise ParseError("Needs implementation")
-
+
    def parseTemplateparam(self):
        raise ParseError("Needs implementation")
-
+
    def parseTemplate(self):
        retval = dom.Element('')
        self.expect(Tokens.CURL_OPEN)
@@ -297,23 +297,23 @@
            retval.append(pre)
        wikilink = retval.appendElement('template')
-        # get page title
+        # get page title
        title = wikilink.appendElement('title')
        title.extend(self.parseTitle(Tokens.CURL_CLOSE))
-
+
        self.expect(Tokens.CURL_CLOSE)
        self.expect(Tokens.CURL_CLOSE)
        return retval
-
-
+
+
    def parseHTML(self):
        raise ParseError("Needs implementation")
-
+
    def parseWikitable(self):
        raise ParseError("Needs implementation")
-
-    titlere = re.compile(r"[^^]<>[|{}\n]*$")
+
+    titlere = re.compile(r"[^^]<>[|{}\n]*$")
    def parseTitle(self, closetoken):
        title = dom.Element('title')
        while(True):
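The parser can be exercised the same way. A rough sketch only, not part of the commit; it assumes (from "Lexer(data).lexer()" in the constructor above) that Parser takes the raw wikitext string, possibly with an optional debug flag, and that parse() returns the dom.Element tree rooted at 'wikipage':

    from Parser import Parser

    # Parse a snippet; the parse*() handlers above build <p>, <b>, <i>,
    # wikilink and template nodes under the 'wikipage' root element.
    tree = Parser("''italic'' text with a [[link|label]]").parse()
    print tree    # textual form depends on the dom.Element implementation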