Revision: 3914
Author: valhallasw
Date: 2007-07-27 15:27:56 +0000 (Fri, 27 Jul 2007)
Log Message:
-----------
moved wikitext parser to own project directory
Added Paths:
-----------
trunk/pywikiparser/
trunk/pywikiparser/parser.py
Removed Paths:
-------------
trunk/pywikipedia/parser.py
Copied: trunk/pywikiparser/parser.py (from rev 3913, trunk/pywikipedia/parser.py)
===================================================================
--- trunk/pywikiparser/parser.py (rev 0)
+++ trunk/pywikiparser/parser.py 2007-07-27 15:27:56 UTC (rev 3914)
@@ -0,0 +1,217 @@
+# -*- coding: utf-8 -*-
+""" Mediawiki wikitext parser """
+#
+# (C) 2007 Merlijn 'valhallasw' van Deen
+#
+# Distributed under the terms of the MIT license.
+#
+__version__ = '$Id$'
+
+#
+
+import re
+import xml.dom.minidom as dom
+
+# System loosely based on 'the dragon book':
+# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
+
class ParseError(Exception):
    """ Raised when a parse function cannot match the upcoming tokens. """
+
class parser:
    """ Recursive-descent parser for wikitext.

    The input string is tokenized by the `lexer` class in this module;
    the full token stream is materialized so the parser can backtrack by
    resetting ``self.counter``.  ``parseone()`` returns one parsed
    fragment (plain text, external link or wiki link) per call and
    raises StopIteration at end of input.
    """

    def __init__(self, string):
        # Materialize all tokens up front; backtracking is a counter reset.
        self.wikipwn = [a for a in lexer(string).lexer()]
        self.counter = 0  # index of the next token to consume

    def expect(self, types, values=None):
        """ Consume and return the next token.

        The token's type must be one of `types` and, when `values` is
        given, its value must be one of `values`; otherwise ParseError
        is raised and the stream position is left unchanged.
        """
        token = self.wikipwn[self.counter]
        if token[0] not in types:
            if values:
                raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
            else:
                raise ParseError("Expected one of (%r), got %r" % (types, token))
        if values and token[1] not in values:
            raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
        self.counter += 1
        return token

    def parsetext(self):
        """ Consume a run of TEXT/WHITESPACE tokens; return them joined.

        Returns the empty string (it never raises) when the next token
        is of another type.
        """
        # Collect pieces and join once instead of quadratic '+=':
        pieces = []
        try:
            while True:
                pieces.append(self.expect([lexer.TEXT, lexer.WHITESPACE])[1])
        except ParseError:
            pass  # end of the text run
        return ''.join(pieces)

    def parseurl(self):
        """ Parse an external link: [url] or [url description].

        Returns ('['*extra, url, ']'*extra) for a bare link, or
        ('['*extra, url, description, ']'*extra) when a description is
        present, where *extra* is the bracket count beyond the single
        required one.  Raises ParseError when the tokens do not form an
        external link.
        """
        pre = self.expect([lexer.SQRE_OPEN])[1] - 1
        url = self.expect([lexer.TEXT])[1]
        # TODO: validate the url here and raise ParseError when malformed.
        try:
            ws = self.expect([lexer.WHITESPACE])[1]
        except ParseError:
            # No whitespace after the url: bare [url] without description.
            aft = self.expect([lexer.SQRE_CLOSE])[1] - 1
            return ('[' * pre, url, ']' * aft)

        if '\n' in ws:
            raise ParseError('No newlines allowed in external links')
        pieces = []
        try:
            while True:
                pieces.append(self.expect([lexer.TEXT, lexer.WHITESPACE])[1])
        except ParseError:
            pass  # end of the description run
        desc = ''.join(pieces)
        aft = self.expect([lexer.SQRE_CLOSE])[1] - 1
        return ('[' * pre, url, desc, ']' * aft)

    def parsewikilink(self):
        """ Parse a wiki link: [[page]].

        Returns ('['*extra, page, ']'*extra) where *extra* is the
        bracket count beyond the two required on each side.  Raises
        ParseError when the tokens do not form a wiki link; callers
        (parseone) catch this and backtrack.
        """
        pre = self.expect([lexer.SQRE_OPEN])[1] - 2
        if pre < 0:
            raise ParseError('Not a wiki link')
        pieces = []
        try:
            while True:
                pieces.append(self.expect([lexer.TEXT, lexer.WHITESPACE])[1])
        except ParseError:
            pass  # end of the page-title run
        page = ''.join(pieces)
        # TODO: validate the page title and raise ParseError when invalid.
        # Fixed: a missing SQRE_CLOSE used to hit a debug remnant
        # ("print 'boom.'; return 0"); now the ParseError from expect()
        # propagates so parseone() can backtrack cleanly.
        aft = self.expect([lexer.SQRE_CLOSE])[1] - 2
        if aft < 0:
            raise ParseError('Not a wiki link')
        return ('[' * pre, page, ']' * aft)

    def parseone(self):
        """ Parse and return the next fragment of the token stream.

        Raises StopIteration at EOF.  On '[' runs a wiki link is tried
        first, then an external link, backtracking after each failed
        attempt; if neither matches, the brackets are returned as
        literal text.  Tokens of any other type are (as before) left
        unconsumed and None is returned.
        """
        token = self.wikipwn[self.counter]
        if token[0] == lexer.EOF:
            raise StopIteration
        if token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE:
            # parsetext() handles its own termination and never raises.
            return self.parsetext()
        if token[0] == lexer.SQRE_OPEN:  # wikilink or external link
            begin = self.counter
            try:
                return self.parsewikilink()
            except ParseError:
                pass
            self.counter = begin  # backtrack and retry as external link
            try:
                return self.parseurl()
            except ParseError:
                pass
            self.counter = begin  # backtrack: emit brackets literally
            return ('[' * self.expect([lexer.SQRE_OPEN])[1])
        # (Dead code that followed the return above has been removed.)
+
class lexer:
    """ Character-level tokenizer for wikitext.

    ``lexer(string).lexer()`` yields (token_type, value) tuples, where
    token_type is one of the marker classes below.  The value is: the
    accumulated string for TEXT; the run length for the bracket,
    equal-sign and apostrophe tokens; the collected whitespace string
    for WHITESPACE and NEWPAR; and None for the remaining types.
    The stream always ends with a single (EOF, None).
    """

    # Token-type markers: empty classes compared by identity.
    class TEXT:
        """ Text """
    class SQRE_OPEN:
        """ Square bracket open """
    class SQRE_CLOSE:
        """ Square bracket close """
    class PIPE:
        """ Pipe symbol """
    class CURL_OPEN:
        """ Curly bracket open """
    class CURL_CLOSE:
        """ Curly bracket close """
    class ANGL_OPEN:
        """ Angular bracket open """
    class ANGL_CLOSE:
        """ Angular bracket close """
    class NEWPAR:
        """ New paragraph """
    class TAB_OPEN:
        """ Table open """
        # NOTE(review): declared but never yielded by lexer() below.
    class TAB_NEWLINE:
        """ Table new row """
    class TAB_CLOSE:
        """ Table close """
    class WHITESPACE:
        """ Whitespace """
    class EQUAL_SIGN:
        """ Equal sign """
    class APOSTROPHE:
        """ Apostrophe """
    class EOF:
        """ End Of File """

    def __init__(self, string):
        # Wrap the input in a generator so getchar() pulls one character
        # at a time and signals exhaustion via StopIteration.
        self.wikipwn = (a for a in string)

    def lexer(self):
        """ Generator: tokenize the input into (type, value) tuples.

        Pending plain text is flushed as a TEXT token before any special
        token and again before the final (EOF, None).
        """
        text = ''
        try:
            c = self.getchar()
            while True:
                if (c in ('[', ']', '{', '}', '<', '>', '=', '\'')):
                    # Run of a repeatable special character: flush the
                    # pending text, then count how often it repeats.
                    if text:
                        yield (lexer.TEXT, text)
                        text = ''
                    num = 1
                    try:
                        t = self.getchar()
                        while (t == c):
                            t = self.getchar()
                            num += 1
                            # NOTE(review): if the input ends inside the
                            # run, getchar() raises before num += 1, so
                            # the last matched character goes uncounted.
                    finally:
                        # finally: emit the token even when getchar()
                        # raised StopIteration mid-run (end of input).
                        if (c == '['): yield (lexer.SQRE_OPEN, num)
                        elif (c == ']'): yield (lexer.SQRE_CLOSE, num)
                        elif (c == '{'): yield (lexer.CURL_OPEN, num)
                        elif (c == '}'): yield (lexer.CURL_CLOSE, num)
                        elif (c == '<'): yield (lexer.ANGL_OPEN, num)
                        elif (c == '>'): yield (lexer.ANGL_CLOSE, num)
                        elif (c == '='): yield (lexer.EQUAL_SIGN, num)
                        elif (c == '\''): yield(lexer.APOSTROPHE, num)
                    c = t
                elif (c == '|'):
                    # Pipe: may start a table marker ('|-' or '|}').
                    if text:
                        yield (lexer.TEXT, text)
                        text = ''
                    try:
                        t = self.getchar()
                    except StopIteration:
                        # Input ends right after '|': emit it, then let
                        # StopIteration reach the outer handler.
                        yield (lexer.PIPE, None)
                        raise

                    if (t == '-'):
                        yield (lexer.TAB_NEWLINE, None)
                        c = self.getchar()
                    elif (t == '}'):
                        yield (lexer.TAB_CLOSE, None)
                        c = self.getchar()
                    else:
                        yield (lexer.PIPE, None)
                        c = t
                elif re.match('\s', c): # whitespace eater pro (TM)
                    if text:
                        yield (lexer.TEXT, text)
                        text = ''
                    ws = ''
                    try:
                        while re.match('\s', c):
                            ws += c
                            c = self.getchar() #eat up remaining whitespace
                    finally:
                        # Two or more newlines mean a paragraph break.
                        if (ws.count('\n') > 1):
                            yield (lexer.NEWPAR, ws)
                        else:
                            yield (lexer.WHITESPACE, ws)
                else:
                    # Ordinary character: accumulate into the text buffer.
                    text = text + c
                    c = self.getchar()
        except StopIteration: pass
        # Flush any remaining text, then mark the end of input.
        if text:
            yield (lexer.TEXT, text)
        yield (lexer.EOF, None)

    def getchar(self):
        # Next input character; raises StopIteration when the input is
        # exhausted (Python 2 generator protocol: .next()).
        return self.wikipwn.next()
+
+
+
+
\ No newline at end of file
Deleted: trunk/pywikipedia/parser.py
===================================================================
--- trunk/pywikipedia/parser.py 2007-07-27 14:26:25 UTC (rev 3913)
+++ trunk/pywikipedia/parser.py 2007-07-27 15:27:56 UTC (rev 3914)
@@ -1,217 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Mediawiki wikitext parser """
-#
-# (C) 2007 Merlijn 'valhallasw' van Deen
-#
-# Distributed under the terms of the MIT license.
-#
-__version__ = '$Id$'
-
-#
-
-import re
-import xml.dom.minidom as dom
-
-# System loosely based on 'the dragon book':
-# Compilers, Principles, Techniques and Tools, Aho, Sethi, Ullman, 1st edition, 1986
-
-class ParseError(Exception):
- """ booh """
-
-class parser:
- def __init__(self, string):
- self.wikipwn = [a for a in lexer(string).lexer()]
- self.counter = 0
-
- def expect(self, types, values=None):
- #print 'Expect: %s %s' % (types, values)
- token = self.wikipwn[self.counter]
- if (token[0] not in types):
- if values:
- raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
- else:
- raise ParseError("Expected one of (%r), got %r" % (types, token))
- if values:
- if (token[1] not in values):
- raise ParseError("Expected one of (%r, %r), got %r" % (types, values, token))
- self.counter += 1
- return token
-
- def parsetext(self):
- data = ''
- try:
- while(True): data += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError, e: pass
- return data
-
- def parseurl(self):
- pre = self.expect([lexer.SQRE_OPEN])[1]-1
- url = self.expect([lexer.TEXT])[1]
- # checkurl, raise ParseError
- try:
- ws = self.expect([lexer.WHITESPACE])[1]
- except ParseError:
- aft = self.expect([lexer.SQRE_CLOSE])[1]-1
- return ('['*pre, url, ']'*aft)
-
- if '\n' in ws:
- raise ParseError('No newlines allowed in external links')
- desc = ''
- try:
- while(True): desc += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError, e: pass
- aft = self.expect([lexer.SQRE_CLOSE])[1]-1
- return ('['*pre, url, desc, ']'*aft)
-
- def parsewikilink(self):
- pre = self.expect([lexer.SQRE_OPEN])[1]-2
- if (pre < 0): raise ParseError('Not a wiki link')
- page = ''
- try:
- while(True): page += self.expect([lexer.TEXT, lexer.WHITESPACE])[1]
- except ParseError,e: pass
- # if not re.match(...): raise ParseError
- try:
- aft = self.expect([lexer.SQRE_CLOSE])[1]-2
- except ParseError, e: pass
- else:
- if (aft < 0):
- raise ParseError('Not a wiki link')
- return ('['*pre, page, ']'*aft)
- print 'boom.'
- return 0
-
- def parseone(self):
- token = self.wikipwn[self.counter]
- if (token[0] == lexer.EOF):
- raise StopIteration
- if (token[0] == lexer.TEXT or token[0] == lexer.WHITESPACE): #text
- try: return self.parsetext();
- except ParseError, e: pass
- if (token[0] == lexer.SQRE_OPEN): #wikilink or external link
- begin = self.counter
- try: return self.parsewikilink();
- except ParseError, e: pass
- self.counter = begin
- try: return self.parseurl();
- except ParseError, e: pass
- self.counter = begin
- return ('[' * self.expect([lexer.SQRE_OPEN])[1])
-
- try: self.expect([lexer.SQRE_OPEN], [1])
- except ParseError, e: pass
- else: return self.parseurl()
-
- # Wikilink
- try: self.expect([lexer.SQRE_OPEN], [2]) #wikilink
- except ParseError, e: pass
-
-class lexer:
- class TEXT:
- """ Text """
- class SQRE_OPEN:
- """ Square bracket open """
- class SQRE_CLOSE:
- """ Square bracket close """
- class PIPE:
- """ Pipe symbol """
- class CURL_OPEN:
- """ Curly bracket open """
- class CURL_CLOSE:
- """ Curly bracket close """
- class ANGL_OPEN:
- """ Angular bracket open """
- class ANGL_CLOSE:
- """ Angular bracket close """
- class NEWPAR:
- """ New paragraph """
- class TAB_OPEN:
- """ Table open """
- class TAB_NEWLINE:
- """ Table new row """
- class TAB_CLOSE:
- """ Table close """
- class WHITESPACE:
- """ Whitespace """
- class EQUAL_SIGN:
- """ Equal sign """
- class APOSTROPHE:
- """ Apostrophe """
- class EOF:
- """ End Of File """
-
- def __init__(self, string):
- self.wikipwn = (a for a in string)
-
- def lexer(self):
- text = ''
- try:
- c = self.getchar()
- while True:
- if (c in ('[', ']', '{', '}', '<', '>', '=', '\'')):
- if text:
- yield (lexer.TEXT, text)
- text = ''
- num = 1
- try:
- t = self.getchar()
- while (t == c):
- t = self.getchar()
- num += 1
- finally:
- if (c == '['): yield (lexer.SQRE_OPEN, num)
- elif (c == ']'): yield (lexer.SQRE_CLOSE, num)
- elif (c == '{'): yield (lexer.CURL_OPEN, num)
- elif (c == '}'): yield (lexer.CURL_CLOSE, num)
- elif (c == '<'): yield (lexer.ANGL_OPEN, num)
- elif (c == '>'): yield (lexer.ANGL_CLOSE, num)
- elif (c == '='): yield (lexer.EQUAL_SIGN, num)
- elif (c == '\''): yield(lexer.APOSTROPHE, num)
- c = t
- elif (c == '|'):
- if text:
- yield (lexer.TEXT, text)
- text = ''
- try:
- t = self.getchar()
- except StopIteration:
- yield (lexer.PIPE, None)
- raise
-
- if (t == '-'):
- yield (lexer.TAB_NEWLINE, None)
- c = self.getchar()
- elif (t == '}'):
- yield (lexer.TAB_CLOSE, None)
- c = self.getchar()
- else:
- yield (lexer.PIPE, None)
- c = t
- elif re.match('\s', c): # whitespace eater pro (TM)
- if text:
- yield (lexer.TEXT, text)
- text = ''
- ws = ''
- try:
- while re.match('\s', c):
- ws += c
- c = self.getchar() #eat up remaining whitespace
- finally:
- if (ws.count('\n') > 1):
- yield (lexer.NEWPAR, ws)
- else:
- yield (lexer.WHITESPACE, ws)
- else:
- text = text + c
- c = self.getchar()
- except StopIteration: pass
- if text:
- yield (lexer.TEXT, text)
- yield (lexer.EOF, None)
-
- def getchar(self):
- return self.wikipwn.next()
-
-
-
-
\ No newline at end of file