[Pywikipedia-l] SVN: [4908] trunk/pywikipedia/table2wiki.py

wikipedian at svn.wikimedia.org wikipedian at svn.wikimedia.org
Wed Jan 16 20:35:29 UTC 2008


Revision: 4908
Author:   wikipedian
Date:     2008-01-16 20:35:29 +0000 (Wed, 16 Jan 2008)

Log Message:
-----------
skip tables inside nowiki, comments etc.

Modified Paths:
--------------
    trunk/pywikipedia/table2wiki.py

Modified: trunk/pywikipedia/table2wiki.py
===================================================================
--- trunk/pywikipedia/table2wiki.py	2008-01-16 19:25:42 UTC (rev 4907)
+++ trunk/pywikipedia/table2wiki.py	2008-01-16 20:35:29 UTC (rev 4908)
@@ -146,20 +146,21 @@
     
     
         ##################
+        # Note that we added the ## characters in markActiveTables().
         # <table> tag with attributes, with more text on the same line
-        newTable = re.sub("(?i)[\r\n]*?<table (?P<attr>[\w\W]*?)>(?P<more>[\w\W]*?)[\r\n ]*",
+        newTable = re.sub("(?i)[\r\n]*?<##table## (?P<attr>[\w\W]*?)>(?P<more>[\w\W]*?)[\r\n ]*",
                          r"\r\n{| \g<attr>\r\n\g<more>", newTable)
         # <table> tag without attributes, with more text on the same line
-        newTable = re.sub("(?i)[\r\n]*?<table>(?P<more>[\w\W]*?)[\r\n ]*",
+        newTable = re.sub("(?i)[\r\n]*?<##table##>(?P<more>[\w\W]*?)[\r\n ]*",
                          r"\r\n{|\n\g<more>\r\n", newTable)
         # <table> tag with attributes, without more text on the same line
-        newTable = re.sub("(?i)[\r\n]*?<table (?P<attr>[\w\W]*?)>[\r\n ]*",
+        newTable = re.sub("(?i)[\r\n]*?<##table## (?P<attr>[\w\W]*?)>[\r\n ]*",
                          r"\r\n{| \g<attr>\r\n", newTable)
         # <table> tag without attributes, without more text on the same line
-        newTable = re.sub("(?i)[\r\n]*?<table>[\r\n ]*",
+        newTable = re.sub("(?i)[\r\n]*?<##table##>[\r\n ]*",
                          "\r\n{|\r\n", newTable)
         # end </table>
-        newTable = re.sub("(?i)[\s]*<\/table>",
+        newTable = re.sub("(?i)[\s]*<\/##table##>",
                           "\r\n|}", newTable)
         
         ##################
@@ -375,24 +376,32 @@
                 # why are only äöüß used, but not other special characters?
                 newTable, num = re.subn("(\r\n[A-Z]{1}[^\n\r]{200,}?[a-zäöüß]\.)\ ([A-ZÄÖÜ]{1}[^\n\r]{200,})",
                                        r"\1\r\n\2", newTable)
-        # show the changes for this table
-        if self.debug:
-            print table
-            print newTable
-        elif not self.quietMode:
-            wikipedia.showDiff(table, newTable)
         return newTable, warnings, warning_messages
 
+    def markActiveTables(self, text):
+        """
+        Marks all table start and end tags that are not disabled by nowiki
+        tags, comments etc.
+
+        We will then later only work on these marked tags.
+        """
+        tableStartTagR = re.compile("<table", re.IGNORECASE)
+        tableEndTagR = re.compile("</table>", re.IGNORECASE)
+
+        text = wikipedia.replaceExcept(text, tableStartTagR, "<##table##", exceptions = ['comment', 'math', 'nowiki', 'pre', 'source'])
+        text = wikipedia.replaceExcept(text, tableEndTagR, "</##table##>", exceptions = ['comment', 'math', 'nowiki', 'pre', 'source'])
+        return text
+
     def findTable(self, text):
         """
         Finds the first HTML table (which can contain nested tables) inside a
         text.
         Returns the table and the start and end position inside the text.
         """
-        # TODO: skip tables in HTML comments and nowiki tags
-        tableStartTagR = re.compile("<table", re.IGNORECASE)
-        tableEndTagR = re.compile("</table>", re.IGNORECASE)
-        m = tableStartTagR.search(text)
+        # Note that we added the ## characters in markActiveTables().
+        markedTableStartTagR = re.compile("<##table##", re.IGNORECASE)
+        markedTableEndTagR = re.compile("</##table##>", re.IGNORECASE)
+        m = markedTableStartTagR.search(text)
         if not m:
             return None, 0, 0
         else:
@@ -404,8 +413,8 @@
             depth = 1
             #i = start + 1
             while depth > 0:
-                nextStarting = tableStartTagR.search(text)
-                nextEnding = tableEndTagR.search(text)
+                nextStarting = markedTableStartTagR.search(text)
+                nextEnding = markedTableEndTagR.search(text)
                 if not nextEnding:
                     print "More opening than closing table tags. Skipping."
                     return None, 0, 0
@@ -420,13 +429,15 @@
                     depth -= 1
             end = offset
             return originalText[start:end], start, end
-                        
+
     def convertAllHTMLTables(self, text):
         '''
         Converts all HTML tables in text to wiki syntax.
         Returns the converted text, the number of converted tables and the
         number of warnings that occured.
         '''
+        text = self.markActiveTables(text)
+
         convertedTables = 0
         warningSum = 0
         warningMessages = u''
@@ -439,6 +450,12 @@
             print ">> Table %i <<" % (convertedTables + 1)
             # convert the current table
             newTable, warningsThisTable, warnMsgsThisTable = self.convertTable(table)
+            # show the changes for this table
+            if self.debug:
+                print table
+                print newTable
+            elif not self.quietMode:
+                wikipedia.showDiff(table.replace('##table##', 'table'), newTable)
             print ""
             warningSum += warningsThisTable
             for msg in warnMsgsThisTable:
@@ -447,7 +464,7 @@
             convertedTables += 1
 
         wikipedia.output(warningMessages)
-            
+
         return text, convertedTables, warningSum
 
     def treat(self, page):
@@ -468,6 +485,13 @@
             wikipedia.output(u'Skipping redirect %s' % page.title())
             return False
         newText, convertedTables, warningSum = self.convertAllHTMLTables(text)
+
+        # Check if there are any marked tags left
+        markedTableTagR = re.compile("<##table##|</##table##>", re.IGNORECASE)
+        if markedTableTagR.search(newText):
+            wikipedia.output(u'ERROR: not all marked table start or end tags processed!')
+            return
+
         if convertedTables == 0:
             wikipedia.output(u"No changes were necessary.")
         else:





More information about the Pywikipedia-l mailing list