SVN: [7350] trunk/pywikipedia/cosmetic_changes.py - Pywikipedia-svn

2 Oct 2009

Revision: 7350
Author:   siebrand
Date:     2009-10-02 09:34:45 +0000 (Fri, 02 Oct 2009)
Log Message:
-----------
additional features copied from fixes.py
Patch by xqt
Modified Paths:
--------------
    trunk/pywikipedia/cosmetic_changes.py
Modified: trunk/pywikipedia/cosmetic_changes.py
===================================================================

--- trunk/pywikipedia/cosmetic_changes.py	2009-10-02 09:34:16 UTC (rev 7349)
+++ trunk/pywikipedia/cosmetic_changes.py	2009-10-02 09:34:45 UTC (rev 7350)
@@ -188,6 +188,8 @@
         text = self.validXhtml(text)
         text = self.removeUselessSpaces(text)
         text = self.removeNonBreakingSpaceBeforePercent(text)
+        text = self.fixSyntaxSave(text)
+        text = self.fixHtml(text)
         try:
             text = isbn.hyphenateIsbnNumbers(text)
         except isbn.InvalidIsbnException, error:
@@ -424,6 +426,39 @@
                 text = wikipedia.replaceExcept(text, r'{{([mM][sS][gG]:)?' + template + '(?P<parameters>|[^}]+|)}}', '', ['comment', 'math', 'nowiki', 'pre'])
         return text
+    #from fixes.py
+    def fixSyntaxSave(self, text):
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+        # external link in double brackets
+        text = wikipedia.replaceExcept(text, r'[[(?P<url>https?://[^]]+?)]]', r'[\g<url>]', exceptions)
+        # external link starting with double bracket
+        text = wikipedia.replaceExcept(text, r'[[(?P<url>https?://.+?)]', r'[\g<url>]', exceptions)
+        # external link and description separated by a dash, with
+        # whitespace in front of the dash, so that it is clear that
+        # the dash is not a legitimate part of the URL.
+        text = wikipedia.replaceExcept(text, r'[(?P<url>https?://[^|] \r\n]+?) +| *(?P<label>[^|]]+?)]', r'[\g<url> \g<label>]', exceptions)
+        # dash in external link, where the correct end of the URL can
+        # be detected from the file extension. It is very unlikely that
+        # this will cause mistakes.
+        text = wikipedia.replaceExcept(text, r'[(?P<url>https?://[^|] ]+?(.pdf|.html|.htm|.php|.asp|.aspx|.jsp)) *| *(?P<label>[^|]]+?)]', r'[\g<url> \g<label>]', exceptions)
+        return text
+
+    def fixHtml(self, text):
+        # Everything case-insensitive (?i)
+        # Keep in mind that MediaWiki automatically converts <br> to <br />
+        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
+        text = wikipedia.replaceExcept(text, r'(?i)<b>(.*?)</b>', r"'''\1'''" , exceptions)
+        text = wikipedia.replaceExcept(text, r'(?i)<strong>(.*?)</strong>', r"'''\1'''" , exceptions)
+        text = wikipedia.replaceExcept(text, r'(?i)<i>(.*?)</i>', r"''\1''" , exceptions)
+        text = wikipedia.replaceExcept(text, r'(?i)<em>(.*?)</em>', r"''\1''" , exceptions)
+        # horizontal line without attributes in a single line
+        text = wikipedia.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2', exceptions)
+        # horizontal line with attributes; can't be done with wiki syntax
+        # so we only make it XHTML compliant
+        text = wikipedia.replaceExcept(text, r'(?i)<hr ([^>/]+?)>', r'<hr \1 />', exceptions)
+        # TODO: maybe we can make the bot replace <p> tags with \r\n's.
+        return text
+
 class CosmeticChangesBot:
     def __init__(self, generator, acceptall = False):
         self.generator = generator