[Pywikipedia-l] SVN: [6240] trunk/pywikipedia/replace.py

multichill at svn.wikimedia.org multichill at svn.wikimedia.org
Fri Jan 9 20:19:56 UTC 2009


Revision: 6240
Author:   multichill
Date:     2009-01-09 20:19:56 +0000 (Fri, 09 Jan 2009)

Log Message:
-----------
Added the -dotall and -multiline command-line options, which set re.DOTALL and re.MULTILINE when the regular expressions are compiled. See http://docs.python.org/library/re.html#re.compile for details.

Modified Paths:
--------------
    trunk/pywikipedia/replace.py

Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py	2009-01-09 19:35:00 UTC (rev 6239)
+++ trunk/pywikipedia/replace.py	2009-01-09 20:19:56 UTC (rev 6240)
@@ -27,6 +27,11 @@
 
 -nocase           Use case insensitive regular expressions.
 
+-dotall           Make the dot match any character at all, including a newline.
+                  Without this flag, '.' will match anything except a newline.
+
+-multiline        '^' and '$' will now match begin and end of each line.
+
 -xmlstart         (Only works with -xml) Skip all articles in the XML dump
                   before the one specified (may also be given as
                   -xmlstart:Article).
@@ -489,6 +494,10 @@
     acceptall = False
     # Will become True if the user inputs the commandline parameter -nocase
     caseInsensitive = False
+    # Will become True if the user inputs the commandline parameter -dotall
+    dotall = False
+    # Will become True if the user inputs the commandline parameter -multiline
+    multiline = False
     # Which namespaces should be processed?
     # default to [] which means all namespaces will be processed
     namespaces = []
@@ -555,6 +564,10 @@
             recursive = True
         elif arg == '-nocase':
             caseInsensitive = True
+        elif arg == '-dotall':
+            dotall = True
+        elif arg == '-multiline':
+            multiline = True
         elif arg.startswith('-addcat:'):
             add_cat = arg[len('addcat:'):]
         elif arg.startswith('-namespace:'):
@@ -641,15 +654,21 @@
             exceptions = fix['exceptions']
         replacements = fix['replacements']
 
+    #Set the regular expression flags
+    flags = re.UNICODE
+    if caseInsensitive:
+        flags = flags | re.IGNORECASE
+    if dotall:
+        flags = flags | re.DOTALL
+    if multiline:
+        flags = flags | re.MULTILINE
+
     # Pre-compile all regular expressions here to save time later
     for i in range(len(replacements)):
         old, new = replacements[i]
         if not regex:
             old = re.escape(old)
-        if caseInsensitive:
-            oldR = re.compile(old, re.UNICODE | re.IGNORECASE)
-        else:
-            oldR = re.compile(old, re.UNICODE)
+        oldR = re.compile(old, flags)
         replacements[i] = oldR, new
 
     for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']:
@@ -657,12 +676,7 @@
             patterns = exceptions[exceptionCategory]
             if not regex:
                 patterns = [re.escape(pattern) for pattern in patterns]
-            if caseInsensitive:
-                patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE)
-                            for pattern in patterns]
-            else:
-                patterns = [re.compile(pattern, re.UNICODE)
-                            for pattern in patterns]
+            patterns = [re.compile(pattern, flags) for pattern in patterns]
             exceptions[exceptionCategory] = patterns
 
     if xmlFilename:





More information about the Pywikipedia-l mailing list