[Pywikipedia-l] SVN: [6240] trunk/pywikipedia/replace.py
multichill at svn.wikimedia.org
multichill at svn.wikimedia.org
Fri Jan 9 20:19:56 UTC 2009
Revision: 6240
Author: multichill
Date: 2009-01-09 20:19:56 +0000 (Fri, 09 Jan 2009)
Log Message:
-----------
Added the -dotall and -multiline command-line options, which set the re.DOTALL and re.MULTILINE flags on the compiled regular expressions. For details on these flags, see http://docs.python.org/library/re.html#re.compile
Modified Paths:
--------------
trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py 2009-01-09 19:35:00 UTC (rev 6239)
+++ trunk/pywikipedia/replace.py 2009-01-09 20:19:56 UTC (rev 6240)
@@ -27,6 +27,11 @@
-nocase Use case insensitive regular expressions.
+-dotall Make the dot match any character at all, including a newline.
+ Without this flag, '.' will match anything except a newline.
+
+-multiline '^' and '$' will now match begin and end of each line.
+
-xmlstart (Only works with -xml) Skip all articles in the XML dump
before the one specified (may also be given as
-xmlstart:Article).
@@ -489,6 +494,10 @@
acceptall = False
# Will become True if the user inputs the commandline parameter -nocase
caseInsensitive = False
+ # Will become True if the user inputs the commandline parameter -dotall
+ dotall = False
+ # Will become True if the user inputs the commandline parameter -multiline
+ multiline = False
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
@@ -555,6 +564,10 @@
recursive = True
elif arg == '-nocase':
caseInsensitive = True
+ elif arg == '-dotall':
+ dotall = True
+ elif arg == '-multiline':
+ multiline = True
elif arg.startswith('-addcat:'):
add_cat = arg[len('addcat:'):]
elif arg.startswith('-namespace:'):
@@ -641,15 +654,21 @@
exceptions = fix['exceptions']
replacements = fix['replacements']
+ #Set the regular expression flags
+ flags = re.UNICODE
+ if caseInsensitive:
+ flags = flags | re.IGNORECASE
+ if dotall:
+ flags = flags | re.DOTALL
+ if multiline:
+ flags = flags | re.MULTILINE
+
# Pre-compile all regular expressions here to save time later
for i in range(len(replacements)):
old, new = replacements[i]
if not regex:
old = re.escape(old)
- if caseInsensitive:
- oldR = re.compile(old, re.UNICODE | re.IGNORECASE)
- else:
- oldR = re.compile(old, re.UNICODE)
+ oldR = re.compile(old, flags)
replacements[i] = oldR, new
for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']:
@@ -657,12 +676,7 @@
patterns = exceptions[exceptionCategory]
if not regex:
patterns = [re.escape(pattern) for pattern in patterns]
- if caseInsensitive:
- patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE)
- for pattern in patterns]
- else:
- patterns = [re.compile(pattern, re.UNICODE)
- for pattern in patterns]
+ patterns = [re.compile(pattern, flags) for pattern in patterns]
exceptions[exceptionCategory] = patterns
if xmlFilename:
More information about the Pywikipedia-l mailing list