Revision: 6240
Author: multichill
Date: 2009-01-09 20:19:56 +0000 (Fri, 09 Jan 2009)

Log Message:
-----------
Added dotall and multiline options for the regular expressions. Info at http://docs.python.org/library/re.html#re.compile

Modified Paths:
--------------
    trunk/pywikipedia/replace.py

Modified: trunk/pywikipedia/replace.py =================================================================== --- trunk/pywikipedia/replace.py 2009-01-09 19:35:00 UTC (rev 6239) +++ trunk/pywikipedia/replace.py 2009-01-09 20:19:56 UTC (rev 6240) @@ -27,6 +27,11 @@
-nocase Use case insensitive regular expressions.
+-dotall Make the dot match any character at all, including a newline. + Without this flag, '.' will match anything except a newline. + +-multiline '^' and '$' will now match begin and end of each line. + -xmlstart (Only works with -xml) Skip all articles in the XML dump before the one specified (may also be given as -xmlstart:Article). @@ -489,6 +494,10 @@ acceptall = False # Will become True if the user inputs the commandline parameter -nocase caseInsensitive = False + # Will become True if the user inputs the commandline parameter -dotall + dotall = False + # Will become True if the user inputs the commandline parameter -multiline + multiline = False # Which namespaces should be processed? # default to [] which means all namespaces will be processed namespaces = [] @@ -555,6 +564,10 @@ recursive = True elif arg == '-nocase': caseInsensitive = True + elif arg == '-dotall': + dotall = True + elif arg == '-multiline': + multiline = True elif arg.startswith('-addcat:'): add_cat = arg[len('addcat:'):] elif arg.startswith('-namespace:'): @@ -641,15 +654,21 @@ exceptions = fix['exceptions'] replacements = fix['replacements']
+ #Set the regular expression flags + flags = re.UNICODE + if caseInsensitive: + flags = flags | re.IGNORECASE + if dotall: + flags = flags | re.DOTALL + if multiline: + flags = flags | re.MULTILINE + # Pre-compile all regular expressions here to save time later for i in range(len(replacements)): old, new = replacements[i] if not regex: old = re.escape(old) - if caseInsensitive: - oldR = re.compile(old, re.UNICODE | re.IGNORECASE) - else: - oldR = re.compile(old, re.UNICODE) + oldR = re.compile(old, flags) replacements[i] = oldR, new
for exceptionCategory in ['title', 'require-title', 'text-contains', 'inside']: @@ -657,12 +676,7 @@ patterns = exceptions[exceptionCategory] if not regex: patterns = [re.escape(pattern) for pattern in patterns] - if caseInsensitive: - patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE) - for pattern in patterns] - else: - patterns = [re.compile(pattern, re.UNICODE) - for pattern in patterns] + patterns = [re.compile(pattern, flags) for pattern in patterns] exceptions[exceptionCategory] = patterns
if xmlFilename:
pywikipedia-l@lists.wikimedia.org