http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11333
Revision: 11333 Author: xqt Date: 2013-04-04 05:54:15 +0000 (Thu, 04 Apr 2013) Log Message: ----------- Enable nested template handling for textlib.replaceExcept(). The old implementation could only handle templates nested up to level 2, and in some circumstances it fell into an infinite loop. Now we use code similar to that in textlib.extract_templates_and_params() and templatesWithParams() to hide and restore the templates.
MediaWiki variables and parser functions are handled as templates.
Bugfix for bug #3603994, bug #2819291, bug #3158761
Modified Paths: -------------- trunk/pywikipedia/pywikibot/textlib.py
Modified: trunk/pywikipedia/pywikibot/textlib.py =================================================================== --- trunk/pywikipedia/pywikibot/textlib.py 2013-04-03 22:39:10 UTC (rev 11332) +++ trunk/pywikipedia/pywikibot/textlib.py 2013-04-04 05:54:15 UTC (rev 11333) @@ -19,6 +19,7 @@ from HTMLParser import HTMLParser import config
+TEMP_REGEX = re.compile('{{(msg:)?(?P<name>[^{|]+?)(|(?P<params>[^{]+?))?}}')
def unescape(s): """Replace escaped HTML-special characters by their originals""" @@ -75,14 +76,6 @@ # source code readability. # TODO: handle nested tables. 'table': re.compile(r'(?ims)^{|.*?^|}|<table>.*?</table>'), - # templates with parameters often have whitespace that is used to - # improve wiki source code readability. - # 'template': re.compile(r'(?s){{.*?}}'), - # The regex above fails on nested templates. This regex can handle - # templates cascaded up to level 2, but no deeper. For arbitrary - # depth, we'd need recursion which can't be done in Python's re. - # After all, the language of correct parenthesis words is not regular. - 'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'), 'hyperlink': compileLinkR(), 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), # this matches internal wikilinks, but also interwiki, categories, and @@ -107,12 +100,15 @@ old = re.compile(old)
dontTouchRegexes = [] + except_templates = False for exc in exceptions: if isinstance(exc, basestring): # assume it's a reference to the exceptionRegexes dictionary # defined above. if exc in exceptionRegexes: dontTouchRegexes.append(exceptionRegexes[exc]) + elif exc == 'template': + except_templates = True else: # nowiki, noinclude, includeonly, timeline, math ond other # extensions @@ -125,6 +121,35 @@ else: # assume it's a regular expression dontTouchRegexes.append(exc) + + # mark templates + # don't care about mw variables and parser functions + if except_templates: + marker1 = findmarker(text) + marker2 = findmarker(text, u'##', u'#') + Rvalue = re.compile('{{{.+?}}}') + Rmarker1 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker1}) + Rmarker2 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker2}) + values = {} + count = 0 + for m in Rvalue.finditer(text): + count += 1 + item = m.group() + text = text.replace(item, '%s%d%s' % (marker2, count, marker2)) + values[count] = item + inside = {} + count = 0 + while TEMP_REGEX.search(text) is not None: + for m in TEMP_REGEX.finditer(text): + count += 1 + item = m.group() + text = text.replace(item, '%s%d%s' % (marker1, count, marker1)) + + for m2 in Rmarker1.finditer(item): + item = item.replace(m2.group(), inside[int(m2.group(1))]) + for m2 in Rmarker2.finditer(item): + item = item.replace(m2.group(), values[int(m2.group(1))]) + inside[count] = item index = 0 markerpos = len(text) while True: @@ -194,6 +219,12 @@ index = match.start() + len(replacement) markerpos = match.start() + len(replacement) text = text[:markerpos] + marker + text[markerpos:] + + if except_templates: # restore templates from dict + for m2 in Rmarker1.finditer(text): + text = text.replace(m2.group(), inside[int(m2.group(1))]) + for m2 in Rmarker2.finditer(text): + text = text.replace(m2.group(), values[int(m2.group(1))]) return text
@@ -863,8 +894,6 @@ marker4 = findmarker(thistxt, u'§§', u'§')
result = [] - Rtemplate = re.compile( - ur'{{(msg:)?(?P<name>[^{|]+?)(|(?P<params>[^{]+?))?}}') Rmath = re.compile(ur'<math>[^<]+</math>') Rvalue = re.compile(r'{{{.+?}}}') Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker)) @@ -891,8 +920,8 @@
inside = {} count = 0 - while Rtemplate.search(thistxt) is not None: - for m in Rtemplate.finditer(thistxt): + while TEMP_REGEX.search(thistxt) is not None: + for m in TEMP_REGEX.finditer(thistxt): # Make sure it is not detected again count += 1 text = m.group()