http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11335
Revision: 11335
Author:   xqt
Date:     2013-04-04 07:29:29 +0000 (Thu, 04 Apr 2013)
Log Message:
-----------
update from trunk r11333
Modified Paths:
--------------
    branches/rewrite/pywikibot/textlib.py
Modified: branches/rewrite/pywikibot/textlib.py
===================================================================
--- branches/rewrite/pywikibot/textlib.py	2013-04-04 06:30:35 UTC (rev 11334)
+++ branches/rewrite/pywikibot/textlib.py	2013-04-04 07:29:29 UTC (rev 11335)
@@ -19,6 +19,7 @@
 from HTMLParser import HTMLParser
 import config2 as config
+TEMP_REGEX = re.compile('{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
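A minimal sketch of how the new module-level TEMP_REGEX behaves, assuming the pattern above escapes its pipes as in the committed source ('[^{\|]' and '(\|...)'); the sample wikitext is illustrative only. Because '{' is excluded from both the name and params groups, a match can never span a nested call, so on nested input the innermost template matches first, which is what lets the marker loops added further down peel templates from the inside out.

    import re

    # Same pattern as the line added above (raw string used here for clarity);
    # '{' is excluded from name and params, so nested calls match innermost-first.
    TEMP_REGEX = re.compile(r'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')

    sample = u'{{outer|a={{inner|1}}|b=2}}'   # illustrative wikitext, not from the diff
    m = TEMP_REGEX.search(sample)
    print(m.group('name'))    # -> inner
    print(m.group('params'))  # -> 1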
 def unescape(s):
     """Replace escaped HTML-special characters by their originals"""
@@ -75,14 +76,6 @@
         # source code readability.
         # TODO: handle nested tables.
         'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
-        # templates with parameters often have whitespace that is used to
-        # improve wiki source code readability.
-        # 'template': re.compile(r'(?s){{.*?}}'),
-        # The regex above fails on nested templates. This regex can handle
-        # templates cascaded up to level 2, but no deeper. For arbitrary
-        # depth, we'd need recursion which can't be done in Python's re.
-        # After all, the language of correct parenthesis words is not regular.
-        'template': re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
         'hyperlink': compileLinkR(),
         'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
         # this matches internal wikilinks, but also interwiki, categories, and
@@ -107,12 +100,15 @@
         old = re.compile(old)
     dontTouchRegexes = []
+    except_templates = False
     for exc in exceptions:
         if isinstance(exc, basestring):
             # assume it's a reference to the exceptionRegexes dictionary
             # defined above.
             if exc in exceptionRegexes:
                 dontTouchRegexes.append(exceptionRegexes[exc])
+            elif exc == 'template':
+                except_templates = True
             else:
                 # nowiki, noinclude, includeonly, timeline, math ond other
                 # extensions
@@ -125,6 +121,35 @@
         else:
             # assume it's a regular expression
             dontTouchRegexes.append(exc)
+
+    # mark templates
+    # don't care about mw variables and parser functions
+    if except_templates:
+        marker1 = findmarker(text)
+        marker2 = findmarker(text, u'##', u'#')
+        Rvalue = re.compile('{{{.+?}}}')
+        Rmarker1 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker1})
+        Rmarker2 = re.compile('%(mark)s(\d+)%(mark)s' % {'mark': marker2})
+        values = {}
+        count = 0
+        for m in Rvalue.finditer(text):
+            count += 1
+            item = m.group()
+            text = text.replace(item, '%s%d%s' % (marker2, count, marker2))
+            values[count] = item
+        inside = {}
+        count = 0
+        while TEMP_REGEX.search(text) is not None:
+            for m in TEMP_REGEX.finditer(text):
+                count += 1
+                item = m.group()
+                text = text.replace(item, '%s%d%s' % (marker1, count, marker1))
+
+                for m2 in Rmarker1.finditer(item):
+                    item = item.replace(m2.group(), inside[int(m2.group(1))])
+                for m2 in Rmarker2.finditer(item):
+                    item = item.replace(m2.group(), values[int(m2.group(1))])
+                inside[count] = item
     index = 0
     markerpos = len(text)
     while True:
@@ -194,6 +219,12 @@
             index = match.start() + len(replacement)
             markerpos = match.start() + len(replacement)
     text = text[:markerpos] + marker + text[markerpos:]
+
+    if except_templates:  # restore templates from dict
+        for m2 in Rmarker1.finditer(text):
+            text = text.replace(m2.group(), inside[int(m2.group(1))])
+        for m2 in Rmarker2.finditer(text):
+            text = text.replace(m2.group(), values[int(m2.group(1))])
     return text
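The block added above hides every template behind a unique marker before the replacement loop runs, and the second addition restores them afterwards; the while loop re-applies TEMP_REGEX until nothing matches, so nested calls are swallowed from the innermost level outwards. A simplified standalone sketch of that hide-and-restore idea (hard-coded marker instead of findmarker(), and no handling of {{{parameter}}} values, so it illustrates only the control flow, not the exact pywikibot code):

    import re

    TEMP_REGEX = re.compile(r'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')

    def hide_templates(text, marker=u'@@'):
        """Replace each template call with '@@<n>@@', innermost-first.

        Returns the marked text and a dict n -> original wikitext, with nested
        markers already expanded back inside each stored item (mirroring the
        'inside' dict in the diff).  The marker is hard-coded here; the real
        code picks one that does not occur in the text via findmarker().
        """
        rmarker = re.compile(u'%(m)s(\\d+)%(m)s' % {'m': marker})
        inside = {}
        count = 0
        while TEMP_REGEX.search(text) is not None:
            for m in TEMP_REGEX.finditer(text):
                count += 1
                item = m.group()
                text = text.replace(item, u'%s%d%s' % (marker, count, marker))
                # re-expand markers of deeper templates stored earlier
                for m2 in rmarker.finditer(item):
                    item = item.replace(m2.group(), inside[int(m2.group(1))])
                inside[count] = item
        return text, inside

    def restore_templates(text, inside, marker=u'@@'):
        rmarker = re.compile(u'%(m)s(\\d+)%(m)s' % {'m': marker})
        for m2 in rmarker.finditer(text):
            text = text.replace(m2.group(), inside[int(m2.group(1))])
        return text

    masked, saved = hide_templates(u'x {{a|{{b}}}} y')
    # any regex replacement applied to `masked` can no longer touch template text
    print(restore_templates(masked, saved) == u'x {{a|{{b}}}} y')  # -> True

Running a replacement over the masked text and only then restoring the markers is what gives the new 'template' exception its guarantee that transcluded calls are left untouched.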
@@ -831,7 +862,7 @@
 #----------------------------------
 def extract_templates_and_params(text):
-    """Return list of template calls found in text.
+    """Return a list of templates found in text.
     Return value is a list of tuples. There is one tuple for each use of a
     template in the page, with the template title as the first entry and a
@@ -840,6 +871,8 @@
     with an integer value corresponding to its position among the unnnamed
     parameters, and if this results multiple parameters with the same name
     only the last value provided will be returned.
+    @param text: The wikitext from which templates are extracted
+    @type text: unicode or string
""" # remove commented-out stuff etc. @@ -858,8 +891,6 @@ marker4 = findmarker(thistxt, u'§§', u'§')
     result = []
-    Rtemplate = re.compile(
-        ur'{{(msg:)?(?P<name>[^{\|]+?)(\|(?P<params>[^{]+?))?}}')
     Rmath = re.compile(ur'<math>[^<]+</math>')
     Rvalue = re.compile(r'{{{.+?}}}')
     Rmarker = re.compile(ur'%s(\d+)%s' % (marker, marker))
@@ -886,8 +917,8 @@
     inside = {}
     count = 0
-    while Rtemplate.search(thistxt) is not None:
-        for m in Rtemplate.finditer(thistxt):
+    while TEMP_REGEX.search(thistxt) is not None:
+        for m in TEMP_REGEX.finditer(thistxt):
             # Make sure it is not detected again
             count += 1
             text = m.group()
@@ -909,6 +940,35 @@
                 # Doesn't detect templates whose name changes,
                 # or templates whose name contains math tags
                 continue
+
+            # {{#if: }}
+            if name.startswith('#'):
+                continue
+
+## TODO: merged from wikipedia.py - implement the following
+##            if self.site().isInterwikiLink(name):
+##                continue
+##            # {{DEFAULTSORT:...}}
+##            defaultKeys = self.site().versionnumber() > 13 and \
+##                          self.site().getmagicwords('defaultsort')
+##            # It seems some wikis does not have this magic key
+##            if defaultKeys:
+##                found = False
+##                for key in defaultKeys:
+##                    if name.startswith(key):
+##                        found = True
+##                        break
+##                if found: continue
+##
+##            try:
+##                name = Page(self.site(), name).title()
+##            except InvalidTitle:
+##                if name:
+##                    output(
+##                        u"Page %s contains invalid template name {{%s}}."
+##                        % (self.title(), name.strip()))
+##                continue
+
             # Parameters
             paramString = m.group('params')
             params = {}
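Per the docstring amended above, extract_templates_and_params() returns one (name, parameters) tuple per template call, with unnamed parameters keyed by their position among the unnamed arguments. A usage sketch follows; the commented result is what the docstring describes, and exact whitespace handling or key types may vary between pywikibot versions.

    from pywikibot import textlib

    wikitext = u'{{Infobox person|name=Ada Lovelace|1815}} {{citation needed}}'
    result = textlib.extract_templates_and_params(wikitext)

    # Expected, following the docstring (unnamed params keyed by position):
    #   [(u'Infobox person', {u'name': u'Ada Lovelace', u'1': u'1815'}),
    #    (u'citation needed', {})]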