jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/697143 )
Change subject: [cleanup] remove deprecated extract_templates_and_params_* functions
......................................................................
[cleanup] remove deprecated extract_templates_and_params_* functions
A MediaWiki markup parser (mwparserfromhell or wikitextparser) is mandatory. Therefore:
- remove the deprecated extract_templates_and_params_mwpfh function
- remove the deprecated extract_templates_and_params_regex function
- remove the corresponding tests
Change-Id: I529e846e655cbceb21672b09b2329c11e25d3307
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 70 insertions(+), 324 deletions(-)
Approvals:
  JJMC89: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 696c6c3..bfb1ee2 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -55,13 +55,6 @@ # cache for replaceExcept to avoid recompile or regexes each call _regex_cache = {}
-# This regex is only for use by extract_templates_and_params_regex. -# It does not support template variables consisting of nested templates, -# system variables like {{CURRENTYEAR}}, or template variables like {{{1}}}. -_ETP_REGEX = re.compile( - r'{{(?:msg:)?(?P<name>[^{|]+?)' - r'(?:|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?)?)?}}') - # The regex below collects nested templates, providing simpler # identification of templates used at the top-level of wikitext. # It doesn't match {{{1|...}}}, however it also does not match templates @@ -1662,177 +1655,6 @@ return result
-@deprecated('extract_templates_and_params', since='20210329', - future_warning=True) -def extract_templates_and_params_mwpfh(text: str, - strip: bool = False) -> ETPType: - """DEPRECATED. Extract templates with params using mwparserfromhell.""" - global wikitextparser - saved_parser = wikitextparser - import mwparserfromhell as wikitextparser - result = extract_templates_and_params(text, strip=strip) - wikitextparser = saved_parser - return result - - -@deprecated('extract_templates_and_params', since='20210331', - future_warning=True) -def extract_templates_and_params_regex(text: str, - remove_disabled_parts: bool = True, - strip: bool = True) -> ETPType: - """DEPRECATED. Extract templates with params using a regex. - - This function should not be called directly. - - Use extract_templates_and_params, which will fallback to using this - regex based implementation when the mwparserfromhell implementation - is not used. - - @param text: The wikitext from which templates are extracted - @param strip: if enabled, strip arguments and values of templates - @return: list of template name and params - """ - # remove commented-out stuff etc. 
- if remove_disabled_parts: - thistxt = removeDisabledParts(text) - else: - thistxt = text - - # marker for inside templates or parameters - marker1 = findmarker(thistxt) - - # marker for links - marker2 = findmarker(thistxt, '##', '#') - - # marker for math - marker3 = findmarker(thistxt, '%%', '%') - - # marker for value parameter - marker4 = findmarker(thistxt, '§§', '§') - - result = [] - Rmath = re.compile(r'<math>[^<]+</math>') - Rvalue = re.compile(r'{{{.+?}}}') - Rmarker1 = re.compile(r'{m}(\d+){m}'.format(m=marker1)) - Rmarker2 = re.compile(r'{m}(\d+){m}'.format(m=marker2)) - Rmarker3 = re.compile(r'{m}(\d+){m}'.format(m=marker3)) - Rmarker4 = re.compile(r'{m}(\d+){m}'.format(m=marker4)) - - # Replace math with markers - maths = {} - count = 0 - for m in Rmath.finditer(thistxt): - count += 1 - item = m.group() - thistxt = thistxt.replace(item, '{m}{c}{m}' - .format(m=marker3, c=count)) - maths[count] = item - - values = {} - count = 0 - for m in Rvalue.finditer(thistxt): - count += 1 - # If we have digits between brackets, restoring from dict may fail. - # So we need to change the index. We have to search in the origin text. 
- while '}}}%d{{{' % count in text: - count += 1 - item = m.group() - thistxt = thistxt.replace(item, '{m}{c}{m}' - .format(m=marker4, c=count)) - values[count] = item - - inside = {} - seen = set() - count = 0 - while _ETP_REGEX.search(thistxt) is not None: - for m in _ETP_REGEX.finditer(thistxt): - # Make sure it is not detected again - item = m.group() - if item in seen: - continue # speed up - seen.add(item) - count += 1 - while '}}%d{{' % count in text: - count += 1 - thistxt = thistxt.replace(item, '{m}{c}{m}' - .format(m=marker1, c=count)) - - # Make sure stored templates don't contain markers - for m2 in Rmarker1.finditer(item): - item = item.replace(m2.group(), inside[int(m2.group(1))]) - for m2 in Rmarker3.finditer(item): - item = item.replace(m2.group(), maths[int(m2.group(1))]) - for m2 in Rmarker4.finditer(item): - item = item.replace(m2.group(), values[int(m2.group(1))]) - inside[count] = item - - # Name - name = m.group('name').strip() - m2 = Rmarker1.search(name) or Rmath.search(name) - if m2 is not None: - # Doesn't detect templates whose name changes, - # or templates whose name contains math tags - continue - - # {{#if: }} - if not name or name.startswith('#'): - continue - - # Parameters - paramString = m.group('params') - params = OrderedDict() - numbered_param = 1 - if paramString: - # Replace wikilinks with markers - links = {} - count2 = 0 - for m2 in pywikibot.link_regex.finditer(paramString): - count2 += 1 - item = m2.group(0) - paramString = paramString.replace( - item, '{m}{c}{m}'.format(m=marker2, c=count2)) - links[count2] = item - # Parse string - markedParams = paramString.split('|') - # Replace markers - for param in markedParams: - if '=' in param: - param_name, param_val = param.split('=', 1) - implicit_parameter = False - else: - param_name = str(numbered_param) - param_val = param - numbered_param += 1 - implicit_parameter = True - count = len(inside) - for m2 in Rmarker1.finditer(param_val): - param_val = 
param_val.replace(m2.group(), - inside[int(m2.group(1))]) - for m2 in Rmarker2.finditer(param_val): - param_val = param_val.replace(m2.group(), - links[int(m2.group(1))]) - for m2 in Rmarker3.finditer(param_val): - param_val = param_val.replace(m2.group(), - maths[int(m2.group(1))]) - for m2 in Rmarker4.finditer(param_val): - param_val = param_val.replace(m2.group(), - values[int(m2.group(1))]) - if strip: - param_name = param_name.strip() - if not implicit_parameter: - param_val = param_val.strip() - params[param_name] = param_val - - # Special case for {{a|}} which has an undetected parameter - if not params and '|' in m.group(0): - params = OrderedDict({'1': ''}) - - # Add it to the result - result.append((name, params)) - - return result - - def extract_templates_and_params_regex_simple(text: str): """ Extract top-level templates with params using only a simple regex. diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index 6904e2c..e53c245 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -18,7 +18,7 @@ from pywikibot.exceptions import UnknownSiteError from pywikibot.site._interwikimap import _IWEntry from pywikibot.textlib import MultiTemplateMatchBuilder, extract_sections -from pywikibot.tools import suppress_warnings +from pywikibot.tools import has_module, suppress_warnings from tests import mock from tests.aspects import ( DefaultDrySiteTestCase, @@ -303,10 +303,6 @@ 'Invalid category title extracted: nasty{{{!}}')
-WARNING_MSG = (r'.*extract_templates_and_params_.*' - r'is deprecated for .*; use extract_templates_and_params') - - class TestTemplateParams(TestCase):
"""Test to verify that template params extraction works.""" @@ -315,61 +311,58 @@
def _common_results(self, func): """Common cases.""" - with suppress_warnings(WARNING_MSG, category=FutureWarning): - self.assertEqual(func('{{a}}'), [('a', OrderedDict())]) + self.assertEqual(func('{{a}}'), [('a', OrderedDict())]) + self.assertEqual(func('{{ a}}'), [('a', OrderedDict())]) + self.assertEqual(func('{{a }}'), [('a', OrderedDict())]) + self.assertEqual(func('{{ a }}'), [('a', OrderedDict())]) + self.assertEqual(func('{{a|b=c}}'), + [('a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b|c=d}}'), + [('a', OrderedDict((('1', 'b'), ('c', 'd'))))]) + self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'), + [('a', OrderedDict((('b', 'c'), ('f', 'g'), + ('d', 'e'), ('1', ''))))]) + self.assertEqual(func('{{a|1=2|c=d}}'), + [('a', OrderedDict((('1', '2'), ('c', 'd'))))]) + self.assertEqual(func('{{a|c=d|1=2}}'), + [('a', OrderedDict((('c', 'd'), ('1', '2'))))]) + self.assertEqual(func('{{a|5=d|a=b}}'), + [('a', OrderedDict((('5', 'd'), ('a', 'b'))))]) + self.assertEqual(func('{{a|=2}}'), + [('a', OrderedDict((('', '2'), )))]) + self.assertEqual(func('{{a|}}'), + [('a', OrderedDict((('1', ''), )))]) + self.assertEqual(func('{{a|=|}}'), + [('a', OrderedDict((('', ''), ('1', ''))))]) + self.assertEqual(func('{{a||}}'), + [('a', OrderedDict((('1', ''), ('2', ''))))]) + self.assertEqual(func('{{a|b={{{1}}}}}'), + [('a', OrderedDict((('b', '{{{1}}}'), )))]) + self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'), + [('a', + OrderedDict((('b', + '<noinclude>{{{1}}}</noinclude>'), + )))]) + self.assertEqual(func('{{Template:a|b=c}}'), + [('Template:a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{template:a|b=c}}'), + [('template:a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{:a|b=c}}'), + [(':a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'), + [('a', OrderedDict((('b', '{{{1}}}'), + ('c', '{{{2}}}'))))]) + self.assertEqual(func('{{a|b=c}}{{d|e=f}}'), + [('a', OrderedDict((('b', 'c'), 
))), + ('d', OrderedDict((('e', 'f'), )))])
- with suppress_warnings(WARNING_MSG, category=FutureWarning): - self.assertEqual(func('{{ a}}'), [('a', OrderedDict())]) - self.assertEqual(func('{{a }}'), [('a', OrderedDict())]) - self.assertEqual(func('{{ a }}'), [('a', OrderedDict())]) - self.assertEqual(func('{{a|b=c}}'), - [('a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{a|b|c=d}}'), - [('a', OrderedDict((('1', 'b'), ('c', 'd'))))]) - self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'), - [('a', OrderedDict((('b', 'c'), ('f', 'g'), - ('d', 'e'), ('1', ''))))]) - self.assertEqual(func('{{a|1=2|c=d}}'), - [('a', OrderedDict((('1', '2'), ('c', 'd'))))]) - self.assertEqual(func('{{a|c=d|1=2}}'), - [('a', OrderedDict((('c', 'd'), ('1', '2'))))]) - self.assertEqual(func('{{a|5=d|a=b}}'), - [('a', OrderedDict((('5', 'd'), ('a', 'b'))))]) - self.assertEqual(func('{{a|=2}}'), - [('a', OrderedDict((('', '2'), )))]) - self.assertEqual(func('{{a|}}'), - [('a', OrderedDict((('1', ''), )))]) - self.assertEqual(func('{{a|=|}}'), - [('a', OrderedDict((('', ''), ('1', ''))))]) - self.assertEqual(func('{{a||}}'), - [('a', OrderedDict((('1', ''), ('2', ''))))]) - self.assertEqual(func('{{a|b={{{1}}}}}'), - [('a', OrderedDict((('b', '{{{1}}}'), )))]) - self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'), - [('a', - OrderedDict((('b', - '<noinclude>{{{1}}}</noinclude>'), - )))]) - self.assertEqual(func('{{Template:a|b=c}}'), - [('Template:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{template:a|b=c}}'), - [('template:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{:a|b=c}}'), - [(':a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'), - [('a', OrderedDict((('b', '{{{1}}}'), - ('c', '{{{2}}}'))))]) - self.assertEqual(func('{{a|b=c}}{{d|e=f}}'), - [('a', OrderedDict((('b', 'c'), ))), - ('d', OrderedDict((('e', 'f'), )))]) + # initial '{' and '}' should be ignored as outer wikitext + self.assertEqual(func('{{{a|b}}X}'), + [('a', 
OrderedDict((('1', 'b'), )))])
- # initial '{' and '}' should be ignored as outer wikitext - self.assertEqual(func('{{{a|b}}X}'), - [('a', OrderedDict((('1', 'b'), )))]) - - # sf.net bug 1575: unclosed template - self.assertEqual(func('{{a'), []) - self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())]) + # sf.net bug 1575: unclosed template + self.assertEqual(func('{{a'), []) + self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
def _unstripped(self, func): """Common cases of unstripped results.""" @@ -456,11 +449,12 @@ ('2', 'd')])), ('b', OrderedDict([('1', 'c')]))])
- def _mwpfh_passes(self, func, failing=False): + def _mwpfh_passes(self, func): """Common cases failing with wikitextparser but passes with mwpfh.
Probably the behaviour of regex or mwpfh is wrong. """ + failing = has_module('wikitextparser') patterns = [ '{{subst:a|b=c}}', '{{safesubst:a|b=c}}', @@ -480,25 +474,24 @@ @require_modules('mwparserfromhell') def test_extract_templates_params_mwpfh(self): """Test using mwparserfromhell.""" - func = textlib.extract_templates_and_params_mwpfh - with suppress_warnings(WARNING_MSG, category=FutureWarning): - self._common_results(func) - self._order_differs(func) - self._unstripped(func) - self._etp_regex_differs(func) - self._mwpfh_passes(func) + func = textlib.extract_templates_and_params + self._common_results(func) + self._order_differs(func) + self._unstripped(func) + self._etp_regex_differs(func) + self._mwpfh_passes(func)
- self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), - [('c', OrderedDict((('1', '{{d}}'), ))), - ('a', OrderedDict([('1', '{{c|{{d}}}}')])), - ('d', OrderedDict()) - ]) + self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), + [('c', OrderedDict((('1', '{{d}}'), ))), + ('a', OrderedDict([('1', '{{c|{{d}}}}')])), + ('d', OrderedDict()) + ])
- self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'), - [('c', OrderedDict((('1', '{{d|}}'), ))), - ('a', OrderedDict([('1', '{{c|{{d|}}}}')])), - ('d', OrderedDict([('1', '')])) - ]) + self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'), + [('c', OrderedDict((('1', '{{d|}}'), ))), + ('a', OrderedDict([('1', '{{c|{{d|}}}}')])), + ('d', OrderedDict([('1', '')])) + ])
@require_modules('mwparserfromhell') def test_extract_templates_params_parser_stripped(self): @@ -518,7 +511,7 @@ self._order_differs(func) self._unstripped(func) self._etp_regex_differs(func) - self._mwpfh_passes(func, failing=True) + self._mwpfh_passes(func)
self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), [('c', OrderedDict((('1', '{{d}}'), ))), @@ -532,42 +525,6 @@ ('d', OrderedDict([('1', '')])) ])
- def test_extract_templates_params_regex(self): - """Test using many complex regexes.""" - func = functools.partial(textlib.extract_templates_and_params_regex, - remove_disabled_parts=False, strip=False) - with suppress_warnings(WARNING_MSG, category=FutureWarning): - self._common_results(func) - self._order_differs(func) - self._unstripped(func) - # FIXME: {} is normal text - self.assertEqual(func('{{a|b={} }}'), []) - - def test_extract_templates_params_regex_stripped(self): - """Test using many complex regexes with stripping.""" - func = textlib.extract_templates_and_params_regex - with suppress_warnings(WARNING_MSG, category=FutureWarning): - self._common_results(func) - self._order_differs(func) - self._stripped(func) - - self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), - [('a', OrderedDict((('b', ''), )))]) - - # Identical to mwpfh - self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), - [('c', OrderedDict((('1', '{{d}}'), ))), - ('a', OrderedDict([('1', '{{c|{{d}}}}')])), - ('d', OrderedDict()) - ]) - - # However fails to correctly handle three levels of balanced - # brackets with empty parameters - self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'), - [('c', OrderedDict((('1', '{{d|}}}'), ))), - ('d', OrderedDict([('1', '}')])) - ]) - @require_modules('mwparserfromhell') def test_extract_templates_params(self): """Test that the normal entry point works.""" @@ -621,39 +578,6 @@ self.assertEqual(func('{{a|{{c|{{d|{{e|}}}} }} }} foo {{b}}'), [(None, OrderedDict())])
- def test_etp_regex(self): - """Test _ETP_REGEX.""" - func = textlib._ETP_REGEX.search - - self.assertIsNotNone(func('{{{1}}}')) - self.assertIsNotNone(func('{{a|b={{{1}}} }}')) - self.assertIsNotNone(func('{{a|b={{c}} }}')) - self.assertIsNotNone(func('{{a|b={{c}} }}')) - self.assertIsNotNone(func('{{a|b={{c|d=1}} }}')) - - self.assertIsNotNone(func('{{a|{{c}} }}')) - self.assertIsNotNone(func('{{a|{{c|d}} }}')) - - func = textlib._ETP_REGEX.match - - self.assertIsNone(func('{{{1}}}')) - - self.assertIsNotNone(func('{{#if:foo}}')) - self.assertIsNotNone(func('{{foo:}}')) - - self.assertIsNotNone(func('{{CURRENTYEAR}}')) - self.assertIsNotNone(func('{{1}}')) - - self.assertIsNone(func('{{a|b={{CURRENTYEAR}} }}')) - self.assertIsNone(func('{{a|b={{{1}}} }}')) - self.assertIsNone(func('{{a|b={{c}} }}')) - self.assertIsNone(func('{{a|b={{c|d=1}} }}')) - self.assertIsNone(func('{{a|b={} }}')) - self.assertIsNone(func('{{:a|b={{c|d=1}} }}')) - - self.assertIsNone(func('{{a|{{c}} }}')) - self.assertIsNone(func('{{a|{{c|d}} }}')) - def test_nested_template_regex_search(self): """Test NESTED_TEMPLATE_REGEX search.""" func = textlib.NESTED_TEMPLATE_REGEX.search