jenkins-bot submitted this change.

View Change

Approvals: JJMC89: Looks good to me, approved; jenkins-bot: Verified
[cleanup] remove deprecated extract_templates_and_params_* functions

A MediaWiki markup parser (mwparserfromhell or wikitextparser) is mandatory.
Therefore:
- remove deprecated extract_templates_and_params_mwpfh function
- remove deprecated extract_templates_and_params_regex function
- remove tests accordingly

Change-Id: I529e846e655cbceb21672b09b2329c11e25d3307
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 70 insertions(+), 324 deletions(-)

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 696c6c3..bfb1ee2 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -55,13 +55,6 @@
# cache for replaceExcept to avoid recompile or regexes each call
_regex_cache = {}

-# This regex is only for use by extract_templates_and_params_regex.
-# It does not support template variables consisting of nested templates,
-# system variables like {{CURRENTYEAR}}, or template variables like {{{1}}}.
-_ETP_REGEX = re.compile(
- r'{{(?:msg:)?(?P<name>[^{\|]+?)'
- r'(?:\|(?P<params>[^{]+?(?:{[^{]+?}[^{]*?)?)?)?}}')
-
# The regex below collects nested templates, providing simpler
# identification of templates used at the top-level of wikitext.
# It doesn't match {{{1|...}}}, however it also does not match templates
@@ -1662,177 +1655,6 @@
return result


-@deprecated('extract_templates_and_params', since='20210329',
- future_warning=True)
-def extract_templates_and_params_mwpfh(text: str,
- strip: bool = False) -> ETPType:
- """DEPRECATED. Extract templates with params using mwparserfromhell."""
- global wikitextparser
- saved_parser = wikitextparser
- import mwparserfromhell as wikitextparser
- result = extract_templates_and_params(text, strip=strip)
- wikitextparser = saved_parser
- return result
-
-
-@deprecated('extract_templates_and_params', since='20210331',
- future_warning=True)
-def extract_templates_and_params_regex(text: str,
- remove_disabled_parts: bool = True,
- strip: bool = True) -> ETPType:
- """DEPRECATED. Extract templates with params using a regex.
-
- This function should not be called directly.
-
- Use extract_templates_and_params, which will fallback to using this
- regex based implementation when the mwparserfromhell implementation
- is not used.
-
- @param text: The wikitext from which templates are extracted
- @param strip: if enabled, strip arguments and values of templates
- @return: list of template name and params
- """
- # remove commented-out stuff etc.
- if remove_disabled_parts:
- thistxt = removeDisabledParts(text)
- else:
- thistxt = text
-
- # marker for inside templates or parameters
- marker1 = findmarker(thistxt)
-
- # marker for links
- marker2 = findmarker(thistxt, '##', '#')
-
- # marker for math
- marker3 = findmarker(thistxt, '%%', '%')
-
- # marker for value parameter
- marker4 = findmarker(thistxt, '§§', '§')
-
- result = []
- Rmath = re.compile(r'<math>[^<]+</math>')
- Rvalue = re.compile(r'{{{.+?}}}')
- Rmarker1 = re.compile(r'{m}(\d+){m}'.format(m=marker1))
- Rmarker2 = re.compile(r'{m}(\d+){m}'.format(m=marker2))
- Rmarker3 = re.compile(r'{m}(\d+){m}'.format(m=marker3))
- Rmarker4 = re.compile(r'{m}(\d+){m}'.format(m=marker4))
-
- # Replace math with markers
- maths = {}
- count = 0
- for m in Rmath.finditer(thistxt):
- count += 1
- item = m.group()
- thistxt = thistxt.replace(item, '{m}{c}{m}'
- .format(m=marker3, c=count))
- maths[count] = item
-
- values = {}
- count = 0
- for m in Rvalue.finditer(thistxt):
- count += 1
- # If we have digits between brackets, restoring from dict may fail.
- # So we need to change the index. We have to search in the origin text.
- while '}}}%d{{{' % count in text:
- count += 1
- item = m.group()
- thistxt = thistxt.replace(item, '{m}{c}{m}'
- .format(m=marker4, c=count))
- values[count] = item
-
- inside = {}
- seen = set()
- count = 0
- while _ETP_REGEX.search(thistxt) is not None:
- for m in _ETP_REGEX.finditer(thistxt):
- # Make sure it is not detected again
- item = m.group()
- if item in seen:
- continue # speed up
- seen.add(item)
- count += 1
- while '}}%d{{' % count in text:
- count += 1
- thistxt = thistxt.replace(item, '{m}{c}{m}'
- .format(m=marker1, c=count))
-
- # Make sure stored templates don't contain markers
- for m2 in Rmarker1.finditer(item):
- item = item.replace(m2.group(), inside[int(m2.group(1))])
- for m2 in Rmarker3.finditer(item):
- item = item.replace(m2.group(), maths[int(m2.group(1))])
- for m2 in Rmarker4.finditer(item):
- item = item.replace(m2.group(), values[int(m2.group(1))])
- inside[count] = item
-
- # Name
- name = m.group('name').strip()
- m2 = Rmarker1.search(name) or Rmath.search(name)
- if m2 is not None:
- # Doesn't detect templates whose name changes,
- # or templates whose name contains math tags
- continue
-
- # {{#if: }}
- if not name or name.startswith('#'):
- continue
-
- # Parameters
- paramString = m.group('params')
- params = OrderedDict()
- numbered_param = 1
- if paramString:
- # Replace wikilinks with markers
- links = {}
- count2 = 0
- for m2 in pywikibot.link_regex.finditer(paramString):
- count2 += 1
- item = m2.group(0)
- paramString = paramString.replace(
- item, '{m}{c}{m}'.format(m=marker2, c=count2))
- links[count2] = item
- # Parse string
- markedParams = paramString.split('|')
- # Replace markers
- for param in markedParams:
- if '=' in param:
- param_name, param_val = param.split('=', 1)
- implicit_parameter = False
- else:
- param_name = str(numbered_param)
- param_val = param
- numbered_param += 1
- implicit_parameter = True
- count = len(inside)
- for m2 in Rmarker1.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- inside[int(m2.group(1))])
- for m2 in Rmarker2.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- links[int(m2.group(1))])
- for m2 in Rmarker3.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- maths[int(m2.group(1))])
- for m2 in Rmarker4.finditer(param_val):
- param_val = param_val.replace(m2.group(),
- values[int(m2.group(1))])
- if strip:
- param_name = param_name.strip()
- if not implicit_parameter:
- param_val = param_val.strip()
- params[param_name] = param_val
-
- # Special case for {{a|}} which has an undetected parameter
- if not params and '|' in m.group(0):
- params = OrderedDict({'1': ''})
-
- # Add it to the result
- result.append((name, params))
-
- return result
-
-
def extract_templates_and_params_regex_simple(text: str):
"""
Extract top-level templates with params using only a simple regex.
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 6904e2c..e53c245 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -18,7 +18,7 @@
from pywikibot.exceptions import UnknownSiteError
from pywikibot.site._interwikimap import _IWEntry
from pywikibot.textlib import MultiTemplateMatchBuilder, extract_sections
-from pywikibot.tools import suppress_warnings
+from pywikibot.tools import has_module, suppress_warnings
from tests import mock
from tests.aspects import (
DefaultDrySiteTestCase,
@@ -303,10 +303,6 @@
'Invalid category title extracted: nasty{{{!}}')


-WARNING_MSG = (r'.*extract_templates_and_params_.*'
- r'is deprecated for .*; use extract_templates_and_params')
-
-
class TestTemplateParams(TestCase):

"""Test to verify that template params extraction works."""
@@ -315,61 +311,58 @@

def _common_results(self, func):
"""Common cases."""
- with suppress_warnings(WARNING_MSG, category=FutureWarning):
- self.assertEqual(func('{{a}}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{a}}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{a|b=c}}'),
+ [('a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{a|b|c=d}}'),
+ [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
+ self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
+ [('a', OrderedDict((('b', 'c'), ('f', 'g'),
+ ('d', 'e'), ('1', ''))))])
+ self.assertEqual(func('{{a|1=2|c=d}}'),
+ [('a', OrderedDict((('1', '2'), ('c', 'd'))))])
+ self.assertEqual(func('{{a|c=d|1=2}}'),
+ [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
+ self.assertEqual(func('{{a|5=d|a=b}}'),
+ [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
+ self.assertEqual(func('{{a|=2}}'),
+ [('a', OrderedDict((('', '2'), )))])
+ self.assertEqual(func('{{a|}}'),
+ [('a', OrderedDict((('1', ''), )))])
+ self.assertEqual(func('{{a|=|}}'),
+ [('a', OrderedDict((('', ''), ('1', ''))))])
+ self.assertEqual(func('{{a||}}'),
+ [('a', OrderedDict((('1', ''), ('2', ''))))])
+ self.assertEqual(func('{{a|b={{{1}}}}}'),
+ [('a', OrderedDict((('b', '{{{1}}}'), )))])
+ self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
+ [('a',
+ OrderedDict((('b',
+ '<noinclude>{{{1}}}</noinclude>'),
+ )))])
+ self.assertEqual(func('{{Template:a|b=c}}'),
+ [('Template:a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{template:a|b=c}}'),
+ [('template:a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{:a|b=c}}'),
+ [(':a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
+ [('a', OrderedDict((('b', '{{{1}}}'),
+ ('c', '{{{2}}}'))))])
+ self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
+ [('a', OrderedDict((('b', 'c'), ))),
+ ('d', OrderedDict((('e', 'f'), )))])

- with suppress_warnings(WARNING_MSG, category=FutureWarning):
- self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
- self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
- self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
- self.assertEqual(func('{{a|b=c}}'),
- [('a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{a|b|c=d}}'),
- [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
- self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
- [('a', OrderedDict((('b', 'c'), ('f', 'g'),
- ('d', 'e'), ('1', ''))))])
- self.assertEqual(func('{{a|1=2|c=d}}'),
- [('a', OrderedDict((('1', '2'), ('c', 'd'))))])
- self.assertEqual(func('{{a|c=d|1=2}}'),
- [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
- self.assertEqual(func('{{a|5=d|a=b}}'),
- [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
- self.assertEqual(func('{{a|=2}}'),
- [('a', OrderedDict((('', '2'), )))])
- self.assertEqual(func('{{a|}}'),
- [('a', OrderedDict((('1', ''), )))])
- self.assertEqual(func('{{a|=|}}'),
- [('a', OrderedDict((('', ''), ('1', ''))))])
- self.assertEqual(func('{{a||}}'),
- [('a', OrderedDict((('1', ''), ('2', ''))))])
- self.assertEqual(func('{{a|b={{{1}}}}}'),
- [('a', OrderedDict((('b', '{{{1}}}'), )))])
- self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
- [('a',
- OrderedDict((('b',
- '<noinclude>{{{1}}}</noinclude>'),
- )))])
- self.assertEqual(func('{{Template:a|b=c}}'),
- [('Template:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{template:a|b=c}}'),
- [('template:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{:a|b=c}}'),
- [(':a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
- [('a', OrderedDict((('b', '{{{1}}}'),
- ('c', '{{{2}}}'))))])
- self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
- [('a', OrderedDict((('b', 'c'), ))),
- ('d', OrderedDict((('e', 'f'), )))])
+ # initial '{' and '}' should be ignored as outer wikitext
+ self.assertEqual(func('{{{a|b}}X}'),
+ [('a', OrderedDict((('1', 'b'), )))])

- # initial '{' and '}' should be ignored as outer wikitext
- self.assertEqual(func('{{{a|b}}X}'),
- [('a', OrderedDict((('1', 'b'), )))])
-
- # sf.net bug 1575: unclosed template
- self.assertEqual(func('{{a'), [])
- self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
+ # sf.net bug 1575: unclosed template
+ self.assertEqual(func('{{a'), [])
+ self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])

def _unstripped(self, func):
"""Common cases of unstripped results."""
@@ -456,11 +449,12 @@
('2', 'd')])),
('b', OrderedDict([('1', 'c')]))])

- def _mwpfh_passes(self, func, failing=False):
+ def _mwpfh_passes(self, func):
"""Common cases failing with wikitextparser but passes with mwpfh.

Probably the behaviour of regex or mwpfh is wrong.
"""
+ failing = has_module('wikitextparser')
patterns = [
'{{subst:a|b=c}}',
'{{safesubst:a|b=c}}',
@@ -480,25 +474,24 @@
@require_modules('mwparserfromhell')
def test_extract_templates_params_mwpfh(self):
"""Test using mwparserfromhell."""
- func = textlib.extract_templates_and_params_mwpfh
- with suppress_warnings(WARNING_MSG, category=FutureWarning):
- self._common_results(func)
- self._order_differs(func)
- self._unstripped(func)
- self._etp_regex_differs(func)
- self._mwpfh_passes(func)
+ func = textlib.extract_templates_and_params
+ self._common_results(func)
+ self._order_differs(func)
+ self._unstripped(func)
+ self._etp_regex_differs(func)
+ self._mwpfh_passes(func)

- self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
- [('c', OrderedDict((('1', '{{d}}'), ))),
- ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
- ('d', OrderedDict())
- ])
+ self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+ [('c', OrderedDict((('1', '{{d}}'), ))),
+ ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
+ ('d', OrderedDict())
+ ])

- self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
- [('c', OrderedDict((('1', '{{d|}}'), ))),
- ('a', OrderedDict([('1', '{{c|{{d|}}}}')])),
- ('d', OrderedDict([('1', '')]))
- ])
+ self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+ [('c', OrderedDict((('1', '{{d|}}'), ))),
+ ('a', OrderedDict([('1', '{{c|{{d|}}}}')])),
+ ('d', OrderedDict([('1', '')]))
+ ])

@require_modules('mwparserfromhell')
def test_extract_templates_params_parser_stripped(self):
@@ -518,7 +511,7 @@
self._order_differs(func)
self._unstripped(func)
self._etp_regex_differs(func)
- self._mwpfh_passes(func, failing=True)
+ self._mwpfh_passes(func)

self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
[('c', OrderedDict((('1', '{{d}}'), ))),
@@ -532,42 +525,6 @@
('d', OrderedDict([('1', '')]))
])

- def test_extract_templates_params_regex(self):
- """Test using many complex regexes."""
- func = functools.partial(textlib.extract_templates_and_params_regex,
- remove_disabled_parts=False, strip=False)
- with suppress_warnings(WARNING_MSG, category=FutureWarning):
- self._common_results(func)
- self._order_differs(func)
- self._unstripped(func)
- # FIXME: {} is normal text
- self.assertEqual(func('{{a|b={} }}'), [])
-
- def test_extract_templates_params_regex_stripped(self):
- """Test using many complex regexes with stripping."""
- func = textlib.extract_templates_and_params_regex
- with suppress_warnings(WARNING_MSG, category=FutureWarning):
- self._common_results(func)
- self._order_differs(func)
- self._stripped(func)
-
- self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
- [('a', OrderedDict((('b', ''), )))])
-
- # Identical to mwpfh
- self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
- [('c', OrderedDict((('1', '{{d}}'), ))),
- ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
- ('d', OrderedDict())
- ])
-
- # However fails to correctly handle three levels of balanced
- # brackets with empty parameters
- self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
- [('c', OrderedDict((('1', '{{d|}}}'), ))),
- ('d', OrderedDict([('1', '}')]))
- ])
-
@require_modules('mwparserfromhell')
def test_extract_templates_params(self):
"""Test that the normal entry point works."""
@@ -621,39 +578,6 @@
self.assertEqual(func('{{a|{{c|{{d|{{e|}}}} }} }} foo {{b}}'),
[(None, OrderedDict())])

- def test_etp_regex(self):
- """Test _ETP_REGEX."""
- func = textlib._ETP_REGEX.search
-
- self.assertIsNotNone(func('{{{1}}}'))
- self.assertIsNotNone(func('{{a|b={{{1}}} }}'))
- self.assertIsNotNone(func('{{a|b={{c}} }}'))
- self.assertIsNotNone(func('{{a|b={{c}} }}'))
- self.assertIsNotNone(func('{{a|b={{c|d=1}} }}'))
-
- self.assertIsNotNone(func('{{a|{{c}} }}'))
- self.assertIsNotNone(func('{{a|{{c|d}} }}'))
-
- func = textlib._ETP_REGEX.match
-
- self.assertIsNone(func('{{{1}}}'))
-
- self.assertIsNotNone(func('{{#if:foo}}'))
- self.assertIsNotNone(func('{{foo:}}'))
-
- self.assertIsNotNone(func('{{CURRENTYEAR}}'))
- self.assertIsNotNone(func('{{1}}'))
-
- self.assertIsNone(func('{{a|b={{CURRENTYEAR}} }}'))
- self.assertIsNone(func('{{a|b={{{1}}} }}'))
- self.assertIsNone(func('{{a|b={{c}} }}'))
- self.assertIsNone(func('{{a|b={{c|d=1}} }}'))
- self.assertIsNone(func('{{a|b={} }}'))
- self.assertIsNone(func('{{:a|b={{c|d=1}} }}'))
-
- self.assertIsNone(func('{{a|{{c}} }}'))
- self.assertIsNone(func('{{a|{{c|d}} }}'))
-
def test_nested_template_regex_search(self):
"""Test NESTED_TEMPLATE_REGEX search."""
func = textlib.NESTED_TEMPLATE_REGEX.search

To view, visit change 697143. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I529e846e655cbceb21672b09b2329c11e25d3307
Gerrit-Change-Number: 697143
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged