jenkins-bot has submitted this change and it was merged.
Change subject: Add extract_templates_and_params strip
......................................................................
Add extract_templates_and_params strip
- Allow strip to be disabled
- Do not strip implicit parameters
Change-Id: Ic77f2115c5727ea8f573ec269886769e1c6b4333
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 141 insertions(+), 21 deletions(-)
Approvals:
  XZise: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 36c6e81..2b6c529 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -41,6 +41,7 @@ from pywikibot.tools import ( DeprecatedRegex, OrderedDict, + UnicodeType, issue_deprecation_warning )
@@ -1255,7 +1256,7 @@ # Functions dealing with templates # --------------------------------
-def extract_templates_and_params(text, remove_disabled_parts=None): +def extract_templates_and_params(text, remove_disabled_parts=None, strip=None): """Return a list of templates found in text.
Return value is a list of tuples. There is one tuple for each use of a @@ -1278,6 +1279,11 @@ mwpfh preserves whitespace in parameter names and values. regex excludes anything between <!-- --> before parsing the text.
+ If there are multiple numbered parameters in the wikitext for the same + position, MediaWiki will only use the last parameter value. + e.g. {{a| foo | 2 <!-- --> = bar | baz }} is {{a|1=foo|2=baz}} + To replicate that behaviour, enable both remove_disabled_parts and strip. + @param text: The wikitext from which templates are extracted @type text: unicode or string @param remove_disabled_parts: Remove disabled wikitext such as comments @@ -1285,6 +1291,11 @@ is not available or is disabled in the config, and disabled if mwparserfromhell is present and enabled in the config. @type remove_disabled_parts: bool or None + @param strip: if enabled, strip arguments and values of templates. + If None (default), this is enabled when mwparserfromhell + is not available or is disabled in the config, and disabled if + mwparserfromhell is present and enabled in the config. + @type strip: bool @return: list of template name and params @rtype: list of tuple """ @@ -1294,16 +1305,19 @@ if remove_disabled_parts is None: remove_disabled_parts = not use_mwparserfromhell
+ if strip is None: + strip = not use_mwparserfromhell + if remove_disabled_parts: text = removeDisabledParts(text)
if use_mwparserfromhell: - return extract_templates_and_params_mwpfh(text) + return extract_templates_and_params_mwpfh(text, strip) else: - return extract_templates_and_params_regex(text, False) + return extract_templates_and_params_regex(text, False, strip)
-def extract_templates_and_params_mwpfh(text): +def extract_templates_and_params_mwpfh(text, strip=False): """ Extract templates with params using mwparserfromhell.
@@ -1321,15 +1335,29 @@ """ code = mwparserfromhell.parse(text) result = [] + for template in code.filter_templates(recursive=True): params = OrderedDict() for param in template.params: - params[unicode(param.name)] = unicode(param.value) + if strip: + implicit_parameter = not param.showkey + key = param.name.strip() + if not implicit_parameter: + value = param.value.strip() + else: + value = UnicodeType(param.value) + else: + key = UnicodeType(param.name) + value = UnicodeType(param.value) + + params[key] = value + result.append((unicode(template.name.strip()), params)) return result
-def extract_templates_and_params_regex(text, remove_disabled_parts=True): +def extract_templates_and_params_regex(text, remove_disabled_parts=True, + strip=True): """ Extract templates with params using a regex with additional processing.
@@ -1473,10 +1501,12 @@ for param in markedParams: if "=" in param: param_name, param_val = param.split("=", 1) + implicit_parameter = False else: param_name = unicode(numbered_param) param_val = param numbered_param += 1 + implicit_parameter = True count = len(inside) for m2 in Rmarker1.finditer(param_val): param_val = param_val.replace(m2.group(), @@ -1490,7 +1520,11 @@ for m2 in Rmarker4.finditer(param_val): param_val = param_val.replace(m2.group(), values[int(m2.group(1))]) - params[param_name.strip()] = param_val.strip() + if strip: + param_name = param_name.strip() + if not implicit_parameter: + param_val = param_val.strip() + params[param_name] = param_val
# Special case for {{a|}} which has an undetected parameter if not params and '|' in m.group(0): diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index 0b0205e..5e84526 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -324,9 +324,6 @@ [('a', OrderedDict((('b', 'c'), ))), ('d', OrderedDict((('e', 'f'), )))])
- self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), - [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))]) - # initial '{' and '}' should be ignored as outer wikitext self.assertEqual(func('{{{a|b}}X}'), [('a', OrderedDict((('1', 'b'), )))]) @@ -335,13 +332,52 @@ self.assertEqual(func('{{a'), []) self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
- def _etp_regex_differs(self, func): - """Common cases not handled the same by ETP_REGEX.""" + def _unstripped(self, func): + """Common cases of unstripped results.""" + self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), + [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))]) + + self.assertEqual(func('{{a| }}'), [('a', OrderedDict((('1', ' '), )))]) + self.assertEqual(func('{{a| | }}'), [('a', OrderedDict((('1', ' '), ('2', ' '))))]) + self.assertEqual(func('{{a| =|}}'), [('a', OrderedDict(((' ', ''), ('1', ''))))]) + self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict(((' b', 'c'), )))]) self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b ', 'c'), )))]) self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', ' c'), )))]) self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c '), )))])
+ self.assertEqual(func('{{a| foo |2= bar }}'), + [('a', OrderedDict((('1', ' foo '), ('2', ' bar '))))]) + + # The correct entry 'bar' is removed + self.assertEqual(func('{{a| foo |2= bar | baz }}'), + [('a', OrderedDict((('1', ' foo '), ('2', ' baz '))))]) + # However whitespace prevents the correct item from being removed + self.assertEqual(func('{{a| foo | 2 = bar | baz }}'), + [('a', OrderedDict((('1', ' foo '), (' 2 ', ' bar '), ('2', ' baz '))))]) + + def _stripped(self, func): + """Common cases of stripped results.""" + self.assertEqual(func('{{a| }}'), [('a', OrderedDict((('1', ' '), )))]) + self.assertEqual(func('{{a| | }}'), [('a', OrderedDict((('1', ' '), ('2', ' '))))]) + self.assertEqual(func('{{a| =|}}'), [('a', OrderedDict((('', ''), ('1', ''))))]) + + self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c'), )))]) + + self.assertEqual(func('{{a| foo |2= bar }}'), + [('a', OrderedDict((('1', ' foo '), ('2', 'bar'))))]) + + # 'bar' is always removed + self.assertEqual(func('{{a| foo |2= bar | baz }}'), + [('a', OrderedDict((('1', ' foo '), ('2', ' baz '))))]) + self.assertEqual(func('{{a| foo | 2 = bar | baz }}'), + [('a', OrderedDict((('1', ' foo '), ('2', ' baz '))))]) + + def _etp_regex_differs(self, func): + """Common cases not handled the same by ETP_REGEX.""" # inner {} should be treated as part of the value self.assertEqual(func('{{a|b={} }}'), [('a', OrderedDict((('b', '{} '), )))])
@@ -367,6 +403,7 @@ func = textlib.extract_templates_and_params_mwpfh self._common_results(func) self._order_differs(func) + self._unstripped(func) self._etp_regex_differs(func)
self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), @@ -381,21 +418,34 @@ ('d', OrderedDict([('1', '')])) ])
+ @require_modules('mwparserfromhell') + def test_extract_templates_params_mwpfh_stripped(self): + """Test using mwparserfromhell with stripping.""" + func = functools.partial(textlib.extract_templates_and_params_mwpfh, + strip=True) + + self._common_results(func) + self._order_differs(func) + self._stripped(func) + def test_extract_templates_params_regex(self): """Test using many complex regexes.""" func = functools.partial(textlib.extract_templates_and_params_regex, - remove_disabled_parts=False) + remove_disabled_parts=False, strip=False) self._common_results(func) self._order_differs(func) + self._unstripped(func)
self.assertEqual(func('{{a|b={} }}'), []) # FIXME: {} is normal text
- self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c'), )))]) - + def test_extract_templates_params_regex_stripped(self): + """Test using many complex regexes with stripping.""" func = textlib.extract_templates_and_params_regex + + self._common_results(func) + self._order_differs(func) + self._stripped(func) + self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), [('a', OrderedDict((('b', ''), )))])
@@ -415,9 +465,16 @@
def test_extract_templates_params(self): """Test that the normal entry point works.""" - self._common_results( - functools.partial(textlib.extract_templates_and_params, - remove_disabled_parts=False)) + func = functools.partial(textlib.extract_templates_and_params, + remove_disabled_parts=False, strip=False) + + self._common_results(func) + self._unstripped(func) + + func = functools.partial(textlib.extract_templates_and_params, + remove_disabled_parts=False, strip=True) + self._common_results(func) + self._stripped(func)
def test_template_simple_regex(self): """Test using simple regex.""" @@ -569,12 +626,14 @@ def extract_mwpfh(self, text, *args, **kwargs): """Patched call to extract_templates_and_params_mwpfh.""" self._text = text + self._args = args self._mwpfh = True
@PatchingTestCase.patched(textlib, 'extract_templates_and_params_regex') def extract_regex(self, text, *args, **kwargs): """Patched call to extract_templates_and_params_regex.""" self._text = text + self._args = args self._mwpfh = False
def test_removing_disabled_parts_regex(self): @@ -604,6 +663,33 @@ self.assertEqual(self._text, '{{a<!-- -->}}') self.assertTrue(self._mwpfh)
+ def test_strip_regex(self): + """Test stripping values when using the regex variant.""" + self.patch(config, 'use_mwparserfromhell', False) + textlib.extract_templates_and_params('{{a| foo }}', False, True) + self.assertEqual(self._args, (False, True)) + self.assertFalse(self._mwpfh) + textlib.extract_templates_and_params('{{a| foo }}', False, False) + self.assertEqual(self._args, (False, False)) + self.assertFalse(self._mwpfh) + textlib.extract_templates_and_params('{{a| foo }}', False) + self.assertEqual(self._args, (False, True)) + self.assertFalse(self._mwpfh) + + @require_modules('mwparserfromhell') + def test_strip_mwpfh(self): + """Test stripping values when using the mwpfh variant.""" + self.patch(config, 'use_mwparserfromhell', True) + textlib.extract_templates_and_params('{{a| foo }}', None, True) + self.assertEqual(self._args, (True, )) + self.assertTrue(self._mwpfh) + textlib.extract_templates_and_params('{{a| foo }}', None, False) + self.assertEqual(self._args, (False, )) + self.assertTrue(self._mwpfh) + textlib.extract_templates_and_params('{{a| foo }}') + self.assertEqual(self._args, (False, )) + self.assertTrue(self._mwpfh) +
class TestReplaceLinks(TestCase):