[Pywikibot-commits] [Gerrit] Add extract_templates_and_params strip - change (pywikibot/core)

28 Sep 2015

jenkins-bot has submitted this change and it was merged.
Change subject: Add extract_templates_and_params strip
......................................................................
Add extract_templates_and_params strip
- Allow strip to be disabled
- Do not strip implicit parameters
Change-Id: Ic77f2115c5727ea8f573ec269886769e1c6b4333
---
M pywikibot/textlib.py
M tests/textlib_tests.py
2 files changed, 141 insertions(+), 21 deletions(-)
Approvals:
  XZise: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 36c6e81..2b6c529 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -41,6 +41,7 @@
 from pywikibot.tools import (
     DeprecatedRegex,
     OrderedDict,
+    UnicodeType,
     issue_deprecation_warning
 )
@@ -1255,7 +1256,7 @@
 # Functions dealing with templates
 # --------------------------------
-def extract_templates_and_params(text, remove_disabled_parts=None):
+def extract_templates_and_params(text, remove_disabled_parts=None, strip=None):
     """Return a list of templates found in text.
     Return value is a list of tuples. There is one tuple for each use of a
@@ -1278,6 +1279,11 @@
     mwpfh preserves whitespace in parameter names and values.  regex excludes
     anything between <!-- --> before parsing the text.
+    If there are multiple numbered parameters in the wikitext for the same
+    position, MediaWiki will only use the last parameter value.
+    e.g. {{a| foo | 2 <!-- --> = bar | baz }} is {{a|1=foo|2=baz}}
+    To replicate that behaviour, enable both remove_disabled_parts and strip.
+
     @param text: The wikitext from which templates are extracted
     @type text: unicode or string
     @param remove_disabled_parts: Remove disabled wikitext such as comments
@@ -1285,6 +1291,11 @@
         is not available or is disabled in the config, and disabled if
         mwparserfromhell is present and enabled in the config.
     @type remove_disabled_parts: bool or None
+    @param strip: if enabled, strip arguments and values of templates.
+        If None (default), this is enabled when mwparserfromhell
+        is not available or is disabled in the config, and disabled if
+        mwparserfromhell is present and enabled in the config.
+    @type strip: bool
     @return: list of template name and params
     @rtype: list of tuple
     """
@@ -1294,16 +1305,19 @@
     if remove_disabled_parts is None:
         remove_disabled_parts = not use_mwparserfromhell
+    if strip is None:
+        strip = not use_mwparserfromhell
+
     if remove_disabled_parts:
         text = removeDisabledParts(text)
     if use_mwparserfromhell:
-        return extract_templates_and_params_mwpfh(text)
+        return extract_templates_and_params_mwpfh(text, strip)
     else:
-        return extract_templates_and_params_regex(text, False)
+        return extract_templates_and_params_regex(text, False, strip)
-def extract_templates_and_params_mwpfh(text):
+def extract_templates_and_params_mwpfh(text, strip=False):
     """
     Extract templates with params using mwparserfromhell.
@@ -1321,15 +1335,29 @@
     """
     code = mwparserfromhell.parse(text)
     result = []
+
     for template in code.filter_templates(recursive=True):
         params = OrderedDict()
         for param in template.params:
-            params[unicode(param.name)] = unicode(param.value)
+            if strip:
+                implicit_parameter = not param.showkey
+                key = param.name.strip()
+                if not implicit_parameter:
+                    value = param.value.strip()
+                else:
+                    value = UnicodeType(param.value)
+            else:
+                key = UnicodeType(param.name)
+                value = UnicodeType(param.value)
+
+            params[key] = value
+
         result.append((unicode(template.name.strip()), params))
     return result
-def extract_templates_and_params_regex(text, remove_disabled_parts=True):
+def extract_templates_and_params_regex(text, remove_disabled_parts=True,
+                                       strip=True):
     """
     Extract templates with params using a regex with additional processing.
@@ -1473,10 +1501,12 @@
                 for param in markedParams:
                     if "=" in param:
                         param_name, param_val = param.split("=", 1)
+                        implicit_parameter = False
                     else:
                         param_name = unicode(numbered_param)
                         param_val = param
                         numbered_param += 1
+                        implicit_parameter = True
                     count = len(inside)
                     for m2 in Rmarker1.finditer(param_val):
                         param_val = param_val.replace(m2.group(),
@@ -1490,7 +1520,11 @@
                     for m2 in Rmarker4.finditer(param_val):
                         param_val = param_val.replace(m2.group(),
                                                       values[int(m2.group(1))])
-                    params[param_name.strip()] = param_val.strip()
+                    if strip:
+                        param_name = param_name.strip()
+                        if not implicit_parameter:
+                            param_val = param_val.strip()
+                    params[param_name] = param_val
             # Special case for {{a|}} which has an undetected parameter
             if not params and '|' in m.group(0):
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 0b0205e..5e84526 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -324,9 +324,6 @@
                          [('a', OrderedDict((('b', 'c'), ))),
                           ('d', OrderedDict((('e', 'f'), )))])
-        self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
-                         [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))])
-
         # initial '{' and '}' should be ignored as outer wikitext
         self.assertEqual(func('{{{a|b}}X}'),
                          [('a', OrderedDict((('1', 'b'), )))])
@@ -335,13 +332,52 @@
         self.assertEqual(func('{{a'), [])
         self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
-    def _etp_regex_differs(self, func):
-        """Common cases not handled the same by ETP_REGEX."""
+    def _unstripped(self, func):
+        """Common cases of unstripped results."""
+        self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
+                         [('a', OrderedDict((('b', '<!--{{{1}}}-->'), )))])
+
+        self.assertEqual(func('{{a|  }}'), [('a', OrderedDict((('1', '  '), )))])
+        self.assertEqual(func('{{a| | }}'), [('a', OrderedDict((('1', ' '), ('2', ' '))))])
+        self.assertEqual(func('{{a| =|}}'), [('a', OrderedDict(((' ', ''), ('1', ''))))])
+
         self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict(((' b', 'c'), )))])
         self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b ', 'c'), )))])
         self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', ' c'), )))])
         self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c '), )))])
+        self.assertEqual(func('{{a| foo |2= bar }}'),
+                         [('a', OrderedDict((('1', ' foo '), ('2', ' bar '))))])
+
+        # The correct entry 'bar' is removed
+        self.assertEqual(func('{{a| foo |2= bar | baz }}'),
+                         [('a', OrderedDict((('1', ' foo '), ('2', ' baz '))))])
+        # However whitespace prevents the correct item from being removed
+        self.assertEqual(func('{{a| foo | 2 = bar | baz }}'),
+                         [('a', OrderedDict((('1', ' foo '), (' 2 ', ' bar '), ('2', ' baz '))))])
+
+    def _stripped(self, func):
+        """Common cases of stripped results."""
+        self.assertEqual(func('{{a|  }}'), [('a', OrderedDict((('1', '  '), )))])
+        self.assertEqual(func('{{a| | }}'), [('a', OrderedDict((('1', ' '), ('2', ' '))))])
+        self.assertEqual(func('{{a| =|}}'), [('a', OrderedDict((('', ''), ('1', ''))))])
+
+        self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict((('b', 'c'), )))])
+        self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b', 'c'), )))])
+        self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', 'c'), )))])
+        self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c'), )))])
+
+        self.assertEqual(func('{{a| foo |2= bar }}'),
+                         [('a', OrderedDict((('1', ' foo '), ('2', 'bar'))))])
+
+        # 'bar' is always removed
+        self.assertEqual(func('{{a| foo |2= bar | baz }}'),
+                         [('a', OrderedDict((('1', ' foo '), ('2', ' baz '))))])
+        self.assertEqual(func('{{a| foo | 2 = bar | baz }}'),
+                         [('a', OrderedDict((('1', ' foo '), ('2', ' baz '))))])
+
+    def _etp_regex_differs(self, func):
+        """Common cases not handled the same by ETP_REGEX."""
         # inner {} should be treated as part of the value
         self.assertEqual(func('{{a|b={} }}'), [('a', OrderedDict((('b', '{} '), )))])
@@ -367,6 +403,7 @@
         func = textlib.extract_templates_and_params_mwpfh
         self._common_results(func)
         self._order_differs(func)
+        self._unstripped(func)
         self._etp_regex_differs(func)
         self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
@@ -381,21 +418,34 @@
                                ('d', OrderedDict([('1', '')]))
                                ])
+    @require_modules('mwparserfromhell')
+    def test_extract_templates_params_mwpfh_stripped(self):
+        """Test using mwparserfromhell with stripping."""
+        func = functools.partial(textlib.extract_templates_and_params_mwpfh,
+                                 strip=True)
+
+        self._common_results(func)
+        self._order_differs(func)
+        self._stripped(func)
+
     def test_extract_templates_params_regex(self):
         """Test using many complex regexes."""
         func = functools.partial(textlib.extract_templates_and_params_regex,
-                                 remove_disabled_parts=False)
+                                 remove_disabled_parts=False, strip=False)
         self._common_results(func)
         self._order_differs(func)
+        self._unstripped(func)
         self.assertEqual(func('{{a|b={} }}'), [])  # FIXME: {} is normal text
-        self.assertEqual(func('{{a| b=c}}'), [('a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{a|b =c}}'), [('a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{a|b= c}}'), [('a', OrderedDict((('b', 'c'), )))])
-        self.assertEqual(func('{{a|b=c }}'), [('a', OrderedDict((('b', 'c'), )))])
-
+    def test_extract_templates_params_regex_stripped(self):
+        """Test using many complex regexes with stripping."""
         func = textlib.extract_templates_and_params_regex
+
+        self._common_results(func)
+        self._order_differs(func)
+        self._stripped(func)
+
         self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
                          [('a', OrderedDict((('b', ''), )))])
@@ -415,9 +465,16 @@
     def test_extract_templates_params(self):
         """Test that the normal entry point works."""
-        self._common_results(
-            functools.partial(textlib.extract_templates_and_params,
-                              remove_disabled_parts=False))
+        func = functools.partial(textlib.extract_templates_and_params,
+                                 remove_disabled_parts=False, strip=False)
+
+        self._common_results(func)
+        self._unstripped(func)
+
+        func = functools.partial(textlib.extract_templates_and_params,
+                                 remove_disabled_parts=False, strip=True)
+        self._common_results(func)
+        self._stripped(func)
     def test_template_simple_regex(self):
         """Test using simple regex."""
@@ -569,12 +626,14 @@
     def extract_mwpfh(self, text, *args, **kwargs):
         """Patched call to extract_templates_and_params_mwpfh."""
         self._text = text
+        self._args = args
         self._mwpfh = True
     @PatchingTestCase.patched(textlib, 'extract_templates_and_params_regex')
     def extract_regex(self, text, *args, **kwargs):
         """Patched call to extract_templates_and_params_regex."""
         self._text = text
+        self._args = args
         self._mwpfh = False
     def test_removing_disabled_parts_regex(self):
@@ -604,6 +663,33 @@
         self.assertEqual(self._text, '{{a<!-- -->}}')
         self.assertTrue(self._mwpfh)
+    def test_strip_regex(self):
+        """Test stripping values when using the regex variant."""
+        self.patch(config, 'use_mwparserfromhell', False)
+        textlib.extract_templates_and_params('{{a| foo }}', False, True)
+        self.assertEqual(self._args, (False, True))
+        self.assertFalse(self._mwpfh)
+        textlib.extract_templates_and_params('{{a| foo }}', False, False)
+        self.assertEqual(self._args, (False, False))
+        self.assertFalse(self._mwpfh)
+        textlib.extract_templates_and_params('{{a| foo }}', False)
+        self.assertEqual(self._args, (False, True))
+        self.assertFalse(self._mwpfh)
+
+    @require_modules('mwparserfromhell')
+    def test_strip_mwpfh(self):
+        """Test stripping values when using the mwpfh variant."""
+        self.patch(config, 'use_mwparserfromhell', True)
+        textlib.extract_templates_and_params('{{a| foo }}', None, True)
+        self.assertEqual(self._args, (True, ))
+        self.assertTrue(self._mwpfh)
+        textlib.extract_templates_and_params('{{a| foo }}', None, False)
+        self.assertEqual(self._args, (False, ))
+        self.assertTrue(self._mwpfh)
+        textlib.extract_templates_and_params('{{a| foo }}')
+        self.assertEqual(self._args, (False, ))
+        self.assertTrue(self._mwpfh)
+
 class TestReplaceLinks(TestCase):
-- 
To view, visit https://gerrit.wikimedia.org/r/234832
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic77f2115c5727ea8f573ec269886769e1c6b4333
Gerrit-PatchSet: 11
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: John Vandenberg jayvdb@gmail.com
Gerrit-Reviewer: Ladsgroup ladsgroup@gmail.com
Gerrit-Reviewer: XZise CommodoreFabianus@gmx.de
Gerrit-Reviewer: jenkins-bot <>


    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

[Pywikibot-commits] [Gerrit] Add extract_templates_and_params strip - change (pywikibot/core)