jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/675351 )
Change subject: [IMPR] Enable extract_templates_and_params with wikitextparser package ......................................................................
[IMPR] Enable extract_templates_and_params with wikitextparser package
- rename extract_templates_and_params_mwpfh to _extract_templates_and_params_parser and check for the currently used wikitext parser. All different package behaviours ar handled there. Test for ImportError instead of generic Exception. - deprecate extract_templates_and_params_mwpfh but force to take mwparserfromhell as wikitextparser - the fallback path is: wikitextparser, mwparserfromhell, _ETP_REGEX - update documentation - add OrderedDict and nullcontext to backports.py - update tests accordingly
Change-Id: Ibf4472b309672e7189042875c460b3665ec810ed --- M pywikibot/backports.py M pywikibot/textlib.py M scripts/i18n M tests/textlib_tests.py 4 files changed, 211 insertions(+), 101 deletions(-)
Approvals: JJMC89: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/backports.py b/pywikibot/backports.py index 487447a..32e2666 100644 --- a/pywikibot/backports.py +++ b/pywikibot/backports.py @@ -14,6 +14,25 @@ cache = _lru_cache(None)
+# context +if PYTHON_VERSION < (3, 7): + + class nullcontext: # noqa: N801 + + """Dummy context manager for Python 3.5/3.6 that does nothing.""" + + def __init__(self, result=None): # noqa: D107 + self.result = result + + def __enter__(self): + return self.result + + def __exit__(self, *args): + pass +else: + from contextlib import nullcontext + + # typing if PYTHON_VERSION < (3, 5, 2): from typing import Dict as DefaultDict @@ -22,6 +41,15 @@ else: from collections import defaultdict as DefaultDict # noqa: N812
+ +if PYTHON_VERSION < (3, 7, 2): + from typing import Dict as OrderedDict +elif PYTHON_VERSION < (3, 9): + from typing import OrderedDict +else: + from collections import OrderedDict + + if PYTHON_VERSION >= (3, 9): from collections.abc import Iterable, Sequence Dict = dict diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 204bd68..75ae916 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -22,15 +22,22 @@ import pywikibot
from pywikibot.backports import List, Tuple +from pywikibot.backports import OrderedDict as OrderedDictType from pywikibot.exceptions import InvalidTitle, SiteDefinitionError from pywikibot.family import Family -from pywikibot.tools import deprecate_arg, issue_deprecation_warning +from pywikibot.tools import ( + deprecated, deprecate_arg, issue_deprecation_warning, +)
try: - import mwparserfromhell -except ImportError as e: - mwparserfromhell = e + import wikitextparser +except ImportError: + try: + import mwparserfromhell as wikitextparser + except ImportError as e: + wikitextparser = e
+ETPType = List[Tuple[str, OrderedDictType[str, str]]]
# cache for replaceExcept to avoid recompile or regexes each call _regex_cache = {} @@ -1566,8 +1573,9 @@ # Functions dealing with templates # --------------------------------
-def extract_templates_and_params(text: str, remove_disabled_parts=None, - strip: Optional[bool] = None): +def extract_templates_and_params(text: str, + remove_disabled_parts: Optional[bool] = None, + strip: Optional[bool] = None) -> ETPType: """Return a list of templates found in text.
Return value is a list of tuples. There is one tuple for each use of a @@ -1596,16 +1604,15 @@
@param text: The wikitext from which templates are extracted @param remove_disabled_parts: Remove disabled wikitext such as comments - and pre. If None (default), this is enabled when mwparserfromhell - is not available and disabled if mwparserfromhell is present. - @type remove_disabled_parts: bool or None + and pre. If None (default), this is enabled when neither + mwparserfromhell not wikitextparser package is available and + disabled otherwise. @param strip: if enabled, strip arguments and values of templates. - If None (default), this is enabled when mwparserfromhell - is not available and disabled if mwparserfromhell is present. + If None (default), this is enabled when neither mwparserfromhell + nor wikitextparser package is available and disabled otherwise. @return: list of template name and params - @rtype: list of tuple """ - use_regex = isinstance(mwparserfromhell, Exception) + use_regex = isinstance(wikitextparser, ImportError)
if remove_disabled_parts is None: remove_disabled_parts = use_regex @@ -1617,41 +1624,58 @@
if use_regex: return extract_templates_and_params_regex(text, False, strip) - return extract_templates_and_params_mwpfh(text, strip) + return _extract_templates_and_params_parser(text, strip)
-def extract_templates_and_params_mwpfh(text: str, strip=False): +def _extract_templates_and_params_parser(text: str, + strip: bool = False) -> ETPType: """ Extract templates with params using mwparserfromhell.
This function should not be called directly.
- Use extract_templates_and_params, which will select this - mwparserfromhell implementation if based on whether the - mwparserfromhell package is installed. + Use extract_templates_and_params, which will select this parser + implementation if the mwparserfromhell or wikitextparser package is + installed.
@param text: The wikitext from which templates are extracted + @param strip: if enabled, strip arguments and values of templates @return: list of template name and params - @rtype: list of tuple """ - code = mwparserfromhell.parse(text) - result = [] + def explicit(param): + try: + attr = param.showkey + except AttributeError: + attr = not param.positional + return attr
- for template in code.ifilter_templates( + parser_name = wikitextparser.__name__ + pywikibot.log('Using {!r} wikitext parser'.format(parser_name)) + + result = [] + parsed = wikitextparser.parse(text) + if parser_name == 'wikitextparser': + templates = parsed.templates + arguments = 'arguments' + else: + templates = parsed.ifilter_templates( matches=lambda x: not x.name.lstrip().startswith('#'), - recursive=True): + recursive=True) + arguments = 'params' + + for template in templates: params = OrderedDict() - for param in template.params: + for param in getattr(template, arguments): + value = str(param.value) # mwpfh needs upcast to str + if strip: - implicit_parameter = not param.showkey key = param.name.strip() - if not implicit_parameter: + if explicit(param): value = param.value.strip() else: value = str(param.value) else: key = str(param.name) - value = str(param.value)
params[key] = value
@@ -1659,9 +1683,22 @@ return result
+@deprecated('extract_templates_and_params', since='20210329', + future_warning=True) +def extract_templates_and_params_mwpfh(text: str, + strip: bool = False) -> ETPType: + """Extract templates with params using mwparserfromhell.""" + global wikitextparser + saved_parser = wikitextparser + import mwparserfromhell as wikitextparser + result = _extract_templates_and_params_parser(text, strip) + wikitextparser = saved_parser + return result + + def extract_templates_and_params_regex(text: str, remove_disabled_parts: bool = True, - strip: bool = True): + strip: bool = True) -> ETPType: """ Extract templates with params using a regex with additional processing.
@@ -1672,8 +1709,8 @@ is not used.
@param text: The wikitext from which templates are extracted + @param strip: if enabled, strip arguments and values of templates @return: list of template name and params - @rtype: list of tuple """ # remove commented-out stuff etc. if remove_disabled_parts: diff --git a/scripts/i18n b/scripts/i18n index 362e42e..4591fe6 160000 --- a/scripts/i18n +++ b/scripts/i18n @@ -1 +1 @@ -Subproject commit 362e42e36ae45005e8019ae4107eac2adb736a55 +Subproject commit 4591fe605f89e29e14f730f1416542acfea135bd diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index 139cb99..8b537b3 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -16,8 +16,10 @@ import pywikibot import pywikibot.textlib as textlib
+from pywikibot.backports import nullcontext from pywikibot.site._interwikimap import _IWEntry from pywikibot.textlib import _MultiTemplateMatchBuilder, extract_sections +from pywikibot.tools import suppress_warnings from pywikibot import UnknownSite
from tests.aspects import ( @@ -300,6 +302,10 @@ 'Invalid category title extracted: nasty{{{!}}')
+WARNING_MSG = (r'.*extract_templates_and_params_mwpfh .*' + r'is deprecated for .*; use extract_templates_and_params') + + class TestTemplateParams(TestCase):
"""Test to verify that template params extraction works.""" @@ -308,67 +314,61 @@
def _common_results(self, func): """Common cases.""" - self.assertEqual(func('{{a}}'), [('a', OrderedDict())]) - self.assertEqual(func('{{ a}}'), [('a', OrderedDict())]) - self.assertEqual(func('{{a }}'), [('a', OrderedDict())]) - self.assertEqual(func('{{ a }}'), [('a', OrderedDict())]) + with suppress_warnings(WARNING_MSG, category=FutureWarning): + self.assertEqual(func('{{a}}'), [('a', OrderedDict())])
- self.assertEqual(func('{{a|b=c}}'), - [('a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{a|b|c=d}}'), - [('a', OrderedDict((('1', 'b'), ('c', 'd'))))]) - self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'), - [('a', OrderedDict((('b', 'c'), ('f', 'g'), - ('d', 'e'), ('1', ''))))]) - self.assertEqual(func('{{a|1=2|c=d}}'), - [('a', OrderedDict((('1', '2'), ('c', 'd'))))]) - self.assertEqual(func('{{a|c=d|1=2}}'), - [('a', OrderedDict((('c', 'd'), ('1', '2'))))]) - self.assertEqual(func('{{a|5=d|a=b}}'), - [('a', OrderedDict((('5', 'd'), ('a', 'b'))))]) - self.assertEqual(func('{{a|=2}}'), - [('a', OrderedDict((('', '2'), )))]) + with suppress_warnings(WARNING_MSG, category=FutureWarning): + self.assertEqual(func('{{ a}}'), [('a', OrderedDict())]) + self.assertEqual(func('{{a }}'), [('a', OrderedDict())]) + self.assertEqual(func('{{ a }}'), [('a', OrderedDict())]) + self.assertEqual(func('{{a|b=c}}'), + [('a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b|c=d}}'), + [('a', OrderedDict((('1', 'b'), ('c', 'd'))))]) + self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'), + [('a', OrderedDict((('b', 'c'), ('f', 'g'), + ('d', 'e'), ('1', ''))))]) + self.assertEqual(func('{{a|1=2|c=d}}'), + [('a', OrderedDict((('1', '2'), ('c', 'd'))))]) + self.assertEqual(func('{{a|c=d|1=2}}'), + [('a', OrderedDict((('c', 'd'), ('1', '2'))))]) + self.assertEqual(func('{{a|5=d|a=b}}'), + [('a', OrderedDict((('5', 'd'), ('a', 'b'))))]) + self.assertEqual(func('{{a|=2}}'), + [('a', OrderedDict((('', '2'), )))]) + self.assertEqual(func('{{a|}}'), + [('a', OrderedDict((('1', ''), )))]) + self.assertEqual(func('{{a|=|}}'), + [('a', OrderedDict((('', ''), ('1', ''))))]) + self.assertEqual(func('{{a||}}'), + [('a', OrderedDict((('1', ''), ('2', ''))))]) + self.assertEqual(func('{{a|b={{{1}}}}}'), + [('a', OrderedDict((('b', '{{{1}}}'), )))]) + self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'), + [('a', + OrderedDict((('b', + '<noinclude>{{{1}}}</noinclude>'), + )))]) + self.assertEqual(func('{{Template:a|b=c}}'), + [('Template:a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{template:a|b=c}}'), + [('template:a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{:a|b=c}}'), + [(':a', OrderedDict((('b', 'c'), )))]) + self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'), + [('a', OrderedDict((('b', '{{{1}}}'), + ('c', '{{{2}}}'))))]) + self.assertEqual(func('{{a|b=c}}{{d|e=f}}'), + [('a', OrderedDict((('b', 'c'), ))), + ('d', OrderedDict((('e', 'f'), )))])
- self.assertEqual(func('{{a|}}'), [('a', OrderedDict((('1', ''), )))]) - self.assertEqual(func('{{a|=|}}'), - [('a', OrderedDict((('', ''), ('1', ''))))]) - self.assertEqual(func('{{a||}}'), - [('a', OrderedDict((('1', ''), ('2', ''))))]) + # initial '{' and '}' should be ignored as outer wikitext + self.assertEqual(func('{{{a|b}}X}'), + [('a', OrderedDict((('1', 'b'), )))])
- self.assertEqual(func('{{a|b={{{1}}}}}'), - [('a', OrderedDict((('b', '{{{1}}}'), )))]) - self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'), - [('a', OrderedDict( - (('b', '<noinclude>{{{1}}}</noinclude>'), )))]) - self.assertEqual(func('{{subst:a|b=c}}'), - [('subst:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{safesubst:a|b=c}}'), - [('safesubst:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{msgnw:a|b=c}}'), - [('msgnw:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{Template:a|b=c}}'), - [('Template:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{template:a|b=c}}'), - [('template:a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{:a|b=c}}'), - [(':a', OrderedDict((('b', 'c'), )))]) - self.assertEqual(func('{{subst::a|b=c}}'), - [('subst::a', OrderedDict((('b', 'c'), )))]) - - self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'), - [('a', OrderedDict((('b', '{{{1}}}'), - ('c', '{{{2}}}'))))]) - self.assertEqual(func('{{a|b=c}}{{d|e=f}}'), - [('a', OrderedDict((('b', 'c'), ))), - ('d', OrderedDict((('e', 'f'), )))]) - - # initial '{' and '}' should be ignored as outer wikitext - self.assertEqual(func('{{{a|b}}X}'), - [('a', OrderedDict((('1', 'b'), )))]) - - # sf.net bug 1575: unclosed template - self.assertEqual(func('{{a'), []) - self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())]) + # sf.net bug 1575: unclosed template + self.assertEqual(func('{{a'), []) + self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
def _unstripped(self, func): """Common cases of unstripped results.""" @@ -455,14 +455,69 @@ ('2', 'd')])), ('b', OrderedDict([('1', 'c')]))])
+ def _mwpfh_passes(self, func, failing=False): + """Common cases failing with wikitextparser but passes with mwpfh. + + Probably the behaviour of regex or mwpfh is wrong. + """ + patterns = [ + '{{subst:a|b=c}}', + '{{safesubst:a|b=c}}', + '{{msgnw:a|b=c}}', + '{{subst::a|b=c}}' + ] + context = self.assertRaises(AssertionError) \ + if failing else nullcontext() + + for template in patterns: + with self.subTest(template=template, failing=failing): + name = template.strip('{}').split('|')[0] + with context: + self.assertEqual(func(template), + [(name, OrderedDict((('b', 'c'), )))]) + @require_modules('mwparserfromhell') def test_extract_templates_params_mwpfh(self): """Test using mwparserfromhell.""" func = textlib.extract_templates_and_params_mwpfh + with suppress_warnings(WARNING_MSG, category=FutureWarning): + self._common_results(func) + self._order_differs(func) + self._unstripped(func) + self._etp_regex_differs(func) + self._mwpfh_passes(func) + + self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), + [('c', OrderedDict((('1', '{{d}}'), ))), + ('a', OrderedDict([('1', '{{c|{{d}}}}')])), + ('d', OrderedDict()) + ]) + + self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'), + [('c', OrderedDict((('1', '{{d|}}'), ))), + ('a', OrderedDict([('1', '{{c|{{d|}}}}')])), + ('d', OrderedDict([('1', '')])) + ]) + + @require_modules('mwparserfromhell') + def test_extract_templates_params_parser_stripped(self): + """Test using mwparserfromhell with stripping.""" + func = functools.partial(textlib._extract_templates_and_params_parser, + strip=True) + + self._common_results(func) + self._order_differs(func) + self._stripped(func) + + @require_modules('wikitextparser') + def test_extract_templates_params_parser(self): + """Test using wikitextparser.""" + func = textlib.extract_templates_and_params self._common_results(func) self._order_differs(func) self._unstripped(func) self._etp_regex_differs(func) + self._mwpfh_passes(func, failing=True)
self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), [('c', OrderedDict((('1', '{{d}}'), ))), @@ -476,16 +531,6 @@ ('d', OrderedDict([('1', '')])) ])
- @require_modules('mwparserfromhell') - def test_extract_templates_params_mwpfh_stripped(self): - """Test using mwparserfromhell with stripping.""" - func = functools.partial(textlib.extract_templates_and_params_mwpfh, - strip=True) - - self._common_results(func) - self._order_differs(func) - self._stripped(func) - def test_extract_templates_params_regex(self): """Test using many complex regexes.""" func = functools.partial(textlib.extract_templates_and_params_regex, @@ -660,7 +705,7 @@
net = False
- @PatchingTestCase.patched(textlib, 'extract_templates_and_params_mwpfh') + @PatchingTestCase.patched(textlib, '_extract_templates_and_params_parser') def extract_mwpfh(self, text, *args, **kwargs): """Patched call to extract_templates_and_params_mwpfh.""" self._text = text @@ -676,7 +721,7 @@
def test_removing_disabled_parts_regex(self): """Test removing disabled parts when using the regex variant.""" - self.patch(textlib, 'mwparserfromhell', Exception()) + self.patch(textlib, 'wikitextparser', ImportError()) textlib.extract_templates_and_params('{{a<!-- -->}}', True) self.assertEqual(self._text, '{{a}}') self.assertFalse(self._mwpfh) @@ -702,7 +747,7 @@
def test_strip_regex(self): """Test stripping values when using the regex variant.""" - self.patch(textlib, 'mwparserfromhell', Exception()) + self.patch(textlib, 'wikitextparser', ImportError()) textlib.extract_templates_and_params('{{a| foo }}', False, True) self.assertEqual(self._args, (False, True)) self.assertFalse(self._mwpfh)
pywikibot-commits@lists.wikimedia.org