jenkins-bot submitted this change.

View Change

Approvals: JJMC89: Looks good to me, approved jenkins-bot: Verified
[IMPR] Enable extract_templates_and_params with wikitextparser package

- rename extract_templates_and_params_mwpfh to
_extract_templates_and_params_parser and check for the currently used
wikitext parser. All different package behaviours ar handled there.
Test for ImportError instead of generic Exception.
- deprecate extract_templates_and_params_mwpfh but force to take
mwparserfromhell as wikitextparser
- the fallback path is: wikitextparser, mwparserfromhell, _ETP_REGEX
- update documentation
- add OrderedDict and nullcontext to backports.py
- update tests accordingly

Change-Id: Ibf4472b309672e7189042875c460b3665ec810ed
---
M pywikibot/backports.py
M pywikibot/textlib.py
M scripts/i18n
M tests/textlib_tests.py
4 files changed, 211 insertions(+), 101 deletions(-)

diff --git a/pywikibot/backports.py b/pywikibot/backports.py
index 487447a..32e2666 100644
--- a/pywikibot/backports.py
+++ b/pywikibot/backports.py
@@ -14,6 +14,25 @@
cache = _lru_cache(None)


+# context
+if PYTHON_VERSION < (3, 7):
+
+ class nullcontext: # noqa: N801
+
+ """Dummy context manager for Python 3.5/3.6 that does nothing."""
+
+ def __init__(self, result=None): # noqa: D107
+ self.result = result
+
+ def __enter__(self):
+ return self.result
+
+ def __exit__(self, *args):
+ pass
+else:
+ from contextlib import nullcontext
+
+
# typing
if PYTHON_VERSION < (3, 5, 2):
from typing import Dict as DefaultDict
@@ -22,6 +41,15 @@
else:
from collections import defaultdict as DefaultDict # noqa: N812

+
+if PYTHON_VERSION < (3, 7, 2):
+ from typing import Dict as OrderedDict
+elif PYTHON_VERSION < (3, 9):
+ from typing import OrderedDict
+else:
+ from collections import OrderedDict
+
+
if PYTHON_VERSION >= (3, 9):
from collections.abc import Iterable, Sequence
Dict = dict
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 204bd68..75ae916 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -22,15 +22,22 @@
import pywikibot

from pywikibot.backports import List, Tuple
+from pywikibot.backports import OrderedDict as OrderedDictType
from pywikibot.exceptions import InvalidTitle, SiteDefinitionError
from pywikibot.family import Family
-from pywikibot.tools import deprecate_arg, issue_deprecation_warning
+from pywikibot.tools import (
+ deprecated, deprecate_arg, issue_deprecation_warning,
+)

try:
- import mwparserfromhell
-except ImportError as e:
- mwparserfromhell = e
+ import wikitextparser
+except ImportError:
+ try:
+ import mwparserfromhell as wikitextparser
+ except ImportError as e:
+ wikitextparser = e

+ETPType = List[Tuple[str, OrderedDictType[str, str]]]

# cache for replaceExcept to avoid recompile or regexes each call
_regex_cache = {}
@@ -1566,8 +1573,9 @@
# Functions dealing with templates
# --------------------------------

-def extract_templates_and_params(text: str, remove_disabled_parts=None,
- strip: Optional[bool] = None):
+def extract_templates_and_params(text: str,
+ remove_disabled_parts: Optional[bool] = None,
+ strip: Optional[bool] = None) -> ETPType:
"""Return a list of templates found in text.

Return value is a list of tuples. There is one tuple for each use of a
@@ -1596,16 +1604,15 @@

@param text: The wikitext from which templates are extracted
@param remove_disabled_parts: Remove disabled wikitext such as comments
- and pre. If None (default), this is enabled when mwparserfromhell
- is not available and disabled if mwparserfromhell is present.
- @type remove_disabled_parts: bool or None
+ and pre. If None (default), this is enabled when neither
+ mwparserfromhell not wikitextparser package is available and
+ disabled otherwise.
@param strip: if enabled, strip arguments and values of templates.
- If None (default), this is enabled when mwparserfromhell
- is not available and disabled if mwparserfromhell is present.
+ If None (default), this is enabled when neither mwparserfromhell
+ nor wikitextparser package is available and disabled otherwise.
@return: list of template name and params
- @rtype: list of tuple
"""
- use_regex = isinstance(mwparserfromhell, Exception)
+ use_regex = isinstance(wikitextparser, ImportError)

if remove_disabled_parts is None:
remove_disabled_parts = use_regex
@@ -1617,41 +1624,58 @@

if use_regex:
return extract_templates_and_params_regex(text, False, strip)
- return extract_templates_and_params_mwpfh(text, strip)
+ return _extract_templates_and_params_parser(text, strip)


-def extract_templates_and_params_mwpfh(text: str, strip=False):
+def _extract_templates_and_params_parser(text: str,
+ strip: bool = False) -> ETPType:
"""
Extract templates with params using mwparserfromhell.

This function should not be called directly.

- Use extract_templates_and_params, which will select this
- mwparserfromhell implementation if based on whether the
- mwparserfromhell package is installed.
+ Use extract_templates_and_params, which will select this parser
+ implementation if the mwparserfromhell or wikitextparser package is
+ installed.

@param text: The wikitext from which templates are extracted
+ @param strip: if enabled, strip arguments and values of templates
@return: list of template name and params
- @rtype: list of tuple
"""
- code = mwparserfromhell.parse(text)
- result = []
+ def explicit(param):
+ try:
+ attr = param.showkey
+ except AttributeError:
+ attr = not param.positional
+ return attr

- for template in code.ifilter_templates(
+ parser_name = wikitextparser.__name__
+ pywikibot.log('Using {!r} wikitext parser'.format(parser_name))
+
+ result = []
+ parsed = wikitextparser.parse(text)
+ if parser_name == 'wikitextparser':
+ templates = parsed.templates
+ arguments = 'arguments'
+ else:
+ templates = parsed.ifilter_templates(
matches=lambda x: not x.name.lstrip().startswith('#'),
- recursive=True):
+ recursive=True)
+ arguments = 'params'
+
+ for template in templates:
params = OrderedDict()
- for param in template.params:
+ for param in getattr(template, arguments):
+ value = str(param.value) # mwpfh needs upcast to str
+
if strip:
- implicit_parameter = not param.showkey
key = param.name.strip()
- if not implicit_parameter:
+ if explicit(param):
value = param.value.strip()
else:
value = str(param.value)
else:
key = str(param.name)
- value = str(param.value)

params[key] = value

@@ -1659,9 +1683,22 @@
return result


+@deprecated('extract_templates_and_params', since='20210329',
+ future_warning=True)
+def extract_templates_and_params_mwpfh(text: str,
+ strip: bool = False) -> ETPType:
+ """Extract templates with params using mwparserfromhell."""
+ global wikitextparser
+ saved_parser = wikitextparser
+ import mwparserfromhell as wikitextparser
+ result = _extract_templates_and_params_parser(text, strip)
+ wikitextparser = saved_parser
+ return result
+
+
def extract_templates_and_params_regex(text: str,
remove_disabled_parts: bool = True,
- strip: bool = True):
+ strip: bool = True) -> ETPType:
"""
Extract templates with params using a regex with additional processing.

@@ -1672,8 +1709,8 @@
is not used.

@param text: The wikitext from which templates are extracted
+ @param strip: if enabled, strip arguments and values of templates
@return: list of template name and params
- @rtype: list of tuple
"""
# remove commented-out stuff etc.
if remove_disabled_parts:
diff --git a/scripts/i18n b/scripts/i18n
index 362e42e..4591fe6 160000
--- a/scripts/i18n
+++ b/scripts/i18n
@@ -1 +1 @@
-Subproject commit 362e42e36ae45005e8019ae4107eac2adb736a55
+Subproject commit 4591fe605f89e29e14f730f1416542acfea135bd
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index 139cb99..8b537b3 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -16,8 +16,10 @@
import pywikibot
import pywikibot.textlib as textlib

+from pywikibot.backports import nullcontext
from pywikibot.site._interwikimap import _IWEntry
from pywikibot.textlib import _MultiTemplateMatchBuilder, extract_sections
+from pywikibot.tools import suppress_warnings
from pywikibot import UnknownSite

from tests.aspects import (
@@ -300,6 +302,10 @@
'Invalid category title extracted: nasty{{{!}}')


+WARNING_MSG = (r'.*extract_templates_and_params_mwpfh .*'
+ r'is deprecated for .*; use extract_templates_and_params')
+
+
class TestTemplateParams(TestCase):

"""Test to verify that template params extraction works."""
@@ -308,67 +314,61 @@

def _common_results(self, func):
"""Common cases."""
- self.assertEqual(func('{{a}}'), [('a', OrderedDict())])
- self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
- self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
- self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self.assertEqual(func('{{a}}'), [('a', OrderedDict())])

- self.assertEqual(func('{{a|b=c}}'),
- [('a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{a|b|c=d}}'),
- [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
- self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
- [('a', OrderedDict((('b', 'c'), ('f', 'g'),
- ('d', 'e'), ('1', ''))))])
- self.assertEqual(func('{{a|1=2|c=d}}'),
- [('a', OrderedDict((('1', '2'), ('c', 'd'))))])
- self.assertEqual(func('{{a|c=d|1=2}}'),
- [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
- self.assertEqual(func('{{a|5=d|a=b}}'),
- [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
- self.assertEqual(func('{{a|=2}}'),
- [('a', OrderedDict((('', '2'), )))])
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self.assertEqual(func('{{ a}}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{a }}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{ a }}'), [('a', OrderedDict())])
+ self.assertEqual(func('{{a|b=c}}'),
+ [('a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{a|b|c=d}}'),
+ [('a', OrderedDict((('1', 'b'), ('c', 'd'))))])
+ self.assertEqual(func('{{a|b=c|f=g|d=e|1=}}'),
+ [('a', OrderedDict((('b', 'c'), ('f', 'g'),
+ ('d', 'e'), ('1', ''))))])
+ self.assertEqual(func('{{a|1=2|c=d}}'),
+ [('a', OrderedDict((('1', '2'), ('c', 'd'))))])
+ self.assertEqual(func('{{a|c=d|1=2}}'),
+ [('a', OrderedDict((('c', 'd'), ('1', '2'))))])
+ self.assertEqual(func('{{a|5=d|a=b}}'),
+ [('a', OrderedDict((('5', 'd'), ('a', 'b'))))])
+ self.assertEqual(func('{{a|=2}}'),
+ [('a', OrderedDict((('', '2'), )))])
+ self.assertEqual(func('{{a|}}'),
+ [('a', OrderedDict((('1', ''), )))])
+ self.assertEqual(func('{{a|=|}}'),
+ [('a', OrderedDict((('', ''), ('1', ''))))])
+ self.assertEqual(func('{{a||}}'),
+ [('a', OrderedDict((('1', ''), ('2', ''))))])
+ self.assertEqual(func('{{a|b={{{1}}}}}'),
+ [('a', OrderedDict((('b', '{{{1}}}'), )))])
+ self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
+ [('a',
+ OrderedDict((('b',
+ '<noinclude>{{{1}}}</noinclude>'),
+ )))])
+ self.assertEqual(func('{{Template:a|b=c}}'),
+ [('Template:a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{template:a|b=c}}'),
+ [('template:a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{:a|b=c}}'),
+ [(':a', OrderedDict((('b', 'c'), )))])
+ self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
+ [('a', OrderedDict((('b', '{{{1}}}'),
+ ('c', '{{{2}}}'))))])
+ self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
+ [('a', OrderedDict((('b', 'c'), ))),
+ ('d', OrderedDict((('e', 'f'), )))])

- self.assertEqual(func('{{a|}}'), [('a', OrderedDict((('1', ''), )))])
- self.assertEqual(func('{{a|=|}}'),
- [('a', OrderedDict((('', ''), ('1', ''))))])
- self.assertEqual(func('{{a||}}'),
- [('a', OrderedDict((('1', ''), ('2', ''))))])
+ # initial '{' and '}' should be ignored as outer wikitext
+ self.assertEqual(func('{{{a|b}}X}'),
+ [('a', OrderedDict((('1', 'b'), )))])

- self.assertEqual(func('{{a|b={{{1}}}}}'),
- [('a', OrderedDict((('b', '{{{1}}}'), )))])
- self.assertEqual(func('{{a|b=<noinclude>{{{1}}}</noinclude>}}'),
- [('a', OrderedDict(
- (('b', '<noinclude>{{{1}}}</noinclude>'), )))])
- self.assertEqual(func('{{subst:a|b=c}}'),
- [('subst:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{safesubst:a|b=c}}'),
- [('safesubst:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{msgnw:a|b=c}}'),
- [('msgnw:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{Template:a|b=c}}'),
- [('Template:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{template:a|b=c}}'),
- [('template:a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{:a|b=c}}'),
- [(':a', OrderedDict((('b', 'c'), )))])
- self.assertEqual(func('{{subst::a|b=c}}'),
- [('subst::a', OrderedDict((('b', 'c'), )))])
-
- self.assertEqual(func('{{a|b={{{1}}}|c={{{2}}}}}'),
- [('a', OrderedDict((('b', '{{{1}}}'),
- ('c', '{{{2}}}'))))])
- self.assertEqual(func('{{a|b=c}}{{d|e=f}}'),
- [('a', OrderedDict((('b', 'c'), ))),
- ('d', OrderedDict((('e', 'f'), )))])
-
- # initial '{' and '}' should be ignored as outer wikitext
- self.assertEqual(func('{{{a|b}}X}'),
- [('a', OrderedDict((('1', 'b'), )))])
-
- # sf.net bug 1575: unclosed template
- self.assertEqual(func('{{a'), [])
- self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])
+ # sf.net bug 1575: unclosed template
+ self.assertEqual(func('{{a'), [])
+ self.assertEqual(func('{{a}}{{foo|'), [('a', OrderedDict())])

def _unstripped(self, func):
"""Common cases of unstripped results."""
@@ -455,14 +455,69 @@
('2', 'd')])),
('b', OrderedDict([('1', 'c')]))])

+ def _mwpfh_passes(self, func, failing=False):
+ """Common cases failing with wikitextparser but passes with mwpfh.
+
+ Probably the behaviour of regex or mwpfh is wrong.
+ """
+ patterns = [
+ '{{subst:a|b=c}}',
+ '{{safesubst:a|b=c}}',
+ '{{msgnw:a|b=c}}',
+ '{{subst::a|b=c}}'
+ ]
+ context = self.assertRaises(AssertionError) \
+ if failing else nullcontext()
+
+ for template in patterns:
+ with self.subTest(template=template, failing=failing):
+ name = template.strip('{}').split('|')[0]
+ with context:
+ self.assertEqual(func(template),
+ [(name, OrderedDict((('b', 'c'), )))])
+
@require_modules('mwparserfromhell')
def test_extract_templates_params_mwpfh(self):
"""Test using mwparserfromhell."""
func = textlib.extract_templates_and_params_mwpfh
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self._common_results(func)
+ self._order_differs(func)
+ self._unstripped(func)
+ self._etp_regex_differs(func)
+ self._mwpfh_passes(func)
+
+ self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+ [('c', OrderedDict((('1', '{{d}}'), ))),
+ ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
+ ('d', OrderedDict())
+ ])
+
+ self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+ [('c', OrderedDict((('1', '{{d|}}'), ))),
+ ('a', OrderedDict([('1', '{{c|{{d|}}}}')])),
+ ('d', OrderedDict([('1', '')]))
+ ])
+
+ @require_modules('mwparserfromhell')
+ def test_extract_templates_params_parser_stripped(self):
+ """Test using mwparserfromhell with stripping."""
+ func = functools.partial(textlib._extract_templates_and_params_parser,
+ strip=True)
+
+ self._common_results(func)
+ self._order_differs(func)
+ self._stripped(func)
+
+ @require_modules('wikitextparser')
+ def test_extract_templates_params_parser(self):
+ """Test using wikitextparser."""
+ func = textlib.extract_templates_and_params
self._common_results(func)
self._order_differs(func)
self._unstripped(func)
self._etp_regex_differs(func)
+ self._mwpfh_passes(func, failing=True)

self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
[('c', OrderedDict((('1', '{{d}}'), ))),
@@ -476,16 +531,6 @@
('d', OrderedDict([('1', '')]))
])

- @require_modules('mwparserfromhell')
- def test_extract_templates_params_mwpfh_stripped(self):
- """Test using mwparserfromhell with stripping."""
- func = functools.partial(textlib.extract_templates_and_params_mwpfh,
- strip=True)
-
- self._common_results(func)
- self._order_differs(func)
- self._stripped(func)
-
def test_extract_templates_params_regex(self):
"""Test using many complex regexes."""
func = functools.partial(textlib.extract_templates_and_params_regex,
@@ -660,7 +705,7 @@

net = False

- @PatchingTestCase.patched(textlib, 'extract_templates_and_params_mwpfh')
+ @PatchingTestCase.patched(textlib, '_extract_templates_and_params_parser')
def extract_mwpfh(self, text, *args, **kwargs):
"""Patched call to extract_templates_and_params_mwpfh."""
self._text = text
@@ -676,7 +721,7 @@

def test_removing_disabled_parts_regex(self):
"""Test removing disabled parts when using the regex variant."""
- self.patch(textlib, 'mwparserfromhell', Exception())
+ self.patch(textlib, 'wikitextparser', ImportError())
textlib.extract_templates_and_params('{{a<!-- -->}}', True)
self.assertEqual(self._text, '{{a}}')
self.assertFalse(self._mwpfh)
@@ -702,7 +747,7 @@

def test_strip_regex(self):
"""Test stripping values when using the regex variant."""
- self.patch(textlib, 'mwparserfromhell', Exception())
+ self.patch(textlib, 'wikitextparser', ImportError())
textlib.extract_templates_and_params('{{a| foo }}', False, True)
self.assertEqual(self._args, (False, True))
self.assertFalse(self._mwpfh)

To view, visit change 675351. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ibf4472b309672e7189042875c460b3665ec810ed
Gerrit-Change-Number: 675351
Gerrit-PatchSet: 15
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged