jenkins-bot submitted this change.

View Change

Approvals:
  JJMC89: Looks good to me, approved
  jenkins-bot: Verified
[parser] Make mwparserfromhell or wikitextparser mandatory

- check whether mwparserfromhell or wikitextparser is installed and
raise ImportError if no package is available
- catch the ImportError in pwb wrapper like for other dependency checks
- deprecate extract_templates_and_params_regex
- update tests and remove TestGenericTemplateParams because dispatcher
was removed
- add wikitextparser to setup.py extra_deps
- update tox.ini
- update some documentation hints

Bug: T106763
Change-Id: Ie35c6399616d3fbe660b3387f16146d939fd51b7
---
M README.rst
M pwb.py
M pywikibot/textlib.py
M requirements.txt
M setup.py
M tests/__init__.py
M tests/textlib_tests.py
M tox.ini
8 files changed, 100 insertions(+), 165 deletions(-)

diff --git a/README.rst b/README.rst
index 6b1a91f..8872f7d 100644
--- a/README.rst
+++ b/README.rst
@@ -44,6 +44,7 @@

::

+ pip install requests
git clone https://gerrit.wikimedia.org/r/pywikibot/core.git
cd core
git submodule update --init
@@ -56,6 +57,18 @@
pip install -U setuptools
pip install pywikibot

+In addition, a MediaWiki markup parser is required. Please install one of the following:
+
+::
+
+ pip install mwparserfromhell
+
+or
+
+::
+
+ pip install wikitextparser
+
Our `installation
guide <https://www.mediawiki.org/wiki/Manual:Pywikibot/Installation>`_
has more details for advanced usage.
diff --git a/pwb.py b/pwb.py
index 80bbc7e..95abb09 100755
--- a/pwb.py
+++ b/pwb.py
@@ -14,7 +14,7 @@

python pwb.py -lang:de bot_tests -v
"""
-# (C) Pywikibot team, 2012-2020
+# (C) Pywikibot team, 2012-2021
#
# Distributed under the terms of the MIT license.
#
@@ -182,9 +182,8 @@
try:
if not check_modules():
raise RuntimeError('') # no further output needed
-except RuntimeError as e:
- print(e)
- sys.exit()
+except RuntimeError as e: # setup.py may also raise RuntimeError
+ sys.exit(e)

from pathlib import Path # noqa: E402

@@ -213,6 +212,8 @@
# we need to re-start the entire process. Ask the user to do so.
print('Now, you have to re-execute the command to start your script.')
sys.exit(1)
+except ImportError as e: # raised in textlib
+ sys.exit(e)


def find_alternates(filename, script_paths):
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 2ee62d1..8807069 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -35,8 +35,18 @@
except ImportError:
try:
import mwparserfromhell as wikitextparser
- except ImportError as e:
- wikitextparser = e
+ except ImportError:
+ # a plain ImportError is raised because pywikibot is not imported completely
+ raise ImportError("""
+Pywikibot is missing a MediaWiki markup parser which is necessary.
+Please update the required module with either
+
+ pip install "mwparserfromhell>=0.5.0"
+
+or
+
+ pip install "wikitextparser>=0.47.0"
+""") from None

ETPType = List[Tuple[str, OrderedDictType[str, str]]]

@@ -1570,8 +1580,8 @@
# --------------------------------

def extract_templates_and_params(text: str,
- remove_disabled_parts: Optional[bool] = None,
- strip: Optional[bool] = None) -> ETPType:
+ remove_disabled_parts: bool = False,
+ strip: bool = False) -> ETPType:
"""Return a list of templates found in text.

Return value is a list of tuples. There is one tuple for each use of a
@@ -1582,16 +1592,14 @@
parameters, and if this results multiple parameters with the same name
only the last value provided will be returned.

- This uses the package L{mwparserfromhell} (mwpfh) if it is installed.
- Otherwise it falls back on a regex based implementation.
+ This uses the package L{mwparserfromhell} or L{wikitextparser} as
+ MediaWiki markup parser. It is mandatory that one of them is
+ installed.

There are minor differences between the two implementations.

- The two implementations return nested templates in a different order.
- i.e. for {{a|b={{c}}}}, mwpfh returns [a, c], whereas regex returns [c, a].
-
- mwpfh preserves whitespace in parameter names and values. regex excludes
- anything between <!-- --> before parsing the text.
+ The parser packages preserve whitespace in parameter names and
+ values.

If there are multiple numbered parameters in the wikitext for the same
position, MediaWiki will only use the last parameter value.
@@ -1599,43 +1607,9 @@
To replicate that behaviour, enable both remove_disabled_parts and strip.

@param text: The wikitext from which templates are extracted
- @param remove_disabled_parts: Remove disabled wikitext such as comments
- and pre. If None (default), this is enabled when neither
- mwparserfromhell not wikitextparser package is available and
- disabled otherwise.
- @param strip: if enabled, strip arguments and values of templates.
- If None (default), this is enabled when neither mwparserfromhell
- nor wikitextparser package is available and disabled otherwise.
- @return: list of template name and params
- """
- use_regex = isinstance(wikitextparser, ImportError)
-
- if remove_disabled_parts is None:
- remove_disabled_parts = use_regex
- if remove_disabled_parts:
- text = removeDisabledParts(text)
-
- if strip is None:
- strip = use_regex
-
- if use_regex:
- return extract_templates_and_params_regex(text, False, strip)
- return _extract_templates_and_params_parser(text, strip)
-
-
-def _extract_templates_and_params_parser(text: str,
- strip: bool = False) -> ETPType:
- """
- Extract templates with params using mwparserfromhell.
-
- This function should not be called directly.
-
- Use extract_templates_and_params, which will select this parser
- implementation if the mwparserfromhell or wikitextparser package is
- installed.
-
- @param text: The wikitext from which templates are extracted
- @param strip: if enabled, strip arguments and values of templates
+ @param remove_disabled_parts: If enabled, remove disabled wikitext
+ such as comments and pre.
+ @param strip: If enabled, strip arguments and values of templates.
@return: list of template name and params
"""
def explicit(param):
@@ -1645,6 +1619,9 @@
attr = not param.positional
return attr

+ if remove_disabled_parts:
+ text = removeDisabledParts(text)
+
parser_name = wikitextparser.__name__
pywikibot.log('Using {!r} wikitext parser'.format(parser_name))

@@ -1683,20 +1660,21 @@
future_warning=True)
def extract_templates_and_params_mwpfh(text: str,
strip: bool = False) -> ETPType:
- """Extract templates with params using mwparserfromhell."""
+ """DEPRECATED. Extract templates with params using mwparserfromhell."""
global wikitextparser
saved_parser = wikitextparser
import mwparserfromhell as wikitextparser
- result = _extract_templates_and_params_parser(text, strip)
+ result = extract_templates_and_params(text, strip=strip)
wikitextparser = saved_parser
return result


+@deprecated('extract_templates_and_params', since='20210331',
+ future_warning=True)
def extract_templates_and_params_regex(text: str,
remove_disabled_parts: bool = True,
strip: bool = True) -> ETPType:
- """
- Extract templates with params using a regex with additional processing.
+ """DEPRECATED. Extract templates with params using a regex.

This function should not be called directly.

diff --git a/requirements.txt b/requirements.txt
index 8b64aec..2a6cbc4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,6 @@
# It is organised so that simple requirements
# are processed first, and more difficult packages
# are last.
-# All dependencies other than requests are optional.
#
# It is good practise to install packages using the system
# package manager if it has a packaged version. If you are
@@ -18,12 +17,18 @@
# or
# $ awk -F '[#>=]' '{print $1}' requirements.txt | xargs apt-cache search

-# mandatory
+# mandatory dependencies, others are optional
requests>=2.20.1, < 2.26.0; python_version < '3.6'
requests>=2.20.1 ; python_version >= '3.6'
setuptools>=20.2, !=50.0.0, <50.2.0 ; python_version < '3.6'
setuptools>=20.2 ; python_version >= '3.6'

+# MediaWiki markup parser
+# mwparserfromhell is default, wikitextparser can be used instead
+# mwparserfromhell is still required for commons_information.py and patrol.py
+# wikitextparser>=0.47.0
+mwparserfromhell>=0.5.0
+
# OAuth support
# mwoauth 0.2.4 is needed because it supports getting identity information
# about the user
@@ -42,9 +47,6 @@
google >= 1.7
sseclient >= 0.0.18,!=0.0.23,!=0.0.24

-# textlib.py, commons_information and patrol.py
-mwparserfromhell>=0.5.0
-
# The mysql generator in pagegenerators depends on PyMySQL
PyMySQL >= 0.6.7, < 1.0.0 ; python_version < '3.6'
PyMySQL >= 1.0.0 ; python_version >= '3.6'
diff --git a/setup.py b/setup.py
index 6b34da3..2ae5ca7 100644
--- a/setup.py
+++ b/setup.py
@@ -62,6 +62,7 @@
'Graphviz': ['pydot>=1.2'],
'Google': ['google>=1.7'],
'mwparserfromhell': ['mwparserfromhell>=0.5.0'],
+ 'wikitextparser': ['wikitextparser>=0.47.0'],
'Tkinter': [ # vulnerability found in Pillow<8.1.1
'Pillow>=8.1.1;python_version>="3.6"',
],
@@ -110,7 +111,7 @@
'setuptools>=20.2, !=50.0.0, <50.2.0 ; python_version < "3.6"',
'setuptools>=20.2 ; python_version >= "3.6"',
]
-
+# in addition either mwparserfromhell or wikitextparser is required

# ------- setup tests_require ------- #
test_deps = ['mock']
diff --git a/tests/__init__.py b/tests/__init__.py
index 04f61c3..596f1ce 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -24,7 +24,8 @@
# Verify that the unit tests have a base working environment:
# - requests is mandatory
# however if unavailable this will fail on use; see pywikibot/tools.py
-# - mwparserfromhell is optional, so is only imported in textlib_tests
+# - mwparserfromhell or wikitextparser is mandatory but the dependency
+# is checked by textlib already
import requests # noqa: F401

import pywikibot.data.api
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index dadc2a3..aed174f 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -23,8 +23,10 @@
from pywikibot import UnknownSite

from tests.aspects import (
- require_modules, TestCase, DefaultDrySiteTestCase,
- PatchingTestCase, SiteAttributeTestCase,
+ DefaultDrySiteTestCase,
+ require_modules,
+ SiteAttributeTestCase,
+ TestCase,
)
from tests import mock

@@ -302,7 +304,7 @@
'Invalid category title extracted: nasty{{{!}}')


-WARNING_MSG = (r'.*extract_templates_and_params_mwpfh .*'
+WARNING_MSG = (r'.*extract_templates_and_params_.*'
r'is deprecated for .*; use extract_templates_and_params')


@@ -502,7 +504,7 @@
@require_modules('mwparserfromhell')
def test_extract_templates_params_parser_stripped(self):
"""Test using mwparserfromhell with stripping."""
- func = functools.partial(textlib._extract_templates_and_params_parser,
+ func = functools.partial(textlib.extract_templates_and_params,
strip=True)

self._common_results(func)
@@ -535,37 +537,39 @@
"""Test using many complex regexes."""
func = functools.partial(textlib.extract_templates_and_params_regex,
remove_disabled_parts=False, strip=False)
- self._common_results(func)
- self._order_differs(func)
- self._unstripped(func)
-
- self.assertEqual(func('{{a|b={} }}'), []) # FIXME: {} is normal text
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self._common_results(func)
+ self._order_differs(func)
+ self._unstripped(func)
+ # FIXME: {} is normal text
+ self.assertEqual(func('{{a|b={} }}'), [])

def test_extract_templates_params_regex_stripped(self):
"""Test using many complex regexes with stripping."""
func = textlib.extract_templates_and_params_regex
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self._common_results(func)
+ self._order_differs(func)
+ self._stripped(func)

- self._common_results(func)
- self._order_differs(func)
- self._stripped(func)
+ self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
+ [('a', OrderedDict((('b', ''), )))])

- self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
- [('a', OrderedDict((('b', ''), )))])
+ # Identical to mwpfh
+ self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+ [('c', OrderedDict((('1', '{{d}}'), ))),
+ ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
+ ('d', OrderedDict())
+ ])

- # Identical to mwpfh
- self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
- [('c', OrderedDict((('1', '{{d}}'), ))),
- ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
- ('d', OrderedDict())
- ])
+ # However fails to correctly handle three levels of balanced
+ # brackets with empty parameters
+ self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+ [('c', OrderedDict((('1', '{{d|}}}'), ))),
+ ('d', OrderedDict([('1', '}')]))
+ ])

- # However fails to correctly handle three levels of balanced brackets
- # with empty parameters
- self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
- [('c', OrderedDict((('1', '{{d|}}}'), ))),
- ('d', OrderedDict([('1', '}')]))
- ])
-
+ @require_modules('mwparserfromhell')
def test_extract_templates_params(self):
"""Test that the normal entry point works."""
func = functools.partial(textlib.extract_templates_and_params,
@@ -699,79 +703,6 @@
self.assertTrue(m.group(0).endswith('foo {{bar}}'))


-class TestGenericTemplateParams(PatchingTestCase):
-
- """Test whether the generic function forwards the call correctly."""
-
- net = False
-
- @PatchingTestCase.patched(textlib, '_extract_templates_and_params_parser')
- def extract_mwpfh(self, text, *args, **kwargs):
- """Patched call to extract_templates_and_params_mwpfh."""
- self._text = text
- self._args = args
- self._mwpfh = True
-
- @PatchingTestCase.patched(textlib, 'extract_templates_and_params_regex')
- def extract_regex(self, text, *args, **kwargs):
- """Patched call to extract_templates_and_params_regex."""
- self._text = text
- self._args = args
- self._mwpfh = False
-
- def test_removing_disabled_parts_regex(self):
- """Test removing disabled parts when using the regex variant."""
- self.patch(textlib, 'wikitextparser', ImportError())
- textlib.extract_templates_and_params('{{a<!-- -->}}', True)
- self.assertEqual(self._text, '{{a}}')
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}', False)
- self.assertEqual(self._text, '{{a<!-- -->}}')
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}')
- self.assertEqual(self._text, '{{a}}')
- self.assertFalse(self._mwpfh)
-
- @require_modules('mwparserfromhell')
- def test_removing_disabled_parts_mwpfh(self):
- """Test removing disabled parts when using the mwpfh variant."""
- textlib.extract_templates_and_params('{{a<!-- -->}}', True)
- self.assertEqual(self._text, '{{a}}')
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}', False)
- self.assertEqual(self._text, '{{a<!-- -->}}')
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}')
- self.assertEqual(self._text, '{{a<!-- -->}}')
- self.assertTrue(self._mwpfh)
-
- def test_strip_regex(self):
- """Test stripping values when using the regex variant."""
- self.patch(textlib, 'wikitextparser', ImportError())
- textlib.extract_templates_and_params('{{a| foo }}', False, True)
- self.assertEqual(self._args, (False, True))
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}', False, False)
- self.assertEqual(self._args, (False, False))
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}', False)
- self.assertEqual(self._args, (False, True))
- self.assertFalse(self._mwpfh)
-
- @require_modules('mwparserfromhell')
- def test_strip_mwpfh(self):
- """Test stripping values when using the mwpfh variant."""
- textlib.extract_templates_and_params('{{a| foo }}', None, True)
- self.assertEqual(self._args, (True, ))
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}', None, False)
- self.assertEqual(self._args, (False, ))
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}')
- self.assertEqual(self._args, (False, ))
- self.assertTrue(self._mwpfh)
-
-
class TestDisabledParts(DefaultDrySiteTestCase):

"""Test the removeDisabledParts function in textlib."""
diff --git a/tox.ini b/tox.ini
index d68e21d..cb76128 100644
--- a/tox.ini
+++ b/tox.ini
@@ -38,10 +38,17 @@
fasttest: pytest-attrib>=0.1.3
fasttest: pytest-subtests >= 0.3.2
fasttest: mock
+ fasttest: .[mwparserfromhell]
fasttest: .[scripts]

- fasttest-py35: mwparserfromhell
- fasttest-py35: beautifulsoup4
+ fasttest-py35: .[html]
+ fasttest-py37: .[wikitextparser]
+
+ deeptest: .[html]
+ deeptest: .[mwparserfromhell]
+ deeptest: .[scripts]
+ deeptest: .[wikitextparser]
+

[testenv:commit-message]
basepython = python3
@@ -56,6 +63,7 @@
nosetests --with-doctest pywikibot {[params]doctest_skip}
deps =
nose
+ .[mwparserfromhell]

[testenv:venv]
commands = {posargs}

To view, visit change 675170. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie35c6399616d3fbe660b3387f16146d939fd51b7
Gerrit-Change-Number: 675170
Gerrit-PatchSet: 13
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: DannyS712 <dannys712.wiki@gmail.com>
Gerrit-Reviewer: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Dalba <dalba.wiki@gmail.com>
Gerrit-CC: Dvorapa <dvorapa@seznam.cz>
Gerrit-MessageType: merged