jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/675170 )
Change subject: [parser] Make mwparserfromhell or wikitextparser mandatory
......................................................................
[parser] Make mwparserfromhell or wikitextparser mandatory
- check whether mwparserfromhell or wikitextparser is installed and
  raise ImportError if no package is available
- catch the ImportError in pwb wrapper like for other dependency checks
- deprecate extract_templates_and_params_regex
- update tests and remove TestGenericTemplateParams because dispatcher
  was removed
- add wikitextparser to setup.py extra_deps
- update tox.ini
- update some documentation hints
Bug: T106763
Change-Id: Ie35c6399616d3fbe660b3387f16146d939fd51b7
---
M README.rst
M pwb.py
M pywikibot/textlib.py
M requirements.txt
M setup.py
M tests/__init__.py
M tests/textlib_tests.py
M tox.ini
8 files changed, 100 insertions(+), 165 deletions(-)
Approvals:
  JJMC89: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/README.rst b/README.rst index 6b1a91f..8872f7d 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,7 @@
::
+ pip install requests git clone https://gerrit.wikimedia.org/r/pywikibot/core.git cd core git submodule update --init @@ -56,6 +57,18 @@ pip install -U setuptools pip install pywikibot
+In addition a MediaWiki markup parser is required. Please install one of them: + +:: + + pip install mwparserfromhell + +or + +:: + + pip install wikitextparser + Our `installation guide https://www.mediawiki.org/wiki/Manual:Pywikibot/Installation`_ has more details for advanced usage. diff --git a/pwb.py b/pwb.py index 80bbc7e..95abb09 100755 --- a/pwb.py +++ b/pwb.py @@ -14,7 +14,7 @@
python pwb.py -lang:de bot_tests -v """ -# (C) Pywikibot team, 2012-2020 +# (C) Pywikibot team, 2012-2021 # # Distributed under the terms of the MIT license. # @@ -182,9 +182,8 @@ try: if not check_modules(): raise RuntimeError('') # no further output needed -except RuntimeError as e: - print(e) - sys.exit() +except RuntimeError as e: # setup.py may also raise RuntimeError + sys.exit(e)
from pathlib import Path # noqa: E402
@@ -213,6 +212,8 @@ # we need to re-start the entire process. Ask the user to do so. print('Now, you have to re-execute the command to start your script.') sys.exit(1) +except ImportError as e: # raised in textlib + sys.exit(e)
def find_alternates(filename, script_paths): diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 2ee62d1..8807069 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -35,8 +35,18 @@ except ImportError: try: import mwparserfromhell as wikitextparser - except ImportError as e: - wikitextparser = e + except ImportError: + # print required because pywikibot is not imported completely + raise ImportError(""" +Pywikibot is missing a MediaWiki markup parser which is necessary. +Please update the required module with either + + pip install "mwparserfromhell>=0.5.0" + +or + + pip install "wikitextparser>=0.47.0" +""") from None
ETPType = List[Tuple[str, OrderedDictType[str, str]]]
@@ -1570,8 +1580,8 @@ # --------------------------------
def extract_templates_and_params(text: str, - remove_disabled_parts: Optional[bool] = None, - strip: Optional[bool] = None) -> ETPType: + remove_disabled_parts: bool = False, + strip: bool = False) -> ETPType: """Return a list of templates found in text.
Return value is a list of tuples. There is one tuple for each use of a @@ -1582,16 +1592,14 @@ parameters, and if this results multiple parameters with the same name only the last value provided will be returned.
- This uses the package L{mwparserfromhell} (mwpfh) if it is installed. - Otherwise it falls back on a regex based implementation. + This uses the package L{mwparserfromhell} or L{wikitextparser} as + MediaWiki markup parser. It is mandatory that one of them is + installed.
There are minor differences between the two implementations.
- The two implementations return nested templates in a different order. - i.e. for {{a|b={{c}}}}, mwpfh returns [a, c], whereas regex returns [c, a]. - - mwpfh preserves whitespace in parameter names and values. regex excludes - anything between <!-- --> before parsing the text. + The parser packages preserves whitespace in parameter names and + values.
If there are multiple numbered parameters in the wikitext for the same position, MediaWiki will only use the last parameter value. @@ -1599,43 +1607,9 @@ To replicate that behaviour, enable both remove_disabled_parts and strip.
@param text: The wikitext from which templates are extracted - @param remove_disabled_parts: Remove disabled wikitext such as comments - and pre. If None (default), this is enabled when neither - mwparserfromhell not wikitextparser package is available and - disabled otherwise. - @param strip: if enabled, strip arguments and values of templates. - If None (default), this is enabled when neither mwparserfromhell - nor wikitextparser package is available and disabled otherwise. - @return: list of template name and params - """ - use_regex = isinstance(wikitextparser, ImportError) - - if remove_disabled_parts is None: - remove_disabled_parts = use_regex - if remove_disabled_parts: - text = removeDisabledParts(text) - - if strip is None: - strip = use_regex - - if use_regex: - return extract_templates_and_params_regex(text, False, strip) - return _extract_templates_and_params_parser(text, strip) - - -def _extract_templates_and_params_parser(text: str, - strip: bool = False) -> ETPType: - """ - Extract templates with params using mwparserfromhell. - - This function should not be called directly. - - Use extract_templates_and_params, which will select this parser - implementation if the mwparserfromhell or wikitextparser package is - installed. - - @param text: The wikitext from which templates are extracted - @param strip: if enabled, strip arguments and values of templates + @param remove_disabled_parts: If enabled, remove disabled wikitext + such as comments and pre. + @param strip: If enabled, strip arguments and values of templates. @return: list of template name and params """ def explicit(param): @@ -1645,6 +1619,9 @@ attr = not param.positional return attr
+ if remove_disabled_parts: + text = removeDisabledParts(text) + parser_name = wikitextparser.__name__ pywikibot.log('Using {!r} wikitext parser'.format(parser_name))
@@ -1683,20 +1660,21 @@ future_warning=True) def extract_templates_and_params_mwpfh(text: str, strip: bool = False) -> ETPType: - """Extract templates with params using mwparserfromhell.""" + """DEPRECATED. Extract templates with params using mwparserfromhell.""" global wikitextparser saved_parser = wikitextparser import mwparserfromhell as wikitextparser - result = _extract_templates_and_params_parser(text, strip) + result = extract_templates_and_params(text, strip=strip) wikitextparser = saved_parser return result
+@deprecated('extract_templates_and_params', since='20210331', + future_warning=True) def extract_templates_and_params_regex(text: str, remove_disabled_parts: bool = True, strip: bool = True) -> ETPType: - """ - Extract templates with params using a regex with additional processing. + """DEPRECATED. Extract templates with params using a regex.
This function should not be called directly.
diff --git a/requirements.txt b/requirements.txt index 8b64aec..2a6cbc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,6 @@ # It is organised so that simple requirements # are processed first, and more difficult packages # are last. -# All dependencies other than requests are optional. # # It is good practise to install packages using the system # package manager if it has a packaged version. If you are @@ -18,12 +17,18 @@ # or # $ awk -F '[#>=]' '{print $1}' requirements.txt | xargs apt-cache search
-# mandatory +# mandatory dependencies, others are optional requests>=2.20.1, < 2.26.0; python_version < '3.6' requests>=2.20.1 ; python_version >= '3.6' setuptools>=20.2, !=50.0.0, <50.2.0 ; python_version < '3.6' setuptools>=20.2 ; python_version >= '3.6'
+# MediaWiki markup parser +# mwparserfromhell is default, wikitextparser can be used instead +# mwparserfromhell is still required for commons_information.py and patrol.py +# wikitextparser>=0.47.0 +mwparserfromhell>=0.5.0 + # OAuth support # mwoauth 0.2.4 is needed because it supports getting identity information # about the user @@ -42,9 +47,6 @@ google >= 1.7 sseclient >= 0.0.18,!=0.0.23,!=0.0.24
-# textlib.py, commons_information and patrol.py -mwparserfromhell>=0.5.0 - # The mysql generator in pagegenerators depends on PyMySQL PyMySQL >= 0.6.7, < 1.0.0 ; python_version < '3.6' PyMySQL >= 1.0.0 ; python_version >= '3.6' diff --git a/setup.py b/setup.py index 6b34da3..2ae5ca7 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ 'Graphviz': ['pydot>=1.2'], 'Google': ['google>=1.7'], 'mwparserfromhell': ['mwparserfromhell>=0.5.0'], + 'wikitextparser': ['wikitextparser>=0.47.0'], 'Tkinter': [ # vulnerability found in Pillow<8.1.1 'Pillow>=8.1.1;python_version>="3.6"', ], @@ -110,7 +111,7 @@ 'setuptools>=20.2, !=50.0.0, <50.2.0 ; python_version < "3.6"', 'setuptools>=20.2 ; python_version >= "3.6"', ] - +# in addition either mwparserfromhell or wikitextparser is required
# ------- setup tests_require ------- # test_deps = ['mock'] diff --git a/tests/__init__.py b/tests/__init__.py index 04f61c3..596f1ce 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -24,7 +24,8 @@ # Verify that the unit tests have a base working environment: # - requests is mandatory # however if unavailable this will fail on use; see pywikibot/tools.py -# - mwparserfromhell is optional, so is only imported in textlib_tests +# - mwparserfromhell or wikitextparser is mandatory but the dependency +# is checked by textlib already import requests # noqa: F401
import pywikibot.data.api diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index dadc2a3..aed174f 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -23,8 +23,10 @@ from pywikibot import UnknownSite
from tests.aspects import ( - require_modules, TestCase, DefaultDrySiteTestCase, - PatchingTestCase, SiteAttributeTestCase, + DefaultDrySiteTestCase, + require_modules, + SiteAttributeTestCase, + TestCase, ) from tests import mock
@@ -302,7 +304,7 @@ 'Invalid category title extracted: nasty{{{!}}')
-WARNING_MSG = (r'.*extract_templates_and_params_mwpfh .*' +WARNING_MSG = (r'.*extract_templates_and_params_.*' r'is deprecated for .*; use extract_templates_and_params')
@@ -502,7 +504,7 @@ @require_modules('mwparserfromhell') def test_extract_templates_params_parser_stripped(self): """Test using mwparserfromhell with stripping.""" - func = functools.partial(textlib._extract_templates_and_params_parser, + func = functools.partial(textlib.extract_templates_and_params, strip=True)
self._common_results(func) @@ -535,37 +537,39 @@ """Test using many complex regexes.""" func = functools.partial(textlib.extract_templates_and_params_regex, remove_disabled_parts=False, strip=False) - self._common_results(func) - self._order_differs(func) - self._unstripped(func) - - self.assertEqual(func('{{a|b={} }}'), []) # FIXME: {} is normal text + with suppress_warnings(WARNING_MSG, category=FutureWarning): + self._common_results(func) + self._order_differs(func) + self._unstripped(func) + # FIXME: {} is normal text + self.assertEqual(func('{{a|b={} }}'), [])
def test_extract_templates_params_regex_stripped(self): """Test using many complex regexes with stripping.""" func = textlib.extract_templates_and_params_regex + with suppress_warnings(WARNING_MSG, category=FutureWarning): + self._common_results(func) + self._order_differs(func) + self._stripped(func)
- self._common_results(func) - self._order_differs(func) - self._stripped(func) + self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), + [('a', OrderedDict((('b', ''), )))])
- self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'), - [('a', OrderedDict((('b', ''), )))]) + # Identical to mwpfh + self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), + [('c', OrderedDict((('1', '{{d}}'), ))), + ('a', OrderedDict([('1', '{{c|{{d}}}}')])), + ('d', OrderedDict()) + ])
- # Identical to mwpfh - self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'), - [('c', OrderedDict((('1', '{{d}}'), ))), - ('a', OrderedDict([('1', '{{c|{{d}}}}')])), - ('d', OrderedDict()) - ]) + # However fails to correctly handle three levels of balanced + # brackets with empty parameters + self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'), + [('c', OrderedDict((('1', '{{d|}}}'), ))), + ('d', OrderedDict([('1', '}')])) + ])
- # However fails to correctly handle three levels of balanced brackets - # with empty parameters - self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'), - [('c', OrderedDict((('1', '{{d|}}}'), ))), - ('d', OrderedDict([('1', '}')])) - ]) - + @require_modules('mwparserfromhell') def test_extract_templates_params(self): """Test that the normal entry point works.""" func = functools.partial(textlib.extract_templates_and_params, @@ -699,79 +703,6 @@ self.assertTrue(m.group(0).endswith('foo {{bar}}'))
-class TestGenericTemplateParams(PatchingTestCase): - - """Test whether the generic function forwards the call correctly.""" - - net = False - - @PatchingTestCase.patched(textlib, '_extract_templates_and_params_parser') - def extract_mwpfh(self, text, *args, **kwargs): - """Patched call to extract_templates_and_params_mwpfh.""" - self._text = text - self._args = args - self._mwpfh = True - - @PatchingTestCase.patched(textlib, 'extract_templates_and_params_regex') - def extract_regex(self, text, *args, **kwargs): - """Patched call to extract_templates_and_params_regex.""" - self._text = text - self._args = args - self._mwpfh = False - - def test_removing_disabled_parts_regex(self): - """Test removing disabled parts when using the regex variant.""" - self.patch(textlib, 'wikitextparser', ImportError()) - textlib.extract_templates_and_params('{{a<!-- -->}}', True) - self.assertEqual(self._text, '{{a}}') - self.assertFalse(self._mwpfh) - textlib.extract_templates_and_params('{{a<!-- -->}}', False) - self.assertEqual(self._text, '{{a<!-- -->}}') - self.assertFalse(self._mwpfh) - textlib.extract_templates_and_params('{{a<!-- -->}}') - self.assertEqual(self._text, '{{a}}') - self.assertFalse(self._mwpfh) - - @require_modules('mwparserfromhell') - def test_removing_disabled_parts_mwpfh(self): - """Test removing disabled parts when using the mwpfh variant.""" - textlib.extract_templates_and_params('{{a<!-- -->}}', True) - self.assertEqual(self._text, '{{a}}') - self.assertTrue(self._mwpfh) - textlib.extract_templates_and_params('{{a<!-- -->}}', False) - self.assertEqual(self._text, '{{a<!-- -->}}') - self.assertTrue(self._mwpfh) - textlib.extract_templates_and_params('{{a<!-- -->}}') - self.assertEqual(self._text, '{{a<!-- -->}}') - self.assertTrue(self._mwpfh) - - def test_strip_regex(self): - """Test stripping values when using the regex variant.""" - self.patch(textlib, 'wikitextparser', ImportError()) - textlib.extract_templates_and_params('{{a| foo }}', False, True) - 
self.assertEqual(self._args, (False, True)) - self.assertFalse(self._mwpfh) - textlib.extract_templates_and_params('{{a| foo }}', False, False) - self.assertEqual(self._args, (False, False)) - self.assertFalse(self._mwpfh) - textlib.extract_templates_and_params('{{a| foo }}', False) - self.assertEqual(self._args, (False, True)) - self.assertFalse(self._mwpfh) - - @require_modules('mwparserfromhell') - def test_strip_mwpfh(self): - """Test stripping values when using the mwpfh variant.""" - textlib.extract_templates_and_params('{{a| foo }}', None, True) - self.assertEqual(self._args, (True, )) - self.assertTrue(self._mwpfh) - textlib.extract_templates_and_params('{{a| foo }}', None, False) - self.assertEqual(self._args, (False, )) - self.assertTrue(self._mwpfh) - textlib.extract_templates_and_params('{{a| foo }}') - self.assertEqual(self._args, (False, )) - self.assertTrue(self._mwpfh) - - class TestDisabledParts(DefaultDrySiteTestCase):
"""Test the removeDisabledParts function in textlib.""" diff --git a/tox.ini b/tox.ini index d68e21d..cb76128 100644 --- a/tox.ini +++ b/tox.ini @@ -38,10 +38,17 @@ fasttest: pytest-attrib>=0.1.3 fasttest: pytest-subtests >= 0.3.2 fasttest: mock + fasttest: .[mwparserfromhell] fasttest: .[scripts]
- fasttest-py35: mwparserfromhell - fasttest-py35: beautifulsoup4 + fasttest-py35: .[html] + fasttest-py37: .[wikitextparser] + + deeptest: .[html] + deeptest: .[mwparserfromhell] + deeptest: .[scripts] + deeptest: .[wikitextparser] +
[testenv:commit-message] basepython = python3 @@ -56,6 +63,7 @@ nosetests --with-doctest pywikibot {[params]doctest_skip} deps = nose + .[mwparserfromhell]
[testenv:venv] commands = {posargs}
pywikibot-commits@lists.wikimedia.org