jenkins-bot submitted this change.

View Change

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
[parser] Deprecate textlib usage without MediaWiki markup parser

mwparserfromhell or wikitextparser will be mandatory in the near future.

- check whether mwparserfromhell or wikitextparser is installed and
show a FutureWarning in extract_templates_and_params if neither package
is available
- catch an ImportError in the pwb wrapper, as is done for other
dependency checks. The ImportError will be raised once a parser is
mandatory.
- deprecate extract_templates_and_params_regex
- update tests and remove TestGenericTemplateParams because the
dispatcher will be removed. Don't test the outdated regex parser anymore.
- add wikitextparser to setup.py extra_deps
- update tox.ini
- update some documentation hints

Patch detached from 908abeb94b2ee

Bug: T106763
Change-Id: I967698f79d958f233becb6e7a60336361f57c0f5
---
M README.rst
M pwb.py
M pywikibot/textlib.py
M requirements.txt
M setup.py
M tests/__init__.py
M tests/textlib_tests.py
M tox.ini
8 files changed, 111 insertions(+), 137 deletions(-)

diff --git a/README.rst b/README.rst
index 6b1a91f..8872f7d 100644
--- a/README.rst
+++ b/README.rst
@@ -44,6 +44,7 @@

::

+ pip install requests
git clone https://gerrit.wikimedia.org/r/pywikibot/core.git
cd core
git submodule update --init
@@ -56,6 +57,18 @@
pip install -U setuptools
pip install pywikibot

+In addition a MediaWiki markup parser is required. Please install one of them:
+
+::
+
+ pip install mwparserfromhell
+
+or
+
+::
+
+ pip install wikitextparser
+
Our `installation
guide <https://www.mediawiki.org/wiki/Manual:Pywikibot/Installation>`_
has more details for advanced usage.
diff --git a/pwb.py b/pwb.py
index 80bbc7e..95abb09 100755
--- a/pwb.py
+++ b/pwb.py
@@ -14,7 +14,7 @@

python pwb.py -lang:de bot_tests -v
"""
-# (C) Pywikibot team, 2012-2020
+# (C) Pywikibot team, 2012-2021
#
# Distributed under the terms of the MIT license.
#
@@ -182,9 +182,8 @@
try:
if not check_modules():
raise RuntimeError('') # no further output needed
-except RuntimeError as e:
- print(e)
- sys.exit()
+except RuntimeError as e: # setup.py may also raise RuntimeError
+ sys.exit(e)

from pathlib import Path # noqa: E402

@@ -213,6 +212,8 @@
# we need to re-start the entire process. Ask the user to do so.
print('Now, you have to re-execute the command to start your script.')
sys.exit(1)
+except ImportError as e: # raised in textlib
+ sys.exit(e)


def find_alternates(filename, script_paths):
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 2ee62d1..83f8872 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -1570,8 +1570,8 @@
# --------------------------------

def extract_templates_and_params(text: str,
- remove_disabled_parts: Optional[bool] = None,
- strip: Optional[bool] = None) -> ETPType:
+ remove_disabled_parts: bool = False,
+ strip: bool = False) -> ETPType:
"""Return a list of templates found in text.

Return value is a list of tuples. There is one tuple for each use of a
@@ -1582,16 +1582,19 @@
parameters, and if this results multiple parameters with the same name
only the last value provided will be returned.

- This uses the package L{mwparserfromhell} (mwpfh) if it is installed.
- Otherwise it falls back on a regex based implementation.
+ This uses the package L{mwparserfromhell} or L{wikitextparser} as
+ MediaWiki markup parser. Otherwise it falls back on a regex based
+ implementation but it becomes mandatory that one of them is
+ installed.

There are minor differences between the two implementations.

- The two implementations return nested templates in a different order.
- i.e. for {{a|b={{c}}}}, mwpfh returns [a, c], whereas regex returns [c, a].
+ The two implementations return nested templates in a different
+ order, i.e. for {{a|b={{c}}}}, parsers returns [a, c], whereas regex
+ returns [c, a].

- mwpfh preserves whitespace in parameter names and values. regex excludes
- anything between <!-- --> before parsing the text.
+ The parser packages preserves whitespace in parameter names and
+ values.

If there are multiple numbered parameters in the wikitext for the same
position, MediaWiki will only use the last parameter value.
@@ -1599,27 +1602,32 @@
To replicate that behaviour, enable both remove_disabled_parts and strip.

@param text: The wikitext from which templates are extracted
- @param remove_disabled_parts: Remove disabled wikitext such as comments
- and pre. If None (default), this is enabled when neither
- mwparserfromhell not wikitextparser package is available and
- disabled otherwise.
- @param strip: if enabled, strip arguments and values of templates.
- If None (default), this is enabled when neither mwparserfromhell
- nor wikitextparser package is available and disabled otherwise.
+ @param remove_disabled_parts: If enabled, remove disabled wikitext
+ such as comments and pre.
+ @param strip: If enabled, strip arguments and values of templates.
@return: list of template name and params
"""
use_regex = isinstance(wikitextparser, ImportError)

- if remove_disabled_parts is None:
- remove_disabled_parts = use_regex
if remove_disabled_parts:
text = removeDisabledParts(text)

- if strip is None:
- strip = use_regex
-
if use_regex:
- return extract_templates_and_params_regex(text, False, strip)
+ issue_deprecation_warning("""
+Pywikibot needs a MediaWiki markup parser.
+Please install the requested module with either
+
+ pip install "mwparserfromhell>=0.5.0"
+
+or
+
+ pip install "wikitextparser>=0.47.0"
+
+Using pywikibot without MediaWiki markup parser""",
+ warning_class=FutureWarning,
+ since='20210416')
+
+ return _extract_templates_and_params_regex(text, False, strip)
return _extract_templates_and_params_parser(text, strip)


@@ -1683,7 +1691,7 @@
future_warning=True)
def extract_templates_and_params_mwpfh(text: str,
strip: bool = False) -> ETPType:
- """Extract templates with params using mwparserfromhell."""
+ """DEPRECATED. Extract templates with params using mwparserfromhell."""
global wikitextparser
saved_parser = wikitextparser
import mwparserfromhell as wikitextparser
@@ -1692,11 +1700,12 @@
return result


+@deprecated('extract_templates_and_params', since='20210331',
+ future_warning=True)
def extract_templates_and_params_regex(text: str,
remove_disabled_parts: bool = True,
strip: bool = True) -> ETPType:
- """
- Extract templates with params using a regex with additional processing.
+ """DEPRECATED. Extract templates with params using a regex.

This function should not be called directly.

@@ -1708,6 +1717,14 @@
@param strip: if enabled, strip arguments and values of templates
@return: list of template name and params
"""
+ return _extract_templates_and_params_regex(text, remove_disabled_parts,
+ strip)
+
+
+def _extract_templates_and_params_regex(text: str,
+ remove_disabled_parts: bool = True,
+ strip: bool = True) -> ETPType:
+ """DEPRECATED. Extract templates with params using a regex."""
# remove commented-out stuff etc.
if remove_disabled_parts:
thistxt = removeDisabledParts(text)
diff --git a/requirements.txt b/requirements.txt
index 8b64aec..2a6cbc4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,6 @@
# It is organised so that simple requirements
# are processed first, and more difficult packages
# are last.
-# All dependencies other than requests are optional.
#
# It is good practise to install packages using the system
# package manager if it has a packaged version. If you are
@@ -18,12 +17,18 @@
# or
# $ awk -F '[#>=]' '{print $1}' requirements.txt | xargs apt-cache search

-# mandatory
+# mandatory dependencies, others are optional
requests>=2.20.1, < 2.26.0; python_version < '3.6'
requests>=2.20.1 ; python_version >= '3.6'
setuptools>=20.2, !=50.0.0, <50.2.0 ; python_version < '3.6'
setuptools>=20.2 ; python_version >= '3.6'

+# MediaWiki markup parser
+# mwparserfromhell is default, wikitextparser can be used instead
+# mwparserfromhell is still required for commons_information.py and patrol.py
+# wikitextparser>=0.47.0
+mwparserfromhell>=0.5.0
+
# OAuth support
# mwoauth 0.2.4 is needed because it supports getting identity information
# about the user
@@ -42,9 +47,6 @@
google >= 1.7
sseclient >= 0.0.18,!=0.0.23,!=0.0.24

-# textlib.py, commons_information and patrol.py
-mwparserfromhell>=0.5.0
-
# The mysql generator in pagegenerators depends on PyMySQL
PyMySQL >= 0.6.7, < 1.0.0 ; python_version < '3.6'
PyMySQL >= 1.0.0 ; python_version >= '3.6'
diff --git a/setup.py b/setup.py
index 6b34da3..2ae5ca7 100644
--- a/setup.py
+++ b/setup.py
@@ -62,6 +62,7 @@
'Graphviz': ['pydot>=1.2'],
'Google': ['google>=1.7'],
'mwparserfromhell': ['mwparserfromhell>=0.5.0'],
+ 'wikitextparser': ['wikitextparser>=0.47.0'],
'Tkinter': [ # vulnerability found in Pillow<8.1.1
'Pillow>=8.1.1;python_version>="3.6"',
],
@@ -110,7 +111,7 @@
'setuptools>=20.2, !=50.0.0, <50.2.0 ; python_version < "3.6"',
'setuptools>=20.2 ; python_version >= "3.6"',
]
-
+# in addition either mwparserfromhell or wikitextparser is required

# ------- setup tests_require ------- #
test_deps = ['mock']
diff --git a/tests/__init__.py b/tests/__init__.py
index 04f61c3..e0ac4b6 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -24,7 +24,8 @@
# Verify that the unit tests have a base working environment:
# - requests is mandatory
# however if unavailable this will fail on use; see pywikibot/tools.py
-# - mwparserfromhell is optional, so is only imported in textlib_tests
+# - mwparserfromhell or wikitextparser should be used but the dependency
+# is checked by textlib already
import requests # noqa: F401

import pywikibot.data.api
diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py
index dadc2a3..aed174f 100644
--- a/tests/textlib_tests.py
+++ b/tests/textlib_tests.py
@@ -23,8 +23,10 @@
from pywikibot import UnknownSite

from tests.aspects import (
- require_modules, TestCase, DefaultDrySiteTestCase,
- PatchingTestCase, SiteAttributeTestCase,
+ DefaultDrySiteTestCase,
+ require_modules,
+ SiteAttributeTestCase,
+ TestCase,
)
from tests import mock

@@ -302,7 +304,7 @@
'Invalid category title extracted: nasty{{{!}}')


-WARNING_MSG = (r'.*extract_templates_and_params_mwpfh .*'
+WARNING_MSG = (r'.*extract_templates_and_params_.*'
r'is deprecated for .*; use extract_templates_and_params')


@@ -502,7 +504,7 @@
@require_modules('mwparserfromhell')
def test_extract_templates_params_parser_stripped(self):
"""Test using mwparserfromhell with stripping."""
- func = functools.partial(textlib._extract_templates_and_params_parser,
+ func = functools.partial(textlib.extract_templates_and_params,
strip=True)

self._common_results(func)
@@ -535,37 +537,39 @@
"""Test using many complex regexes."""
func = functools.partial(textlib.extract_templates_and_params_regex,
remove_disabled_parts=False, strip=False)
- self._common_results(func)
- self._order_differs(func)
- self._unstripped(func)
-
- self.assertEqual(func('{{a|b={} }}'), []) # FIXME: {} is normal text
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self._common_results(func)
+ self._order_differs(func)
+ self._unstripped(func)
+ # FIXME: {} is normal text
+ self.assertEqual(func('{{a|b={} }}'), [])

def test_extract_templates_params_regex_stripped(self):
"""Test using many complex regexes with stripping."""
func = textlib.extract_templates_and_params_regex
+ with suppress_warnings(WARNING_MSG, category=FutureWarning):
+ self._common_results(func)
+ self._order_differs(func)
+ self._stripped(func)

- self._common_results(func)
- self._order_differs(func)
- self._stripped(func)
+ self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
+ [('a', OrderedDict((('b', ''), )))])

- self.assertEqual(func('{{a|b=<!--{{{1}}}-->}}'),
- [('a', OrderedDict((('b', ''), )))])
+ # Identical to mwpfh
+ self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
+ [('c', OrderedDict((('1', '{{d}}'), ))),
+ ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
+ ('d', OrderedDict())
+ ])

- # Identical to mwpfh
- self.assertCountEqual(func('{{a|{{c|{{d}}}}}}'),
- [('c', OrderedDict((('1', '{{d}}'), ))),
- ('a', OrderedDict([('1', '{{c|{{d}}}}')])),
- ('d', OrderedDict())
- ])
+ # However fails to correctly handle three levels of balanced
+ # brackets with empty parameters
+ self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
+ [('c', OrderedDict((('1', '{{d|}}}'), ))),
+ ('d', OrderedDict([('1', '}')]))
+ ])

- # However fails to correctly handle three levels of balanced brackets
- # with empty parameters
- self.assertCountEqual(func('{{a|{{c|{{d|}}}}}}'),
- [('c', OrderedDict((('1', '{{d|}}}'), ))),
- ('d', OrderedDict([('1', '}')]))
- ])
-
+ @require_modules('mwparserfromhell')
def test_extract_templates_params(self):
"""Test that the normal entry point works."""
func = functools.partial(textlib.extract_templates_and_params,
@@ -699,79 +703,6 @@
self.assertTrue(m.group(0).endswith('foo {{bar}}'))


-class TestGenericTemplateParams(PatchingTestCase):
-
- """Test whether the generic function forwards the call correctly."""
-
- net = False
-
- @PatchingTestCase.patched(textlib, '_extract_templates_and_params_parser')
- def extract_mwpfh(self, text, *args, **kwargs):
- """Patched call to extract_templates_and_params_mwpfh."""
- self._text = text
- self._args = args
- self._mwpfh = True
-
- @PatchingTestCase.patched(textlib, 'extract_templates_and_params_regex')
- def extract_regex(self, text, *args, **kwargs):
- """Patched call to extract_templates_and_params_regex."""
- self._text = text
- self._args = args
- self._mwpfh = False
-
- def test_removing_disabled_parts_regex(self):
- """Test removing disabled parts when using the regex variant."""
- self.patch(textlib, 'wikitextparser', ImportError())
- textlib.extract_templates_and_params('{{a<!-- -->}}', True)
- self.assertEqual(self._text, '{{a}}')
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}', False)
- self.assertEqual(self._text, '{{a<!-- -->}}')
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}')
- self.assertEqual(self._text, '{{a}}')
- self.assertFalse(self._mwpfh)
-
- @require_modules('mwparserfromhell')
- def test_removing_disabled_parts_mwpfh(self):
- """Test removing disabled parts when using the mwpfh variant."""
- textlib.extract_templates_and_params('{{a<!-- -->}}', True)
- self.assertEqual(self._text, '{{a}}')
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}', False)
- self.assertEqual(self._text, '{{a<!-- -->}}')
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a<!-- -->}}')
- self.assertEqual(self._text, '{{a<!-- -->}}')
- self.assertTrue(self._mwpfh)
-
- def test_strip_regex(self):
- """Test stripping values when using the regex variant."""
- self.patch(textlib, 'wikitextparser', ImportError())
- textlib.extract_templates_and_params('{{a| foo }}', False, True)
- self.assertEqual(self._args, (False, True))
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}', False, False)
- self.assertEqual(self._args, (False, False))
- self.assertFalse(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}', False)
- self.assertEqual(self._args, (False, True))
- self.assertFalse(self._mwpfh)
-
- @require_modules('mwparserfromhell')
- def test_strip_mwpfh(self):
- """Test stripping values when using the mwpfh variant."""
- textlib.extract_templates_and_params('{{a| foo }}', None, True)
- self.assertEqual(self._args, (True, ))
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}', None, False)
- self.assertEqual(self._args, (False, ))
- self.assertTrue(self._mwpfh)
- textlib.extract_templates_and_params('{{a| foo }}')
- self.assertEqual(self._args, (False, ))
- self.assertTrue(self._mwpfh)
-
-
class TestDisabledParts(DefaultDrySiteTestCase):

"""Test the removeDisabledParts function in textlib."""
diff --git a/tox.ini b/tox.ini
index d68e21d..97ebeaf 100644
--- a/tox.ini
+++ b/tox.ini
@@ -40,8 +40,15 @@
fasttest: mock
fasttest: .[scripts]

- fasttest-py35: mwparserfromhell
- fasttest-py35: beautifulsoup4
+ fasttest-py35: .[html]
+ fasttest-py35: .[mwparserfromhell]
+ fasttest-py37: .[wikitextparser]
+
+ deeptest: .[html]
+ deeptest: .[mwparserfromhell]
+ deeptest: .[scripts]
+ deeptest: .[wikitextparser]
+

[testenv:commit-message]
basepython = python3
@@ -56,6 +63,7 @@
nosetests --with-doctest pywikibot {[params]doctest_skip}
deps =
nose
+ .[mwparserfromhell]

[testenv:venv]
commands = {posargs}

To view, visit change 680294. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I967698f79d958f233becb6e7a60336361f57c0f5
Gerrit-Change-Number: 680294
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged