jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/754066 )
Change subject: Avoid non-deteministic behavior in removeDisableParts ......................................................................
Avoid non-deteministic behavior in removeDisableParts
The order of iteration over a set can change everytime a new Python process is run because the string hashes are different.[1] This means the function can apply replacements in different order which may result in different output. So don't cast the sequence to a set and use any sets at all, it isn't really necessary.
Add a regression test.
[1] https://docs.python.org/3/reference/datamodel.html#object.__hash__
Change-Id: If66173a7301d308b7addd7f63c7a1d2a2771abbc --- M pywikibot/textlib.py M tests/textlib_tests.py 2 files changed, 28 insertions(+), 4 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index c8d89b6..fdffe94 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -457,13 +457,20 @@ :type site: pywikibot.Site
:return: text stripped from disabled parts. + .. note:: the order of removals will correspond to the tags argument + if provided as an ordered sequence (list, tuple) """ if not tags: - tags = {'comment', 'includeonly', 'nowiki', 'pre', 'syntaxhighlight'} - else: - tags = set(tags) + tags = ['comment', 'includeonly', 'nowiki', 'pre', 'syntaxhighlight'] + # avoid set(tags) because sets are internally ordered using the hash + # which for strings is salted per Python process => the output of + # this function would likely be different per script run because + # the replacements would be done in different order and the disabled + # parts may overlap and suppress each other + # see https://docs.python.org/3/reference/datamodel.html#object.__hash__ + # ("Note" at the end of the section) if include: - tags -= set(include) + tags = [tag for tag in tags if tag not in include] regexes = _get_regexes(tags, site) for regex in regexes: text = regex.sub('', text) diff --git a/tests/textlib_tests.py b/tests/textlib_tests.py index f52dc4d..5b1bcec 100644 --- a/tests/textlib_tests.py +++ b/tests/textlib_tests.py @@ -657,6 +657,23 @@ self.assertEqual( textlib.removeDisabledParts(pattern, tags=[test]), '')
+ def test_remove_disabled_parts_include(self): + """Test removeDisabledParts function with the include argument.""" + text = 'text <nowiki>tag</nowiki> text' + self.assertEqual( + textlib.removeDisabledParts(text, include=['nowiki']), text) + + def test_remove_disabled_parts_order(self): + """Test the order of the replacements in removeDisabledParts.""" + text = 'text <ref>This is a reference.</ref> text' + regex = re.compile('</?ref>') + self.assertEqual( + textlib.removeDisabledParts(text, tags=['ref', regex]), + 'text text') + self.assertEqual( + textlib.removeDisabledParts(text, tags=[regex, 'ref']), + 'text This is a reference. text') +
class TestReplaceLinks(TestCase):
pywikibot-commits@lists.wikimedia.org