jenkins-bot submitted this change.

View Change


Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR] decrease nested flow statements in textlib.replaceExcept

Also add type hints and update documentation

Change-Id: Iaf8e7927f574c188cf2cfff78365c0883b66aea3
---
M pywikibot/textlib.py
1 file changed, 87 insertions(+), 58 deletions(-)

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 2ed436e..7439253 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -16,7 +16,14 @@
from typing import NamedTuple, Optional, Union

import pywikibot
-from pywikibot.backports import Container, Dict, Iterable, List
+from pywikibot.backports import (
+ Callable,
+ Container,
+ Dict,
+ Iterable,
+ Match,
+ List,
+)
from pywikibot.backports import OrderedDict as OrderedDictType
from pywikibot.backports import Pattern
from pywikibot.backports import Sequence as SequenceType
@@ -352,30 +359,38 @@
return result


-def replaceExcept(text: str, old, new, exceptions: list,
- caseInsensitive: bool = False, allowoverlap: bool = False,
- marker: str = '', site=None, count: int = 0) -> str:
+def replaceExcept(text: str,
+ old: Union[str, Pattern[str]],
+ new: Union[str, Callable[[Match[str]], str]],
+ exceptions: List[Union[str, Pattern[str]]],
+ caseInsensitive: bool = False,
+ allowoverlap: bool = False,
+ marker: str = '',
+ site: Optional['pywikibot.site.BaseSite'] = None,
+ count: int = 0) -> str:
"""
- Return text with 'old' replaced by 'new', ignoring specified types of text.
+ Return text with *old* replaced by *new*, ignoring specified types of text.

- Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
- HTML comments. If caseInsensitive is true, then use case insensitive
- regex matching. If allowoverlap is true, overlapping occurrences are all
- replaced (watch out when using this, it might lead to infinite loops!).
+ Skip occurrences of *old* within *exceptions*; e.g. within nowiki
+ tags or HTML comments. If *caseInsensitive* is true, then use case
+ insensitive regex matching. If *allowoverlap* is true, overlapping
+ occurrences are all replaced.
+
+ .. caution:: Watch out when using *allowoverlap*, it might lead to
+ infinite loops!

:param text: text to be modified
:param old: a compiled or uncompiled regular expression
- :param new: a unicode string (which can contain regular
- expression references), or a function which takes
- a match object as parameter. See parameter repl of
- re.sub().
+ :param new: a string (which can contain regular expression
+ references), or a function which takes a match object as
+ parameter. See parameter *repl* of ``re.sub()``.
:param exceptions: a list of strings or already compiled regex
- objects which signal what to leave out. Strings might be like
- ['math', 'table', 'template'] for example.
+ objects which signal what to leave out. List of strings might be
+ like ``['math', 'table', 'template']`` for example.
:param marker: a string that will be added to the last replacement;
if nothing is changed, it is added at the end
:param count: how many replacements to do at most. See parameter
- count of re.sub().
+ *count* of ``re.sub()``.
"""
# if we got a string, compile it as a regular expression
if isinstance(old, str):
@@ -393,6 +408,7 @@
while not count or replaced < count:
if index > len(text):
break
+
match = old.search(text, index)
if not match:
# nothing left to replace
@@ -412,55 +428,57 @@
# an HTML comment or text in nowiki tags stands before the next
# valid match. Skip.
index = nextExceptionMatch.end()
+ continue
+
+ # We found a valid match. Replace it.
+ if callable(new):
+ # the parameter new can be a function which takes the match
+ # as a parameter.
+ replacement = new(match)
else:
- # We found a valid match. Replace it.
- if callable(new):
- # the parameter new can be a function which takes the match
- # as a parameter.
- replacement = new(match)
- else:
- # it is not a function, but a string.
+ # it is not a function, but a string.

- # it is a little hack to make \n work. It would be better
- # to fix it previously, but better than nothing.
- new = new.replace('\\n', '\n')
+ # it is a little hack to make \n work. It would be better
+ # to fix it previously, but better than nothing.
+ new = new.replace('\\n', '\n')

- # We cannot just insert the new string, as it may contain regex
- # group references such as \2 or \g<name>.
- # On the other hand, this approach does not work because it
- # can't handle lookahead or lookbehind (see bug T123185).
- # So we have to process the group references manually.
- replacement = ''
+ # We cannot just insert the new string, as it may contain regex
+ # group references such as \2 or \g<name>.
+ # On the other hand, this approach does not work because it
+ # can't handle lookahead or lookbehind (see bug T123185).
+ # So we have to process the group references manually.
+ replacement = ''

- group_regex = re.compile(r'\\(\d+)|\\g<(.+?)>')
- last = 0
- for group_match in group_regex.finditer(new):
- group_id = group_match[1] or group_match[2]
- with suppress(ValueError):
- group_id = int(group_id)
+ group_regex = re.compile(r'\\(\d+)|\\g<(.+?)>')
+ last = 0
+ for group_match in group_regex.finditer(new):
+ group_id = group_match[1] or group_match[2]
+ with suppress(ValueError):
+ group_id = int(group_id)

- try:
- replacement += new[last:group_match.start()]
- replacement += match[group_id] or ''
- except IndexError:
- raise IndexError('Invalid group reference: {}\n'
- 'Groups found: {}'
- .format(group_id, match.groups()))
- last = group_match.end()
- replacement += new[last:]
+ try:
+ replacement += new[last:group_match.start()]
+ replacement += match[group_id] or ''
+ except IndexError:
+ raise IndexError(f'Invalid group reference: {group_id}\n'
+ f'Groups found: {match.groups()}')
+ last = group_match.end()
+ replacement += new[last:]

- text = text[:match.start()] + replacement + text[match.end():]
+ text = text[:match.start()] + replacement + text[match.end():]

- # continue the search on the remaining text
- if allowoverlap:
- index = match.start() + 1
- else:
- index = match.start() + len(replacement)
- if not match.group():
- # When the regex allows to match nothing, shift by one char
- index += 1
- markerpos = match.start() + len(replacement)
- replaced += 1
+ # continue the search on the remaining text
+ if allowoverlap:
+ index = match.start() + 1
+ else:
+ index = match.start() + len(replacement)
+
+ if not match.group():
+ # When the regex allows to match nothing, shift by one char
+ index += 1
+
+ markerpos = match.start() + len(replacement)
+ replaced += 1

return text[:markerpos] + marker + text[markerpos:]


To view, visit change 920316. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Iaf8e7927f574c188cf2cfff78365c0883b66aea3
Gerrit-Change-Number: 920316
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged