jenkins-bot submitted this change.

View Change


Approvals: Matěj Suchánek: Looks good to me, approved; jenkins-bot: Verified
[IMPR] use inline re.IGNORECASE flag in textlib.case_escape function

- use inline re.IGNORECASE flag for the first letter of string argument
- add underscore parameter to detect interchangeable and collapsible
spaces/underscores in string
- use underscore parameter within scripts

Bug: T308265
Change-Id: I58df8260db97c45cde6e959ada7e5a8acc959d79
---
M pywikibot/textlib.py
M scripts/image.py
M scripts/delinker.py
3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 39f8d50..b8ac31f 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -163,19 +163,24 @@
return phrase


-def case_escape(case: str, string: str) -> str:
+def case_escape(case: str, string: str, *, underscore: bool = False) -> str:
"""Return an escaped regex pattern which depends on 'first-letter' case.

.. versionadded:: 7.0
+ .. versionchanged:: 8.4
+ Added the optional *underscore* parameter.

- :param case: if `case` is 'first-letter' the regex contains an
- upper/lower case set for the first letter
+ :param case: if `case` is 'first-letter', the regex contains an
+ inline re.IGNORECASE flag for the first letter
+ :param underscore: if True, expand the regex to detect spaces and
+ underscores which are interchangeable and collapsible
"""
- first = string[0]
- if first.isalpha() and case == 'first-letter':
- pattern = f'[{first.upper()}{first.lower()}]{re.escape(string[1:])}'
+ if case == 'first-letter':
+ pattern = f'(?i:{string[:1]}){re.escape(string[1:])}'
else:
pattern = re.escape(string)
+ if underscore:
+ pattern = re.sub(r'_|\\ ', '[_ ]+', pattern)
return pattern


@@ -1557,9 +1562,7 @@
return oldtext

# title might contain regex special characters
- title = case_escape(site.namespaces[14].case, title)
- # spaces and underscores in page titles are interchangeable and collapsible
- title = title.replace(r'\ ', '[ _]+').replace(r'\_', '[ _]+')
+ title = case_escape(site.namespaces[14].case, title, underscore=True)
categoryR = re.compile(r'\[\[\s*({})\s*:\s*{}[\s\u200e\u200f]*'
r'((?:\|[^]]+)?\]\])'
.format(catNamespace, title), re.I)
diff --git a/scripts/delinker.py b/scripts/delinker.py
index 6282cd6..4d3d0b4 100755
--- a/scripts/delinker.py
+++ b/scripts/delinker.py
@@ -100,9 +100,9 @@
"""Set page to current page and delink that page."""
# use image_regex from image.py
namespace = file_page.site.namespaces[6]
- escaped = case_escape(namespace.case, file_page.title(with_ns=False))
- # Be careful, spaces and _ have been converted to '\ ' and '\_'
- escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
+ escaped = case_escape(namespace.case,
+ file_page.title(with_ns=False),
+ underscore=True)
self.image_regex = re.compile(
r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
diff --git a/scripts/image.py b/scripts/image.py
index 1b5d05d..167fa78 100755
--- a/scripts/image.py
+++ b/scripts/image.py
@@ -85,10 +85,8 @@
param)

namespace = self.site.namespaces[6]
- escaped = case_escape(namespace.case, self.old_image)
+ escaped = case_escape(namespace.case, self.old_image, underscore=True)

- # Be careful, spaces and _ have been converted to '\ ' and '\_'
- escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
if not self.opt.loose or not self.new_image:
image_regex = re.compile(
r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'

To view, visit change 942668. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I58df8260db97c45cde6e959ada7e5a8acc959d79
Gerrit-Change-Number: 942668
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <dalangi-ctr@wikimedia.org>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged