jenkins-bot submitted this change.

View Change


Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR] Make textlib._get_regexes a public function

- Make textlib._get_regexes a public function as get_regexes
- keys parameter may be a plain str
- site may be optional because it is only used for several keys
- use plain str if only one key is used
- remove site parameter if it is not used
- remove unused site parameter of _extract_headings function
- replace _get_regexes with get_regexes for all its usage
- add _create_default_regexes to documentation because it is used in
replace.py

Bug: T336144
Change-Id: Ic9f673e508228998b28375563027d6e9631f4e4a
---
M pywikibot/textlib.py
M pywikibot/cosmetic_changes.py
M scripts/replace.py
M scripts/category.py
4 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index 003d670..9235462 100644
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -51,7 +51,7 @@
'your_script_name_2']
"""
#
-# (C) Pywikibot team, 2006-2022
+# (C) Pywikibot team, 2006-2023
#
# Distributed under the terms of the MIT license.
#
@@ -68,7 +68,7 @@
from pywikibot.textlib import (
FILE_LINK_REGEX,
MultiTemplateMatchBuilder,
- _get_regexes,
+ get_regexes,
)
from pywikibot.tools import first_lower, first_upper
from pywikibot.tools.chars import url2string
@@ -682,7 +682,7 @@
return text

skippings = ['comment', 'category']
- skip_regexes = _get_regexes(skippings, self.site)
+ skip_regexes = get_regexes(skippings, self.site)
# site defined templates
skip_templates = {
'cs': ('Pahýl[ _]část',), # stub section
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index ce63410..3c73b34 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -46,7 +46,7 @@
ETPType = List[Tuple[str, OrderedDictType[str, str]]]

# cache for replaceExcept to avoid recompile or regexes each call
-_regex_cache = {}
+_regex_cache: Dict[str, Pattern[str]] = {}

# The regex below collects nested templates, providing simpler
# identification of templates used at the top-level of wikitext.
@@ -253,7 +253,15 @@


def _create_default_regexes() -> None:
- """Fill (and possibly overwrite) _regex_cache with default regexes."""
+ """Fill (and possibly overwrite) ``_regex_cache`` with default regexes.
+
+ The following keys are provided: ``category``, ``comment``, ``file``,
+ ``header``, ``hyperlink``, ``interwiki``, ``invoke``, ``link``,
+ ``pagelist``, ``property``, ``startcolon``, ``startspace``, ``table``,
+ ``template``.
+
+ :meta public:
+ """
_regex_cache.update({
# categories
'category': (r'\[\[ *(?:%s)\s*:.*?\]\]',
@@ -304,16 +312,29 @@
})


-def _get_regexes(keys: Iterable, site) -> List[Pattern[str]]:
+def get_regexes(
+ keys: Union[str, Iterable[str]],
+ site: Optional['pywikibot.site.BaseSite'] = None
+) -> List[Pattern[str]]:
"""Fetch compiled regexes.

- :meta public:
+ .. versionchanged:: 8.2
+ ``_get_regexes`` becomes a public function.
+ *keys* may be a single string; *site* is optional.
+
+ :param keys: a single key or an iterable of keys whose regex pattern
+ should be given
+ :param site: a BaseSite object needed for ``category``, ``file``,
+ ``interwiki``, ``invoke`` and ``property`` keys
+ :raises ValueError: site cannot be None.
"""
if not _regex_cache:
_create_default_regexes()

- result = []
+ if isinstance(keys, str):
+ keys = [keys]

+ result = []
for exc in keys:
if not isinstance(exc, str):
# assume it's a regular expression
@@ -332,7 +353,7 @@
else:
if not site and exc in ('interwiki', 'property', 'invoke',
'category', 'file'):
- raise ValueError(f'Site cannot be None for the {exc!r} regex')
+ raise ValueError(f'site cannot be None for the {exc!r} regex')

if (exc, site) not in _regex_cache:
re_text, re_var = _regex_cache[exc]
@@ -396,7 +417,7 @@
if not old.search(text):
return text + marker

- dontTouchRegexes = _get_regexes(exceptions, site)
+ dontTouchRegexes = get_regexes(exceptions, site)

index = 0
replaced = 0
@@ -500,7 +521,7 @@
if provided as an ordered collection (list, tuple)

:param tags: The exact set of parts which should be removed using
- keywords from textlib._get_regexes().
+ keywords from :func:`get_regexes`.
:param include: Or, in alternative, default parts that shall not
be removed.
:param site: Site to be used for site-dependent regexes. Default
@@ -518,7 +539,7 @@
# ("Note" at the end of the section)
if include:
tags = [tag for tag in tags if tag not in include]
- regexes = _get_regexes(tags, site)
+ regexes = get_regexes(tags, site)
for regex in regexes:
text = regex.sub('', text)
return text
@@ -917,10 +938,10 @@
_Content = namedtuple('_Content', ('header', 'sections', 'footer'))


-def _extract_headings(text: str, site) -> list:
+def _extract_headings(text: str) -> list:
"""Return _Heading objects."""
headings = []
- heading_regex = _get_regexes(['header'], site)[0]
+ heading_regex = get_regexes('header')[0]
for match in heading_regex.finditer(text):
start, end = match.span()
if not isDisabled(text, start) and not isDisabled(text, end):
@@ -981,11 +1002,11 @@

.. versionadded:: 3.0
"""
- headings = _extract_headings(text, site)
+ headings = _extract_headings(text)
sections = _extract_sections(text, headings)
# Find header and footer contents
header = text[:headings[0].start] if headings else text
- cat_regex, interwiki_regex = _get_regexes(('category', 'interwiki'), site)
+ cat_regex, interwiki_regex = get_regexes(['category', 'interwiki'], site)
langlink_pattern = interwiki_regex.pattern.replace(':?', '')
last_section_content = sections[-1].content if sections else header
footer = re.search(
@@ -1251,7 +1272,7 @@
above_interwiki.append(comment)

if above_interwiki:
- interwiki = _get_regexes(['interwiki'], site)[0]
+ interwiki = get_regexes('interwiki', site)[0]
first_interwiki = interwiki.search(newtext)
for reg in above_interwiki:
special = reg.search(newtext)
@@ -1565,7 +1586,7 @@
under_categories.append(stub)

if under_categories:
- category = _get_regexes(['category'], site)[0]
+ category = get_regexes('category', site)[0]
for last_category in category.finditer(newtext):
pass
for reg in under_categories:
diff --git a/scripts/category.py b/scripts/category.py
index 6df0aea..4c3a7a9 100755
--- a/scripts/category.py
+++ b/scripts/category.py
@@ -143,7 +143,7 @@
:mod:`pagegenerators` are supported with "move" and "remove" action.
"""
#
-# (C) Pywikibot team, 2004-2022
+# (C) Pywikibot team, 2004-2023
#
# Distributed under the terms of the MIT license.
#
@@ -1184,7 +1184,7 @@
# skip initial templates, images and comments for articles.
if member.namespace() == member.site.namespaces.MAIN:
excludes = ('template', 'file', 'comment')
- regexes = textlib._get_regexes(excludes, member.site)
+ regexes = textlib.get_regexes(excludes, member.site)
i = 0
while i < 3:
i = 0
diff --git a/scripts/replace.py b/scripts/replace.py
index 2a19090..b9d2b66 100755
--- a/scripts/replace.py
+++ b/scripts/replace.py
@@ -142,7 +142,7 @@
the top of the help.
"""
#
-# (C) Pywikibot team, 2004-2022
+# (C) Pywikibot team, 2004-2023
#
# Distributed under the terms of the MIT license.
#
@@ -504,8 +504,8 @@
regular expressions.
inside-tags
A list of strings. These strings must be keys from the
- dictionary in textlib._create_default_regexes() or must be
- accepted by textlib._get_regexes().
+ dictionary in :func:`textlib._create_default_regexes` or must be
+ accepted by :func:`textlib.get_regexes`.

:keyword allowoverlap: when matches overlap, all of them are replaced.
:type allowoverlap: bool

To view, visit change 920662. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ic9f673e508228998b28375563027d6e9631f4e4a
Gerrit-Change-Number: 920662
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-MessageType: merged