jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920662 )
Change subject: [IMPR] Make textlib._get_regexes a public function ......................................................................
[IMPR] Make textlib._get_regexes a public function
- Make textlib._get_regexes a public function as get_regexes - keys parameter may be a plain str - site may be optional because it is only used for several keys - use plain str if only one key is used - remove site parameter if it is not used - remove unused site parameter of _extract_headings function - replace _get_regexes with get_regexes for all its usage - add _create_default_regexes to documentation because it is used in replace.py
Bug: T336144 Change-Id: Ic9f673e508228998b28375563027d6e9631f4e4a --- M pywikibot/textlib.py M pywikibot/cosmetic_changes.py M scripts/replace.py M scripts/category.py 4 files changed, 64 insertions(+), 23 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py index 003d670..9235462 100644 --- a/pywikibot/cosmetic_changes.py +++ b/pywikibot/cosmetic_changes.py @@ -51,7 +51,7 @@ 'your_script_name_2'] """ # -# (C) Pywikibot team, 2006-2022 +# (C) Pywikibot team, 2006-2023 # # Distributed under the terms of the MIT license. # @@ -68,7 +68,7 @@ from pywikibot.textlib import ( FILE_LINK_REGEX, MultiTemplateMatchBuilder, - _get_regexes, + get_regexes, ) from pywikibot.tools import first_lower, first_upper from pywikibot.tools.chars import url2string @@ -682,7 +682,7 @@ return text
skippings = ['comment', 'category'] - skip_regexes = _get_regexes(skippings, self.site) + skip_regexes = get_regexes(skippings, self.site) # site defined templates skip_templates = { 'cs': ('Pahýl[ _]část',), # stub section diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index ce63410..3c73b34 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -46,7 +46,7 @@ ETPType = List[Tuple[str, OrderedDictType[str, str]]]
# cache for replaceExcept to avoid recompile or regexes each call -_regex_cache = {} +_regex_cache: Dict[str, Pattern[str]] = {}
# The regex below collects nested templates, providing simpler # identification of templates used at the top-level of wikitext. @@ -253,7 +253,15 @@
def _create_default_regexes() -> None: - """Fill (and possibly overwrite) _regex_cache with default regexes.""" + """Fill (and possibly overwrite) ``_regex_cache`` with default regexes. + + The following keys are provided: ``category``, ``comment``, ``file``, + ``header``, ``hyperlink``, ``interwiki``, ``invoke``, ``link``, + ``pagelist``, ``property``, ``startcolon``, ``startspace``, ``table``, + ``template``. + + :meta public: + """ _regex_cache.update({ # categories 'category': (r'[[ *(?:%s)\s*:.*?]]', @@ -304,16 +312,29 @@ })
-def _get_regexes(keys: Iterable, site) -> List[Pattern[str]]: +def get_regexes( + keys: Union[str, Iterable[str]], + site: Optional['pywikibot.site.BaseSite'] = None +) -> List[Pattern[str]]: """Fetch compiled regexes.
- :meta public: + .. versionchanged:: 8.2 + ``_get_regexes`` becomes a public function. + *keys* may be a single string; *site* is optional. + + :param keys: a single key or an iterable of keys whose regex pattern + should be given + :param site: a BaseSite object needed for ``category``, ``file``, + ``interwiki``, ``invoke`` and ``property`` keys + :raises ValueError: site cannot be None. """ if not _regex_cache: _create_default_regexes()
- result = [] + if isinstance(keys, str): + keys = [keys]
+ result = [] for exc in keys: if not isinstance(exc, str): # assume it's a regular expression @@ -332,7 +353,7 @@ else: if not site and exc in ('interwiki', 'property', 'invoke', 'category', 'file'): - raise ValueError(f'Site cannot be None for the {exc!r} regex') + raise ValueError(f'site cannot be None for the {exc!r} regex')
if (exc, site) not in _regex_cache: re_text, re_var = _regex_cache[exc] @@ -396,7 +417,7 @@ if not old.search(text): return text + marker
- dontTouchRegexes = _get_regexes(exceptions, site) + dontTouchRegexes = get_regexes(exceptions, site)
index = 0 replaced = 0 @@ -500,7 +521,7 @@ if provided as an ordered collection (list, tuple)
:param tags: The exact set of parts which should be removed using - keywords from textlib._get_regexes(). + keywords from :func:`get_regexes`. :param include: Or, in alternative, default parts that shall not be removed. :param site: Site to be used for site-dependent regexes. Default @@ -518,7 +539,7 @@ # ("Note" at the end of the section) if include: tags = [tag for tag in tags if tag not in include] - regexes = _get_regexes(tags, site) + regexes = get_regexes(tags, site) for regex in regexes: text = regex.sub('', text) return text @@ -917,10 +938,10 @@ _Content = namedtuple('_Content', ('header', 'sections', 'footer'))
-def _extract_headings(text: str, site) -> list: +def _extract_headings(text: str) -> list: """Return _Heading objects.""" headings = [] - heading_regex = _get_regexes(['header'], site)[0] + heading_regex = get_regexes('header')[0] for match in heading_regex.finditer(text): start, end = match.span() if not isDisabled(text, start) and not isDisabled(text, end): @@ -981,11 +1002,11 @@
.. versionadded:: 3.0 """ - headings = _extract_headings(text, site) + headings = _extract_headings(text) sections = _extract_sections(text, headings) # Find header and footer contents header = text[:headings[0].start] if headings else text - cat_regex, interwiki_regex = _get_regexes(('category', 'interwiki'), site) + cat_regex, interwiki_regex = get_regexes(['category', 'interwiki'], site) langlink_pattern = interwiki_regex.pattern.replace(':?', '') last_section_content = sections[-1].content if sections else header footer = re.search( @@ -1251,7 +1272,7 @@ above_interwiki.append(comment)
if above_interwiki: - interwiki = _get_regexes(['interwiki'], site)[0] + interwiki = get_regexes('interwiki', site)[0] first_interwiki = interwiki.search(newtext) for reg in above_interwiki: special = reg.search(newtext) @@ -1565,7 +1586,7 @@ under_categories.append(stub)
if under_categories: - category = _get_regexes(['category'], site)[0] + category = get_regexes('category', site)[0] for last_category in category.finditer(newtext): pass for reg in under_categories: diff --git a/scripts/category.py b/scripts/category.py index 6df0aea..4c3a7a9 100755 --- a/scripts/category.py +++ b/scripts/category.py @@ -143,7 +143,7 @@ :mod:`pagegenerators` are supported with "move" and "remove" action. """ # -# (C) Pywikibot team, 2004-2022 +# (C) Pywikibot team, 2004-2023 # # Distributed under the terms of the MIT license. # @@ -1184,7 +1184,7 @@ # skip initial templates, images and comments for articles. if member.namespace() == member.site.namespaces.MAIN: excludes = ('template', 'file', 'comment') - regexes = textlib._get_regexes(excludes, member.site) + regexes = textlib.get_regexes(excludes, member.site) i = 0 while i < 3: i = 0 diff --git a/scripts/replace.py b/scripts/replace.py index 2a19090..b9d2b66 100755 --- a/scripts/replace.py +++ b/scripts/replace.py @@ -142,7 +142,7 @@ the top of the help. """ # -# (C) Pywikibot team, 2004-2022 +# (C) Pywikibot team, 2004-2023 # # Distributed under the terms of the MIT license. # @@ -504,8 +504,8 @@ regular expressions. inside-tags A list of strings. These strings must be keys from the - dictionary in textlib._create_default_regexes() or must be - accepted by textlib._get_regexes(). + dictionary in :func:`textlib._create_default_regexes` or must be + accepted by :func:`textlib.get_regexes`.
:keyword allowoverlap: when matches overlap, all of them are replaced. :type allowoverlap: bool