jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/703859 )
Change subject: [bugfix] Don't ignore identical references with newline in ref content ......................................................................
[bugfix] Don't ignore identical references with newline in ref content
If a line feed was inside the ref content, the processing of this reference was skipped because the regex was not able to detect the reference. Fix this issue with the following steps:
- use the DOTALL flag with the self.REFS regex
- remove the catch-all start and end patterns of the self.NAMES and self.GROUPS regexes and use re.search instead of re.match later
- use a set for found_ref_names instead of a dict; the only value ever assigned to the dict was 1
- don't use None as a found_refs key but an empty string
- introduce an IntEnum with named index constants for the reference values; this improves readability
- remove the debugging code for the 'population' name
- rename variables for better readability
- add some comments
Bug: T286369 Change-Id: I47f81dcc9537591efea7bbb4628c55af0789cc14 --- M scripts/reflinks.py 1 file changed, 48 insertions(+), 38 deletions(-)
Approvals: Rubin: Looks good to me, but someone else must approve Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py index 0fd5da7..c753dc0 100755 --- a/scripts/reflinks.py +++ b/scripts/reflinks.py @@ -49,6 +49,7 @@ import tempfile
from contextlib import suppress +from enum import IntEnum from functools import partial from http import HTTPStatus from textwrap import shorten @@ -274,6 +275,16 @@ self.title = self.title.title()
+class IX(IntEnum): + + """Index class for references data.""" + + name = 0 + reflist = 1 + quoted = 2 + change_needed = 3 + + class DuplicateReferences:
"""Helper to de-duplicate references in text. @@ -289,8 +300,8 @@
# Match references self.REFS = re.compile( - r'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') - fmt = r'(?i).*{0}\s*=\s*(?P<quote>["']?)\s*(?P<{0}>.+)\s*(?P=quote).*' + r'(?is)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>') + fmt = r'(?i){0}\s*=\s*(?P<quote>["']?)\s*(?P<{0}>.+)\s*(?P=quote)' self.NAMES = re.compile(fmt.format('name')) self.GROUPS = re.compile(fmt.format('group')) self.autogen = i18n.twtranslate(site, 'reflinks-autogen') @@ -303,101 +314,100 @@ # values are [name, [list of full ref matches], # quoted, need_to_change] found_refs = {} - found_ref_names = {} + found_ref_names = set() # Replace key by [value, quoted] named_repl = {}
+ # Parse references for match in self.REFS.finditer(text): content = match.group('content') if not content.strip(): continue
params = match.group('params') - group = self.GROUPS.match(params) + group = self.GROUPS.search(params) or '' if group not in found_refs: found_refs[group] = {}
groupdict = found_refs[group] if content in groupdict: v = groupdict[content] - v[1].append(match.group()) + v[IX.reflist].append(match.group()) else: v = [None, [match.group()], False, False]
- found = self.NAMES.match(params) + found = self.NAMES.search(params) if found: quoted = found.group('quote') in ['"', "'"] name = found.group('name') - if v[0]: - if v[0] != name: - named_repl[name] = [v[0], v[2]] - else: + + if not v[IX.name]: # First name associated with this content - if name == 'population': - pywikibot.output(content) if name not in found_ref_names: # first time ever we meet this name - if name == 'population': - pywikibot.output('in') - v[2] = quoted - v[0] = name + v[IX.quoted] = quoted + v[IX.name] = name else: # if has_key, means that this name is used # with another content. We'll need to change it - v[3] = True - found_ref_names[name] = 1 + v[IX.change_needed] = True + elif v[IX.name] != name: + named_repl[name] = [v[IX.name], v[IX.quoted]] + + found_ref_names.add(name) groupdict[content] = v
+ # Find used autogenerated numbers used_numbers = set() for name in found_ref_names: number = removeprefix(name, self.autogen) with suppress(ValueError): used_numbers.add(int(number))
- # generator to give the next free number + # generator to give the next free number for autogenerating names free_number = (str(i) for i in itertools.count(start=1) if i not in used_numbers)
- for (g, d) in found_refs.items(): - group = '' - if g: - group = 'group="{}" '.format(group) + # Fix references + for groupname, references in found_refs.items(): + group = 'group="{}" '.format(group) if groupname else ''
- for (k, v) in d.items(): - if len(v[1]) == 1 and not v[3]: + for ref, v in references.items(): + if len(v[IX.reflist]) == 1 and not v[IX.change_needed]: continue
- name = v[0] + name = v[IX.name] if not name: name = '"{}{}"'.format(self.autogen, next(free_number)) - elif v[2]: + elif v[IX.quoted]: name = '"{}"'.format(name)
- named = '<ref {}name={}>{}</ref>'.format(group, name, k) - text = text.replace(v[1][0], named, 1) + named = '<ref {}name={}>{}</ref>'.format(group, name, ref) + text = text.replace(v[IX.reflist][0], named, 1)
- # make sure that the first (named ref) is not - # removed later : + # make sure that the first (named ref) is not removed later pos = text.index(named) + len(named) header = text[:pos] end = text[pos:]
- unnamed = '<ref {}name={} />'.format(group, name) - for ref in v[1][1:]: + # replace multiple identical references with repeated ref + repeated_ref = '<ref {}name={} />'.format(group, name) + for ref in v[IX.reflist][1:]: # Don't replace inside templates (T266411) - end = replaceExcept(end, re.escape(ref), unnamed, + end = replaceExcept(end, re.escape(ref), repeated_ref, exceptions=['template']) text = header + end
- for (k, v) in named_repl.items(): + # Fix references with different names + for ref, v in named_repl.items(): # TODO : Support ref groups - name = v[0] - if v[1]: + name = v[IX.name] + if v[IX.reflist]: name = '"{}"'.format(name)
text = re.sub( r'<ref name\s*=\s*(?P<quote>["']?)\s*{}\s*(?P=quote)\s*/>' - .format(k), + .format(ref), '<ref name={} />'.format(name), text) return text
pywikibot-commits@lists.wikimedia.org