jenkins-bot submitted this change.

View Change

Approvals: Rubin: Looks good to me, but someone else must approve; Xqt: Looks good to me, approved; jenkins-bot: Verified
[bugfix] Don't ignore identical references with newline in ref content

If a line feed was inside the ref content the processing for this
reference was skipped because the regex wasn't able to detect this
reference. Fix this issue with the following steps:

- use DOTALL flag with self.REFS regex
- remove the catch-all start and end patterns of the self.NAMES and
self.GROUPS regexes and use re.search instead of re.match later
- use a set for found_ref_names instead of a dict. The only value assigned
to the dict was 1.
- don't use None as found_refs key but an empty string
- introduce an IntEnum for named index constants for the reference
values; this improves readability
- remove the debugging output for the 'population' name
- rename variables for better readability
- add some comments

Bug: T286369
Change-Id: I47f81dcc9537591efea7bbb4628c55af0789cc14
---
M scripts/reflinks.py
1 file changed, 48 insertions(+), 38 deletions(-)

diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 0fd5da7..c753dc0 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -49,6 +49,7 @@
import tempfile

from contextlib import suppress
+from enum import IntEnum
from functools import partial
from http import HTTPStatus
from textwrap import shorten
@@ -274,6 +275,16 @@
self.title = self.title.title()


+class IX(IntEnum):
+
+ """Index class for references data."""
+
+ name = 0
+ reflist = 1
+ quoted = 2
+ change_needed = 3
+
+
class DuplicateReferences:

"""Helper to de-duplicate references in text.
@@ -289,8 +300,8 @@

# Match references
self.REFS = re.compile(
- r'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
- fmt = r'(?i).*{0}\s*=\s*(?P<quote>["\']?)\s*(?P<{0}>.+)\s*(?P=quote).*'
+ r'(?is)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
+ fmt = r'(?i){0}\s*=\s*(?P<quote>["\']?)\s*(?P<{0}>.+)\s*(?P=quote)'
self.NAMES = re.compile(fmt.format('name'))
self.GROUPS = re.compile(fmt.format('group'))
self.autogen = i18n.twtranslate(site, 'reflinks-autogen')
@@ -303,101 +314,100 @@
# values are [name, [list of full ref matches],
# quoted, need_to_change]
found_refs = {}
- found_ref_names = {}
+ found_ref_names = set()
# Replace key by [value, quoted]
named_repl = {}

+ # Parse references
for match in self.REFS.finditer(text):
content = match.group('content')
if not content.strip():
continue

params = match.group('params')
- group = self.GROUPS.match(params)
+ group = self.GROUPS.search(params) or ''
if group not in found_refs:
found_refs[group] = {}

groupdict = found_refs[group]
if content in groupdict:
v = groupdict[content]
- v[1].append(match.group())
+ v[IX.reflist].append(match.group())
else:
v = [None, [match.group()], False, False]

- found = self.NAMES.match(params)
+ found = self.NAMES.search(params)
if found:
quoted = found.group('quote') in ['"', "'"]
name = found.group('name')
- if v[0]:
- if v[0] != name:
- named_repl[name] = [v[0], v[2]]
- else:
+
+ if not v[IX.name]:
# First name associated with this content
- if name == 'population':
- pywikibot.output(content)
if name not in found_ref_names:
# first time ever we meet this name
- if name == 'population':
- pywikibot.output('in')
- v[2] = quoted
- v[0] = name
+ v[IX.quoted] = quoted
+ v[IX.name] = name
else:
# if has_key, means that this name is used
# with another content. We'll need to change it
- v[3] = True
- found_ref_names[name] = 1
+ v[IX.change_needed] = True
+ elif v[IX.name] != name:
+ named_repl[name] = [v[IX.name], v[IX.quoted]]
+
+ found_ref_names.add(name)
groupdict[content] = v

+ # Find used autogenerated numbers
used_numbers = set()
for name in found_ref_names:
number = removeprefix(name, self.autogen)
with suppress(ValueError):
used_numbers.add(int(number))

- # generator to give the next free number
+ # generator to give the next free number for autogenerating names
free_number = (str(i) for i in itertools.count(start=1)
if i not in used_numbers)

- for (g, d) in found_refs.items():
- group = ''
- if g:
- group = 'group="{}" '.format(group)
+ # Fix references
+ for groupname, references in found_refs.items():
+ group = 'group="{}" '.format(group) if groupname else ''

- for (k, v) in d.items():
- if len(v[1]) == 1 and not v[3]:
+ for ref, v in references.items():
+ if len(v[IX.reflist]) == 1 and not v[IX.change_needed]:
continue

- name = v[0]
+ name = v[IX.name]
if not name:
name = '"{}{}"'.format(self.autogen, next(free_number))
- elif v[2]:
+ elif v[IX.quoted]:
name = '"{}"'.format(name)

- named = '<ref {}name={}>{}</ref>'.format(group, name, k)
- text = text.replace(v[1][0], named, 1)
+ named = '<ref {}name={}>{}</ref>'.format(group, name, ref)
+ text = text.replace(v[IX.reflist][0], named, 1)

- # make sure that the first (named ref) is not
- # removed later :
+ # make sure that the first (named ref) is not removed later
pos = text.index(named) + len(named)
header = text[:pos]
end = text[pos:]

- unnamed = '<ref {}name={} />'.format(group, name)
- for ref in v[1][1:]:
+ # replace multiple identical references with repeated ref
+ repeated_ref = '<ref {}name={} />'.format(group, name)
+ for ref in v[IX.reflist][1:]:
# Don't replace inside templates (T266411)
- end = replaceExcept(end, re.escape(ref), unnamed,
+ end = replaceExcept(end, re.escape(ref), repeated_ref,
exceptions=['template'])
text = header + end

- for (k, v) in named_repl.items():
+ # Fix references with different names
+ for ref, v in named_repl.items():
# TODO : Support ref groups
- name = v[0]
- if v[1]:
+ name = v[IX.name]
+ if v[IX.reflist]:
name = '"{}"'.format(name)

text = re.sub(
r'<ref name\s*=\s*(?P<quote>["\']?)\s*{}\s*(?P=quote)\s*/>'
- .format(k),
+ .format(ref),
'<ref name={} />'.format(name), text)
return text


To view, visit change 703859. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I47f81dcc9537591efea7bbb4628c55af0789cc14
Gerrit-Change-Number: 703859
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Rubin <rubin.happy@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged