[Gerrit] ...core[master]: [bugfix] Avoid dupliate reference names - Pywikibot-commits

25 Mar 2021

jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/674613 )
Change subject: [bugfix] Avoid dupliate reference names
......................................................................
[bugfix] Avoid dupliate reference names
The old implementation failed if there were autogenerated references
not starting with 1. Now an iterator was implemented which creates the
next unused number (upto 999 but this should be enough)
Other improvements:
- always use double quotes with references
- modify pattern to find single quotes in reference names
- remove regext o clean title (\s includes [\n\r\t])
- rename self.NAMES.match(params) to found
- use removeDisabledPart function only once
Bug: T278040
Change-Id: Ie082dea4334e62f6818a208b86704b3a8afcd0ad
---
M scripts/reflinks.py
1 file changed, 25 insertions(+), 21 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index f21b808..205f5c8 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -47,6 +47,7 @@
 import subprocess
 import tempfile
+from contextlib import suppress
 from functools import partial
 from textwrap import shorten
 from urllib.error import URLError
@@ -56,6 +57,7 @@
 import pywikibot
from pywikibot import comms, i18n, pagegenerators, textlib
+from pywikibot.backports import removeprefix
 from pywikibot.bot import ExistingPageBot, NoRedirectPageBot, SingleSiteBot
 from pywikibot import config2 as config
 from pywikibot.pagegenerators import (
@@ -233,7 +235,6 @@
         self.title = re.sub(r'[.+-=]{4,}', ' ', self.title)
         # remove \n and \r and unicode spaces from titles
         self.title = re.sub(r'\s', ' ', self.title)
-        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
         # remove extra whitespaces
         # remove leading and trailing ./;/,/-/_/+/ /
         self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))
@@ -285,10 +286,9 @@
         # Match references
         self.REFS = re.compile(
             r'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
-        self.NAMES = re.compile(
-            r'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
-        self.GROUPS = re.compile(
-            r'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
+        fmt = r'(?i).*{0}\s*=\s*(?P<quote>["']?)\s*(?P<{0}>.+)\s*(?P=quote).*'
+        self.NAMES = re.compile(fmt.format('name'))
+        self.GROUPS = re.compile(fmt.format('group'))
         self.autogen = i18n.twtranslate(site, 'reflinks-autogen')
def process(self, text):
@@ -320,10 +320,10 @@
             else:
                 v = [None, [match.group()], False, False]
-            name = self.NAMES.match(params)
-            if name:
-                quoted = name.group('quote') == '"'
-                name = name.group('name')
+            found = self.NAMES.match(params)
+            if found:
+                quoted = found.group('quote') in ['"', "'"]
+                name = found.group('name')
                 if v[0]:
                     if v[0] != name:
                         named_repl[name] = [v[0], v[2]]
@@ -344,14 +344,20 @@
                 found_ref_names[name] = 1
             groupdict[content] = v
-        id_ = 1
-        while self.autogen + str(id_) in found_ref_names:
-            id_ += 1
+        used_numbers = set()
+        for name in found_ref_names:
+            number = removeprefix(name, self.autogen)
+            with suppress(ValueError):
+                used_numbers.add(int(number))
+
+        # iterator to give the next free number
+        free_number = iter({str(i) for i in range(1, 1000)  # should be enough
+                            if i not in used_numbers})
for (g, d) in found_refs.items():
             group = ''
             if g:
-                group = 'group="{}" '.format(group)
+                group = 'group="{}" '.format(group)
for (k, v) in d.items():
                 if len(v[1]) == 1 and not v[3]:
@@ -359,10 +365,9 @@
name = v[0]
                 if not name:
-                    name = '"{}{}"'.format(self.autogen, id_)
-                    id_ += 1
+                    name = '"{}{}"'.format(self.autogen, next(free_number))
                 elif v[2]:
-                    name = '{!r}'.format(name)
+                    name = '"{}"'.format(name)
named = '<ref {}name={}>{}</ref>'.format(group, name, k)
                 text = text.replace(v[1][0], named, 1)
@@ -384,10 +389,10 @@
             # TODO : Support ref groups
             name = v[0]
             if v[1]:
-                name = '{!r}'.format(name)
+                name = '"{}"'.format(name)
text = re.sub(
-                '<ref name\s*=\s*(?P<quote>"?)\s*{}\s*(?P=quote)\s*/>'
+                r'<ref name\s*=\s*(?P<quote>["']?)\s*{}\s*(?P=quote)\s*/>'
                 .format(k),
                 '<ref name={} />'.format(name), text)
         return text
@@ -518,10 +523,9 @@
         """Process one page."""
         # Load the page's text from the wiki
         new_text = page.text
-
+        raw_text = textlib.removeDisabledParts(new_text)
         # for each link to change
-        for match in linksInRef.finditer(
-                textlib.removeDisabledParts(page.get())):
+        for match in linksInRef.finditer(raw_text):
link = match.group('url')
             if 'jstor.org' in link:
-- 
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/674613
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie082dea4334e62f6818a208b86704b3a8afcd0ad
Gerrit-Change-Number: 674613
Gerrit-PatchSet: 7
Gerrit-Owner: Xqt info@gno.de
Gerrit-Reviewer: D3r1ck01 xsavitar.wiki@aol.com
Gerrit-Reviewer: Rubin rubin.happy@gmail.com
Gerrit-Reviewer: Xqt info@gno.de
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged