jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/674613 )
Change subject: [bugfix] Avoid duplicate reference names
......................................................................
[bugfix] Avoid duplicate reference names
The old implementation failed if there were autogenerated references
not starting with 1. Now an iterator is implemented which creates the
next unused number (up to 999, but this should be enough).
Other improvements:
- always use double quotes with references
- modify pattern to find single quotes in reference names
- remove regex to clean title (\s includes [\n\r\t])
- rename self.NAMES.match(params) to found
- use removeDisabledPart function only once
Bug: T278040
Change-Id: Ie082dea4334e62f6818a208b86704b3a8afcd0ad
---
M scripts/reflinks.py
1 file changed, 25 insertions(+), 21 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index f21b808..205f5c8 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -47,6 +47,7 @@
import subprocess
import tempfile
+from contextlib import suppress
from functools import partial
from textwrap import shorten
from urllib.error import URLError
@@ -56,6 +57,7 @@
import pywikibot
from pywikibot import comms, i18n, pagegenerators, textlib
+from pywikibot.backports import removeprefix
from pywikibot.bot import ExistingPageBot, NoRedirectPageBot, SingleSiteBot
from pywikibot import config2 as config
from pywikibot.pagegenerators import (
@@ -233,7 +235,6 @@
self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
# remove \n and \r and unicode spaces from titles
self.title = re.sub(r'\s', ' ', self.title)
- self.title = re.sub(r'[\n\r\t]', ' ', self.title)
# remove extra whitespaces
# remove leading and trailing ./;/,/-/_/+/ /
self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_
'))
@@ -285,10 +286,9 @@
# Match references
self.REFS = re.compile(
r'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
- self.NAMES = re.compile(
-
r'(?i).*name\s*=\s*(?P<quote>"?)\s*(?P<name>.+)\s*(?P=quote).*')
- self.GROUPS = re.compile(
-
r'(?i).*group\s*=\s*(?P<quote>"?)\s*(?P<group>.+)\s*(?P=quote).*')
+ fmt =
r'(?i).*{0}\s*=\s*(?P<quote>["\']?)\s*(?P<{0}>.+)\s*(?P=quote).*'
+ self.NAMES = re.compile(fmt.format('name'))
+ self.GROUPS = re.compile(fmt.format('group'))
self.autogen = i18n.twtranslate(site, 'reflinks-autogen')
def process(self, text):
@@ -320,10 +320,10 @@
else:
v = [None, [match.group()], False, False]
- name = self.NAMES.match(params)
- if name:
- quoted = name.group('quote') == '"'
- name = name.group('name')
+ found = self.NAMES.match(params)
+ if found:
+ quoted = found.group('quote') in ['"',
"'"]
+ name = found.group('name')
if v[0]:
if v[0] != name:
named_repl[name] = [v[0], v[2]]
@@ -344,14 +344,20 @@
found_ref_names[name] = 1
groupdict[content] = v
- id_ = 1
- while self.autogen + str(id_) in found_ref_names:
- id_ += 1
+ used_numbers = set()
+ for name in found_ref_names:
+ number = removeprefix(name, self.autogen)
+ with suppress(ValueError):
+ used_numbers.add(int(number))
+
+ # iterator to give the next free number
+ free_number = iter({str(i) for i in range(1, 1000) # should be enough
+ if i not in used_numbers})
for (g, d) in found_refs.items():
group = ''
if g:
- group = 'group=\"{}\" '.format(group)
+ group = 'group="{}" '.format(group)
for (k, v) in d.items():
if len(v[1]) == 1 and not v[3]:
@@ -359,10 +365,9 @@
name = v[0]
if not name:
- name = '"{}{}"'.format(self.autogen, id_)
- id_ += 1
+ name = '"{}{}"'.format(self.autogen,
next(free_number))
elif v[2]:
- name = '{!r}'.format(name)
+ name = '"{}"'.format(name)
named = '<ref {}name={}>{}</ref>'.format(group, name,
k)
text = text.replace(v[1][0], named, 1)
@@ -384,10 +389,10 @@
# TODO : Support ref groups
name = v[0]
if v[1]:
- name = '{!r}'.format(name)
+ name = '"{}"'.format(name)
text = re.sub(
- '<ref
name\\s*=\\s*(?P<quote>"?)\\s*{}\\s*(?P=quote)\\s*/>'
+ r'<ref
name\s*=\s*(?P<quote>["\']?)\s*{}\s*(?P=quote)\s*/>'
.format(k),
'<ref name={} />'.format(name), text)
return text
@@ -518,10 +523,9 @@
"""Process one page."""
# Load the page's text from the wiki
new_text = page.text
-
+ raw_text = textlib.removeDisabledParts(new_text)
# for each link to change
- for match in linksInRef.finditer(
- textlib.removeDisabledParts(page.get())):
+ for match in linksInRef.finditer(raw_text):
link = match.group('url')
if 'jstor.org' in link:
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/674613
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie082dea4334e62f6818a208b86704b3a8afcd0ad
Gerrit-Change-Number: 674613
Gerrit-PatchSet: 7
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Rubin <rubin.happy(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged