jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/942668 )
Change subject: [IMPR] use inline re.IGNORECASE flag in textlib.case_escape function
......................................................................
[IMPR] use inline re.IGNORECASE flag in textlib.case_escape function
- use inline re.IGNORECASE flag for the first letter of string argument
- add underscore parameter to detect interchangeable and collapsible
spaces/underscores in string
- use underscore parameter within scripts
Bug: T308265
Change-Id: I58df8260db97c45cde6e959ada7e5a8acc959d79
---
M pywikibot/textlib.py
M scripts/image.py
M scripts/delinker.py
3 files changed, 31 insertions(+), 15 deletions(-)
Approvals:
Matěj Suchánek: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 39f8d50..b8ac31f 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -163,19 +163,24 @@
return phrase
-def case_escape(case: str, string: str) -> str:
+def case_escape(case: str, string: str, *, underscore: bool = False) -> str:
"""Return an escaped regex pattern which depends on 'first-letter' case.
.. versionadded:: 7.0
+ .. versionchanged:: 8.4
+ Added the optional *underscore* parameter.
- :param case: if `case` is 'first-letter' the regex contains an
- upper/lower case set for the first letter
+ :param case: if `case` is 'first-letter', the regex contains an
+ inline re.IGNORECASE flag for the first letter
+ :param underscore: if True, expand the regex to detect spaces and
+ underscores which are interchangeable and collapsible
"""
- first = string[0]
- if first.isalpha() and case == 'first-letter':
- pattern = f'[{first.upper()}{first.lower()}]{re.escape(string[1:])}'
+ if case == 'first-letter':
+ pattern = f'(?i:{string[:1]}){re.escape(string[1:])}'
else:
pattern = re.escape(string)
+ if underscore:
+ pattern = re.sub(r'_|\\ ', '[_ ]+', pattern)
return pattern
@@ -1557,9 +1562,7 @@
return oldtext
# title might contain regex special characters
- title = case_escape(site.namespaces[14].case, title)
- # spaces and underscores in page titles are interchangeable and collapsible
- title = title.replace(r'\ ', '[ _]+').replace(r'\_', '[ _]+')
+ title = case_escape(site.namespaces[14].case, title, underscore=True)
categoryR = re.compile(r'\[\[\s*({})\s*:\s*{}[\s\u200e\u200f]*'
r'((?:\|[^]]+)?\]\])'
.format(catNamespace, title), re.I)
diff --git a/scripts/delinker.py b/scripts/delinker.py
index 6282cd6..4d3d0b4 100755
--- a/scripts/delinker.py
+++ b/scripts/delinker.py
@@ -100,9 +100,9 @@
"""Set page to current page and delink that page."""
# use image_regex from image.py
namespace = file_page.site.namespaces[6]
- escaped = case_escape(namespace.case, file_page.title(with_ns=False))
- # Be careful, spaces and _ have been converted to '\ ' and '\_'
- escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
+ escaped = case_escape(namespace.case,
+ file_page.title(with_ns=False),
+ underscore=True)
self.image_regex = re.compile(
r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
diff --git a/scripts/image.py b/scripts/image.py
index 1b5d05d..167fa78 100755
--- a/scripts/image.py
+++ b/scripts/image.py
@@ -85,10 +85,8 @@
param)
namespace = self.site.namespaces[6]
- escaped = case_escape(namespace.case, self.old_image)
+ escaped = case_escape(namespace.case, self.old_image, underscore=True)
- # Be careful, spaces and _ have been converted to '\ ' and '\_'
- escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
if not self.opt.loose or not self.new_image:
image_regex = re.compile(
r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/942668
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I58df8260db97c45cde6e959ada7e5a8acc959d79
Gerrit-Change-Number: 942668
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <dalangi-ctr(a)wikimedia.org>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/942624 )
Change subject: [IMPR] use urllib.parse.unquote() for tools.chars.url2string() function
......................................................................
[IMPR] use urllib.parse.unquote() for tools.chars.url2string() function
Simplify tools.chars.url2string() function by using
urllib.parse.unquote() instead of urllib.parse.unquote_to_bytes and
encoding/decoding strings for it.
Change-Id: I49bf4fec45f6f67ddab75f7248b8b1a9eadc6d8a
---
M pywikibot/tools/chars.py
1 file changed, 30 insertions(+), 9 deletions(-)
Approvals:
Matěj Suchánek: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index c64c84d..47bfb5a 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -8,7 +8,7 @@
import sys
from contextlib import suppress
from typing import Union
-from urllib.parse import unquote_to_bytes
+from urllib.parse import unquote
from pywikibot.backports import Iterable
from pywikibot.tools._unidata import _category_cf
@@ -98,10 +98,22 @@
encodings: Union[str, Iterable[str]] = 'utf-8') -> str:
"""Convert URL-encoded text to unicode using several encoding.
- Uses the first encoding that doesn't cause an error.
+ Uses the first encoding that doesn't cause an error. Raises the
+ first exception if all encodings fail.
+
+ For a single *encodings* string this function is equivalent to
+ :samp:`urllib.parse.unquote(title, encodings, errors='strict')`
+
+ .. versionchanged:: 8.4
+ Ignore *LookupError* and try other encodings.
+
+ .. seealso:: :python:`urllib.parse.unquote
+ <library/urllib.parse.html#urllib.parse.unquote>`
**Example:**
+ >>> url2string('abc%20def')
+ 'abc def'
>>> url2string('/El%20Ni%C3%B1o/')
'/El Niño/'
>>> url2string('/El%20Ni%C3%B1o/', 'ascii')
@@ -118,19 +130,15 @@
:raise LookupError: unknown encoding
"""
if isinstance(encodings, str):
- encodings = [encodings]
+ return unquote(title, encodings, errors='strict')
first_exception = None
for enc in encodings:
try:
- t = title.encode(enc)
- t = unquote_to_bytes(t)
- result = t.decode(enc)
- except UnicodeError as e:
+ return unquote(title, enc, errors='strict')
+ except (UnicodeError, LookupError) as e:
if not first_exception:
first_exception = e
- else:
- return result
# Couldn't convert, raise the first exception
raise first_exception
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/942624
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I49bf4fec45f6f67ddab75f7248b8b1a9eadc6d8a
Gerrit-Change-Number: 942624
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/942603 )
Change subject: [IMPR] Convert URL-encoded characters also for links outside main namespace
......................................................................
[IMPR] Convert URL-encoded characters also for links outside main namespace
As found by T342470, the CosmeticChangesToolkit.cleanUpLinks() does not
convert URL-encoded characters outside main namespace or for interwiki
links. This patch solves this issue.
Bug: T342470
Change-Id: Ie9f8fc503df842ad45fe44eefc57449c0473cd29
---
M pywikibot/cosmetic_changes.py
1 file changed, 28 insertions(+), 12 deletions(-)
Approvals:
Meno25: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index ffd43a5..bf3e112 100644
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -501,32 +501,38 @@
"""Tidy up wikilinks found in a string.
This function will:
- * Replace underscores with spaces
+ * Replace underscores with spaces
* Move leading and trailing spaces out of the wikilink and into the
surrounding text
-
* Convert URL-encoded characters into Unicode-encoded characters
-
* Move trailing characters out of the link and make the link without
using a pipe, if possible
-
* Capitalize the article title of the link, if appropriate
+ .. versionchanged:: 8.4
+ Convert URL-encoded characters if a link is an interwiki link
+ or different from main namespace.
+
:param text: string to perform the clean-up on
:return: text with tidied wikilinks
"""
# helper function which works on one link and either returns it
# unmodified, or returns a replacement.
def handleOneLink(match: Match[str]) -> str:
- titleWithSection = match['titleWithSection']
+ # Convert URL-encoded characters to str
+ titleWithSection = url2string(match['titleWithSection'],
+ encodings=self.site.encodings())
label = match['label']
trailingChars = match['linktrail']
newline = match['newline']
+ # entire link but convert URL-encoded text
+ oldlink = url2string(match.group(),
+ encodings=self.site.encodings())
is_interwiki = self.site.isInterwikiLink(titleWithSection)
if is_interwiki:
- return match.group()
+ return oldlink
# The link looks like this:
# [[page_title|link_text]]trailing_chars
@@ -538,7 +544,7 @@
except InvalidTitleError:
in_main_namespace = False
if not in_main_namespace:
- return match.group()
+ return oldlink
# Replace underlines by spaces, also multiple underlines
titleWithSection = re.sub('_+', ' ', titleWithSection)
@@ -560,13 +566,9 @@
titleWithSection = titleWithSection.rstrip()
hadTrailingSpaces = len(titleWithSection) != titleLength
- # Convert URL-encoded characters to str
- titleWithSection = url2string(titleWithSection,
- encodings=self.site.encodings())
-
if not titleWithSection:
# just skip empty links.
- return match.group()
+ return match.group()
# Remove unnecessary initial and final spaces from label.
# Please note that some editors prefer spaces around pipes.
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/942603
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie9f8fc503df842ad45fe44eefc57449c0473cd29
Gerrit-Change-Number: 942603
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Meno25 <meno25mail(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/i18n/+/942454 )
Change subject: [i18n] Additional translations for checkimages
......................................................................
[i18n] Additional translations for checkimages
Change-Id: Ib92dd114119efbb3f8dcac5abed6d125ddda840f
---
M checkimages/en.json
M checkimages/qqq.json
2 files changed, 13 insertions(+), 0 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/checkimages/en.json b/checkimages/en.json
index 421142b..3431526 100644
--- a/checkimages/en.json
+++ b/checkimages/en.json
@@ -11,6 +11,8 @@
"checkimages-doubles-file-comment": "Bot: File already on Commons, may be deleted",
"checkimages-doubles-head": "Duplicate file",
"checkimages-doubles-talk-comment": "Bot: Notify that the file already exists on Commons",
+ "checkimages-forced-mode": "('''forced mode''')",
+ "checkimages-has-duplicates": "has the following duplicates%(force)s:",
"checkimages-log-comment": "Bot: Updating the log",
"checkimages-no-license-head": "Image without license",
"checkimages-source-tag-comment": "Bot: Marking newly uploaded untagged file",
diff --git a/checkimages/qqq.json b/checkimages/qqq.json
index 33e9e57..d324fca 100644
--- a/checkimages/qqq.json
+++ b/checkimages/qqq.json
@@ -10,6 +10,8 @@
"checkimages-doubles-file-comment": "Edit summary used by the bot while it reports a problem in the file page",
"checkimages-doubles-head": "Head of the report given to the uploader",
"checkimages-doubles-talk-comment": "Edit summary used by the bot while it reports the problem in the uploader's talk page",
+ "checkimages-forced-mode": "Report is generated in force mode",
+ "checkimages-has-duplicates": "Report that an image has several duplicates",
"checkimages-log-comment": "Edit summary for the checkimages' report",
"checkimages-no-license-head": "The header of a report if an image has no license",
"checkimages-source-tag-comment": "Edit summary for untagged user talk notice",
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/i18n/+/942454
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/i18n
Gerrit-Branch: master
Gerrit-Change-Id: Ib92dd114119efbb3f8dcac5abed6d125ddda840f
Gerrit-Change-Number: 942454
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged