jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
[IMPR] move character-based functions from page to tools.chars

unicode2html and url2unicode belong to page titles but are also
independent of other framework parts. Therefore move them to the
module dealing with characters and strings. The page module is too
big and complex already.

- move page.UnicodeToAsciiHtml to tools.chars.string_to_ascii_html
- move page.unicode2html to tools.chars.string2html
- move page.url2unicode to tools.chars.url2string
- deprecate page functions and their direct import from pywikibot
- deprecate pywikibot.site.BaseSite as encodings parameter in url2unicode
function; BaseSite.encodings should be used instead
- update usage of unicode2html in reflinks.py
- update usage of url2unicode in cosmetic_changes.py

Change-Id: Ifab5ab13bf18a2693c014b3d0beeee59e36e0c2a
---
M pywikibot/__init__.py
M pywikibot/cosmetic_changes.py
M pywikibot/page/__init__.py
M pywikibot/tools/chars.py
M scripts/reflinks.py
5 files changed, 102 insertions(+), 57 deletions(-)

diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py
index 4e1204e..cb84609 100644
--- a/pywikibot/__init__.py
+++ b/pywikibot/__init__.py
@@ -1221,7 +1221,6 @@
SiteLink,
User,
html2unicode,
- unicode2html,
url2unicode,
)

@@ -1361,6 +1360,9 @@
since='20200707')
wrapper._add_deprecated_attr('showHelp', show_help,
since='20200705', future_warning=True)
+wrapper._add_deprecated_attr(
+ 'unicode2html', replacement_name='pywikibot.tools.chars.string2html',
+ since='6.2.0', future_warning=True)

# This module aliases many (but not all) pywikibot.exception classes and one
# from pywikibot.data.api. Use of these aliases is deprecated. When removed
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index 8d2270c..c9d3c73 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -61,7 +61,6 @@
import pywikibot
from pywikibot import textlib
from pywikibot.exceptions import InvalidTitleError
-from pywikibot.page import url2unicode
from pywikibot.textlib import (
FILE_LINK_REGEX,
_get_regexes,
@@ -74,6 +73,7 @@
first_upper,
issue_deprecation_warning,
)
+from pywikibot.tools.chars import url2string


try:
@@ -582,8 +582,8 @@
hadTrailingSpaces = len(titleWithSection) != titleLength

# Convert URL-encoded characters to str
- titleWithSection = url2unicode(titleWithSection,
- encodings=self.site)
+ titleWithSection = url2string(titleWithSection,
+ encodings=self.site.encodings())

if not titleWithSection:
# just skip empty links.
diff --git a/pywikibot/page/__init__.py b/pywikibot/page/__init__.py
index 9cdaa25..19bfd3d 100644
--- a/pywikibot/page/__init__.py
+++ b/pywikibot/page/__init__.py
@@ -25,7 +25,7 @@
from http import HTTPStatus
from itertools import chain
from typing import Any, Optional, Union
-from urllib.parse import quote_from_bytes, unquote_to_bytes
+from urllib.parse import quote_from_bytes
from warnings import warn

import pywikibot
@@ -73,6 +73,8 @@
deprecated_args,
first_upper,
is_ip_address,
+ issue_deprecation_warning,
+ ModuleDeprecationWrapper,
redirect_func,
remove_last_args,
)
@@ -5181,9 +5183,8 @@
self._anchor = None

# Convert URL-encoded characters to unicode
- encodings = [self._source.encoding()] + list(self._source.encodings())
-
- self._text = url2unicode(self._text, encodings=encodings)
+ self._text = pywikibot.tools.chars.url2string(
+ self._text, encodings=self._source.encodings())

# Clean up the name, it can come from anywhere.
# Convert HTML entities to unicode
@@ -5724,40 +5725,12 @@
return _ENTITY_SUB(handle_entity, text)


-def UnicodeToAsciiHtml(string) -> str:
- """Convert unicode to a str using HTML entities."""
- html = []
- for c in string:
- cord = ord(c)
- if 31 < cord < 127:
- html.append(c)
- else:
- html.append('&#{};'.format(cord))
- return ''.join(html)
-
-
-def unicode2html(string: str, encoding: str) -> str:
- """
- Convert unicode string to requested HTML encoding.
-
- Attempt to encode the
- string into the desired format; if that doesn't work, encode the unicode
- into HTML &#; entities. If it does work, return it unchanged.
-
- @param string: String to update
- @param encoding: Encoding to use
- """
- try:
- string.encode(encoding)
- except UnicodeError:
- string = UnicodeToAsciiHtml(string)
- return string
-
-
@deprecated_args(site='encodings')
+@deprecated('pywikibot.tools.chars.url2string', since='6.2.0',
+ future_warning=True)
def url2unicode(title: str, encodings='utf-8') -> str:
"""
- Convert URL-encoded text to unicode using several encoding.
+ DEPRECATED. Convert URL-encoded text to unicode using several encoding.

Uses the first encoding that doesn't cause an error.

@@ -5767,21 +5740,24 @@

@raise UnicodeError: Could not convert using any encoding.
"""
- if isinstance(encodings, str):
- encodings = [encodings]
- elif isinstance(encodings, pywikibot.site.BaseSite):
- # create a list of all possible encodings for both hint sites
- site = encodings
- encodings = [site.encoding()] + list(site.encodings())
+ if isinstance(encodings, pywikibot.site.BaseSite):
+ # use all possible encodings from Site object
+ encodings = encodings.encodings()
+ issue_deprecation_warning(
+ 'Passing BaseSite object to encodings parameter',
+ 'BaseSite.endcodings()',
+ depth=1,
+ warning_class=FutureWarning,
+ since='6.2.0'
+ )

- first_exception = None
- for enc in encodings:
- try:
- t = title.encode(enc)
- t = unquote_to_bytes(t)
- return t.decode(enc)
- except UnicodeError as ex:
- if not first_exception:
- first_exception = ex
- # Couldn't convert, raise the original exception
- raise first_exception
+ return pywikibot.tools.chars.url2string(title, encodings)
+
+
+wrapper = ModuleDeprecationWrapper(__name__)
+wrapper._add_deprecated_attr('UnicodeToAsciiHtml',
+ pywikibot.tools.chars.string_to_ascii_html,
+ since='6.2.0', future_warning=True)
+wrapper._add_deprecated_attr('unicode2html',
+ pywikibot.tools.chars.string2html,
+ since='6.2.0', future_warning=True)
diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py
index d7799a9..f57a601 100644
--- a/pywikibot/tools/chars.py
+++ b/pywikibot/tools/chars.py
@@ -7,6 +7,11 @@
import re
import sys

+from contextlib import suppress
+from typing import Union
+from urllib.parse import unquote_to_bytes
+
+from pywikibot.backports import List, Tuple
from pywikibot.tools._unidata import _category_cf


@@ -36,3 +41,63 @@
return '<{0:x}>'.format(codepoint)

return INVISIBLE_REGEX.sub(replace, text)
+
+
+def string_to_ascii_html(string: str) -> str:
+ """Convert unicode chars of str to HTML entities if chars are not ASCII."""
+ html = []
+ for c in string:
+ cord = ord(c)
+ if 31 < cord < 127:
+ html.append(c)
+ else:
+ html.append('&#{};'.format(cord))
+ return ''.join(html)
+
+
+def string2html(string: str, encoding: str) -> str:
+ """Convert unicode string to requested HTML encoding.
+
+ Attempt to encode the string into the desired format; if that work
+ return it unchanged. Otherwise encode the non-ASCII characters into
+ HTML &#; entities.
+
+ @param string: String to update
+ @param encoding: Encoding to use
+ """
+ with suppress(UnicodeError):
+ string.encode(encoding)
+ return string
+
+ return string_to_ascii_html(string)
+
+
+def url2string(
+ title: str,
+ encodings: Union[str, List[str], Tuple[str, ...]] = 'utf-8'
+) -> str:
+ """Convert URL-encoded text to unicode using several encoding.
+
+ Uses the first encoding that doesn't cause an error.
+
+ @param title: URL-encoded character data to convert
+ @param encodings: Encodings to attempt to use during conversion.
+
+ @raise UnicodeError: Could not convert using any encoding.
+ """
+ if isinstance(encodings, str):
+ encodings = [encodings]
+
+ first_exception = None
+ for enc in encodings:
+ try:
+ t = title.encode(enc)
+ t = unquote_to_bytes(t)
+ except UnicodeError as e:
+ if not first_exception:
+ first_exception = e
+ else:
+ return t.decode(enc)
+
+ # Couldn't convert, raise the first exception
+ raise first_exception
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index bb889be..dc437c4 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -65,6 +65,8 @@
)
from pywikibot.textlib import replaceExcept
from pywikibot.tools.formatter import color_format
+from pywikibot.tools.chars import string2html
+
from scripts import noreferences


@@ -246,7 +248,7 @@
self.title = self.title.replace('}}', '}&#125;')
# prevent multiple quotes being interpreted as '' or '''
self.title = self.title.replace("''", "'&#39;")
- self.title = pywikibot.unicode2html(self.title, self.site.encoding())
+ self.title = string2html(self.title, self.site.encoding())
# TODO : remove HTML when both opening and closing tags are included

def avoid_uppercase(self):

To view, visit change 690429. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ifab5ab13bf18a2693c014b3d0beeee59e36e0c2a
Gerrit-Change-Number: 690429
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: JJMC89 <JJMC89.Wikimedia@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged