jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/690429 )
Change subject: [IMPR] move character-based functions from page to tools.chars ......................................................................
[IMPR] move character-based functions from page to tools.chars
unicode2html and url2unicode belong to page titles but are also independent of other framework parts. Therefore move them to the module dealing with characters and strings. The page module is too big and complex already.
- move page.UnicodeToAsciiHtml to tools.chars.string_to_ascii_html
- move page.unicode2html to tools.chars.string2html
- move page.url2unicode to tools.chars.url2string
- deprecate page functions and their direct import from pywikibot
- deprecate pywikibot.site.BaseSite as encodings parameter in url2unicode function; BaseSite.encodings should be used instead
- update usage of unicode2html in reflinks.py
- update usage of url2unicode in cosmetic_changes.py
Change-Id: Ifab5ab13bf18a2693c014b3d0beeee59e36e0c2a --- M pywikibot/__init__.py M pywikibot/cosmetic_changes.py M pywikibot/page/__init__.py M pywikibot/tools/chars.py M scripts/reflinks.py 5 files changed, 102 insertions(+), 57 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/__init__.py b/pywikibot/__init__.py index 4e1204e..cb84609 100644 --- a/pywikibot/__init__.py +++ b/pywikibot/__init__.py @@ -1221,7 +1221,6 @@ SiteLink, User, html2unicode, - unicode2html, url2unicode, )
@@ -1361,6 +1360,9 @@ since='20200707') wrapper._add_deprecated_attr('showHelp', show_help, since='20200705', future_warning=True) +wrapper._add_deprecated_attr( + 'unicode2html', replacement_name='pywikibot.tools.chars.string2html', + since='6.2.0', future_warning=True)
# This module aliases many (but not all) pywikibot.exception classes and one # from pywikibot.data.api. Use of these aliases is deprecated. When removed diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py index 8d2270c..c9d3c73 100755 --- a/pywikibot/cosmetic_changes.py +++ b/pywikibot/cosmetic_changes.py @@ -61,7 +61,6 @@ import pywikibot from pywikibot import textlib from pywikibot.exceptions import InvalidTitleError -from pywikibot.page import url2unicode from pywikibot.textlib import ( FILE_LINK_REGEX, _get_regexes, @@ -74,6 +73,7 @@ first_upper, issue_deprecation_warning, ) +from pywikibot.tools.chars import url2string
try: @@ -582,8 +582,8 @@ hadTrailingSpaces = len(titleWithSection) != titleLength
# Convert URL-encoded characters to str - titleWithSection = url2unicode(titleWithSection, - encodings=self.site) + titleWithSection = url2string(titleWithSection, + encodings=self.site.encodings())
if not titleWithSection: # just skip empty links. diff --git a/pywikibot/page/__init__.py b/pywikibot/page/__init__.py index 9cdaa25..19bfd3d 100644 --- a/pywikibot/page/__init__.py +++ b/pywikibot/page/__init__.py @@ -25,7 +25,7 @@ from http import HTTPStatus from itertools import chain from typing import Any, Optional, Union -from urllib.parse import quote_from_bytes, unquote_to_bytes +from urllib.parse import quote_from_bytes from warnings import warn
import pywikibot @@ -73,6 +73,8 @@ deprecated_args, first_upper, is_ip_address, + issue_deprecation_warning, + ModuleDeprecationWrapper, redirect_func, remove_last_args, ) @@ -5181,9 +5183,8 @@ self._anchor = None
# Convert URL-encoded characters to unicode - encodings = [self._source.encoding()] + list(self._source.encodings()) - - self._text = url2unicode(self._text, encodings=encodings) + self._text = pywikibot.tools.chars.url2string( + self._text, encodings=self._source.encodings())
# Clean up the name, it can come from anywhere. # Convert HTML entities to unicode @@ -5724,40 +5725,12 @@ return _ENTITY_SUB(handle_entity, text)
-def UnicodeToAsciiHtml(string) -> str: - """Convert unicode to a str using HTML entities.""" - html = [] - for c in string: - cord = ord(c) - if 31 < cord < 127: - html.append(c) - else: - html.append('&#{};'.format(cord)) - return ''.join(html) - - -def unicode2html(string: str, encoding: str) -> str: - """ - Convert unicode string to requested HTML encoding. - - Attempt to encode the - string into the desired format; if that doesn't work, encode the unicode - into HTML &#; entities. If it does work, return it unchanged. - - @param string: String to update - @param encoding: Encoding to use - """ - try: - string.encode(encoding) - except UnicodeError: - string = UnicodeToAsciiHtml(string) - return string - - @deprecated_args(site='encodings') +@deprecated('pywikibot.tools.chars.url2string', since='6.2.0', + future_warning=True) def url2unicode(title: str, encodings='utf-8') -> str: """ - Convert URL-encoded text to unicode using several encoding. + DEPRECATED. Convert URL-encoded text to unicode using several encoding.
Uses the first encoding that doesn't cause an error.
@@ -5767,21 +5740,24 @@
@raise UnicodeError: Could not convert using any encoding. """ - if isinstance(encodings, str): - encodings = [encodings] - elif isinstance(encodings, pywikibot.site.BaseSite): - # create a list of all possible encodings for both hint sites - site = encodings - encodings = [site.encoding()] + list(site.encodings()) + if isinstance(encodings, pywikibot.site.BaseSite): + # use all possible encodings from Site object + encodings = encodings.encodings() + issue_deprecation_warning( + 'Passing BaseSite object to encodings parameter', + 'BaseSite.endcodings()', + depth=1, + warning_class=FutureWarning, + since='6.2.0' + )
- first_exception = None - for enc in encodings: - try: - t = title.encode(enc) - t = unquote_to_bytes(t) - return t.decode(enc) - except UnicodeError as ex: - if not first_exception: - first_exception = ex - # Couldn't convert, raise the original exception - raise first_exception + return pywikibot.tools.chars.url2string(title, encodings) + + +wrapper = ModuleDeprecationWrapper(__name__) +wrapper._add_deprecated_attr('UnicodeToAsciiHtml', + pywikibot.tools.chars.string_to_ascii_html, + since='6.2.0', future_warning=True) +wrapper._add_deprecated_attr('unicode2html', + pywikibot.tools.chars.string2html, + since='6.2.0', future_warning=True) diff --git a/pywikibot/tools/chars.py b/pywikibot/tools/chars.py index d7799a9..f57a601 100644 --- a/pywikibot/tools/chars.py +++ b/pywikibot/tools/chars.py @@ -7,6 +7,11 @@ import re import sys
+from contextlib import suppress +from typing import Union +from urllib.parse import unquote_to_bytes + +from pywikibot.backports import List, Tuple from pywikibot.tools._unidata import _category_cf
@@ -36,3 +41,63 @@ return '<{0:x}>'.format(codepoint)
return INVISIBLE_REGEX.sub(replace, text) + + +def string_to_ascii_html(string: str) -> str: + """Convert unicode chars of str to HTML entities if chars are not ASCII.""" + html = [] + for c in string: + cord = ord(c) + if 31 < cord < 127: + html.append(c) + else: + html.append('&#{};'.format(cord)) + return ''.join(html) + + +def string2html(string: str, encoding: str) -> str: + """Convert unicode string to requested HTML encoding. + + Attempt to encode the string into the desired format; if that work + return it unchanged. Otherwise encode the non-ASCII characters into + HTML &#; entities. + + @param string: String to update + @param encoding: Encoding to use + """ + with suppress(UnicodeError): + string.encode(encoding) + return string + + return string_to_ascii_html(string) + + +def url2string( + title: str, + encodings: Union[str, List[str], Tuple[str, ...]] = 'utf-8' +) -> str: + """Convert URL-encoded text to unicode using several encoding. + + Uses the first encoding that doesn't cause an error. + + @param title: URL-encoded character data to convert + @param encodings: Encodings to attempt to use during conversion. + + @raise UnicodeError: Could not convert using any encoding. + """ + if isinstance(encodings, str): + encodings = [encodings] + + first_exception = None + for enc in encodings: + try: + t = title.encode(enc) + t = unquote_to_bytes(t) + except UnicodeError as e: + if not first_exception: + first_exception = e + else: + return t.decode(enc) + + # Couldn't convert, raise the first exception + raise first_exception diff --git a/scripts/reflinks.py b/scripts/reflinks.py index bb889be..dc437c4 100755 --- a/scripts/reflinks.py +++ b/scripts/reflinks.py @@ -65,6 +65,8 @@ ) from pywikibot.textlib import replaceExcept from pywikibot.tools.formatter import color_format +from pywikibot.tools.chars import string2html + from scripts import noreferences
@@ -246,7 +248,7 @@ self.title = self.title.replace('}}', '}}') # prevent multiple quotes being interpreted as '' or ''' self.title = self.title.replace("''", "''") - self.title = pywikibot.unicode2html(self.title, self.site.encoding()) + self.title = string2html(self.title, self.site.encoding()) # TODO : remove HTML when both opening and closing tags are included
def avoid_uppercase(self):
pywikibot-commits@lists.wikimedia.org