jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/874903 )
Change subject: [IMPR] return 'https' scheme with Family.protocol()
......................................................................
[IMPR] return 'https' scheme with Family.protocol()
- return 'https' scheme with Family.protocol() by default
- update all related family files
- enable http.request to pass a given protocol to site.base_url()
- try to switch the scheme if a request fails and has no json result;
this ensures that we have no breaking change with the new default
'https' scheme
- raise a FatalServerError to stop the loop if requests raises one of
MissingSchema, InvalidSchema, InvalidURL, InvalidHeader exception
Bug: T326046
Change-Id: I10859e11b569ee3a4441272439342792038567aa
---
M pywikibot/families/wikihow_family.py
M pywikibot/family.py
M pywikibot/comms/http.py
M pywikibot/families/wikispore_family.py
M pywikibot/families/osm_family.py
M pywikibot/families/vikidia_family.py
M tests/http_tests.py
M pywikibot/families/i18n_family.py
M pywikibot/data/api/_requests.py
9 files changed, 72 insertions(+), 45 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/comms/http.py b/pywikibot/comms/http.py
index 60a9322..8fed6a6 100644
--- a/pywikibot/comms/http.py
+++ b/pywikibot/comms/http.py
@@ -249,7 +249,7 @@
return UserAgent().random
-def request(site,
+def request(site: 'pywikibot.site.BaseSite',
uri: Optional[str] = None,
headers: Optional[dict] = None,
**kwargs) -> requests.Response:
@@ -261,13 +261,16 @@
The optional uri is a relative uri from site base uri including the
document root '/'.
+ .. versionchanged:: 8.2
+ a *protocol* parameter can be given which is passed to the
+ :meth:`family.Family.base_url` method.
+
:param site: The Site to connect to
- :type site: pywikibot.site.BaseSite
:param uri: the URI to retrieve
- :keyword charset: Either a valid charset (usable for str.decode()) or None
- to automatically chose the charset from the returned header (defaults
- to latin-1)
- :type charset: CodecInfo, str, None
+ :keyword Optional[CodecInfo, str] charset: Either a valid charset
+ (usable for str.decode()) or None to automatically choose the
+ charset from the returned header (defaults to latin-1)
+ :keyword Optional[str] protocol: a url scheme
:return: The received data Response
"""
kwargs.setdefault('verify', site.verify_SSL_certificate())
@@ -278,7 +281,7 @@
format_string = headers.get('user-agent')
headers['user-agent'] = user_agent(site, format_string)
- baseuri = site.base_url(uri)
+ baseuri = site.base_url(uri, protocol=kwargs.pop('protocol', None))
r = fetch(baseuri, headers=headers, **kwargs)
site.throttle.retry_after = int(r.headers.get('retry-after', 0))
return r
@@ -331,6 +334,10 @@
if isinstance(response, requests.Timeout):
raise ServerError(response)
+ if isinstance(response, ValueError):
+ # MissingSchema, InvalidSchema, InvalidURL, InvalidHeader
+ raise FatalServerError(str(response))
+
if isinstance(response, Exception):
with suppress(Exception):
# request exception may contain response and request attribute
diff --git a/pywikibot/data/api/_requests.py b/pywikibot/data/api/_requests.py
index 892c5a2..25f7e57 100644
--- a/pywikibot/data/api/_requests.py
+++ b/pywikibot/data/api/_requests.py
@@ -17,7 +17,7 @@
from email.mime.nonmultipart import MIMENonMultipart
from pathlib import Path
from typing import Any, Optional, Union
-from urllib.parse import unquote, urlencode
+from urllib.parse import unquote, urlencode, urlparse
from warnings import warn
import pywikibot
@@ -674,13 +674,23 @@
paramstring) -> tuple:
"""Get or post a http request with exception handling.
+ .. versionchanged:: 8.2
+ change the scheme if the previous request didn't have json
+ content.
+
:return: a tuple containing requests.Response object from
http.request and use_get value
"""
+ kwargs = {}
+ schemes = ('http', 'https')
+ if self.json_warning and self.site.protocol() in schemes:
+ # retry with other scheme
+ kwargs['protocol'] = schemes[self.site.protocol() == 'http']
+
try:
response = http.request(self.site, uri=uri,
method='GET' if use_get else 'POST',
- data=data, headers=headers)
+ data=data, headers=headers, **kwargs)
except Server504Error:
pywikibot.log('Caught HTTP 504 error; retrying')
except Client414Error:
@@ -708,6 +718,10 @@
def _json_loads(self, response) -> Optional[dict]:
"""Return a dict from requests.Response.
+ .. versionchanged:: 8.2
+ show a warning to add a ``protocol()`` method to the family
+ file if suitable.
+
:param response: a requests.Response object
:type response: requests.Response
:return: a data dict
@@ -753,7 +767,18 @@
self[param] = [str(int(value) // 2)]
pywikibot.info(f'Set {param} = {self[param]}')
else:
+ scheme = urlparse(response.url).scheme
+ if self.json_warning and scheme != self.site.protocol():
+ warn(f"""
+Your {self.site.family} family uses a wrong scheme {self.site.protocol()!r}
+but {scheme!r} is required. Please add the following code to your family file:
+
+ def protocol(self, code: str) -> str:
+ return '{scheme}'
+
+""", stacklevel=2)
return result or {}
+
self.wait()
return None
diff --git a/pywikibot/families/i18n_family.py b/pywikibot/families/i18n_family.py
index 67c780d..93be54e 100644
--- a/pywikibot/families/i18n_family.py
+++ b/pywikibot/families/i18n_family.py
@@ -1,6 +1,6 @@
"""Family module for Translate Wiki."""
#
-# (C) Pywikibot team, 2007-2022
+# (C) Pywikibot team, 2007-2023
#
# Distributed under the terms of the MIT license.
#
@@ -14,7 +14,3 @@
name = 'i18n'
domain = 'translatewiki.net'
-
- def protocol(self, code) -> str:
- """Return https as the protocol for this family."""
- return 'https'
diff --git a/pywikibot/families/osm_family.py b/pywikibot/families/osm_family.py
index f5c2962..81b14cc 100644
--- a/pywikibot/families/osm_family.py
+++ b/pywikibot/families/osm_family.py
@@ -1,6 +1,6 @@
"""Family module for OpenStreetMap wiki."""
#
-# (C) Pywikibot team, 2009-2022
+# (C) Pywikibot team, 2009-2023
#
# Distributed under the terms of the MIT license.
#
@@ -43,7 +43,3 @@
edit_restricted_templates = {
'en': ('In Bearbeitung',),
}
-
- def protocol(self, code) -> str:
- """Return https as the protocol for this family."""
- return 'https'
diff --git a/pywikibot/families/vikidia_family.py b/pywikibot/families/vikidia_family.py
index 2878681..6280258 100644
--- a/pywikibot/families/vikidia_family.py
+++ b/pywikibot/families/vikidia_family.py
@@ -21,7 +21,3 @@
# Sites we want to edit but not count as real languages
test_codes = ['central', 'test']
-
- def protocol(self, code) -> str:
- """Return https as the protocol for this family."""
- return 'https'
diff --git a/pywikibot/families/wikihow_family.py b/pywikibot/families/wikihow_family.py
index d49b642..2343870 100644
--- a/pywikibot/families/wikihow_family.py
+++ b/pywikibot/families/wikihow_family.py
@@ -60,7 +60,3 @@
def scriptpath(self, code) -> str:
"""Return the script path for this family."""
return ''
-
- def protocol(self, code) -> str:
- """Return 'https' as the protocol."""
- return 'https'
diff --git a/pywikibot/families/wikispore_family.py b/pywikibot/families/wikispore_family.py
index 427280e..a0deedb 100644
--- a/pywikibot/families/wikispore_family.py
+++ b/pywikibot/families/wikispore_family.py
@@ -3,7 +3,7 @@
.. versionadded:: 4.1
"""
#
-# (C) Pywikibot team, 2020-2022
+# (C) Pywikibot team, 2020-2023
#
# Distributed under the terms of the MIT license.
#
@@ -22,6 +22,3 @@
'en': 'wikispore.wmflabs.org',
'test': 'wikispore-test.wmflabs.org',
}
-
- def protocol(self, code) -> str:
- return 'https'
diff --git a/pywikibot/family.py b/pywikibot/family.py
index 1d9d0b8..ed75706 100644
--- a/pywikibot/family.py
+++ b/pywikibot/family.py
@@ -428,15 +428,18 @@
# Methods
def protocol(self, code: str) -> str:
- """
- The protocol to use to connect to the site.
+ """The protocol to use to connect to the site.
- May be overridden to return 'https'. Other protocols are not supported.
+ May be overridden to return 'http'. Other protocols are not
+ supported.
+
+ .. versionchanged:: 8.2
+ ``https`` is returned instead of ``http``.
:param code: language code
:return: protocol that this family uses
"""
- return 'http'
+ return 'https'
def verify_SSL_certificate(self, code: str) -> bool:
"""
@@ -828,10 +831,6 @@
return {code: cls.domain for code in codes}
- def protocol(self, code) -> str:
- """Return 'https' as the protocol."""
- return 'https'
-
def scriptpath(self, code):
"""Return the script path for this family."""
return '' if code == 'en' else ('/' + code)
@@ -980,10 +979,6 @@
"""Return Wikimedia Commons as the shared image repository."""
return ('commons', 'commons')
- def protocol(self, code) -> str:
- """Return 'https' as the protocol."""
- return 'https'
-
def eventstreams_host(self, code) -> str:
"""Return 'https://stream.wikimedia.org' as the stream hostname."""
return 'https://stream.wikimedia.org'
diff --git a/tests/http_tests.py b/tests/http_tests.py
index 73036a5..eee230b 100755
--- a/tests/http_tests.py
+++ b/tests/http_tests.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""Tests for http module."""
#
-# (C) Pywikibot team, 2014-2022
+# (C) Pywikibot team, 2014-2023
#
# Distributed under the terms of the MIT license.
#
@@ -153,7 +153,7 @@
"""Test invalid scheme."""
# A InvalidSchema is raised within requests
with self.assertRaisesRegex(
- requests.exceptions.InvalidSchema,
+ FatalServerError,
"No connection adapters were found for 'invalid://url'"):
http.fetch('invalid://url')
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/874903
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I10859e11b569ee3a4441272439342792038567aa
Gerrit-Change-Number: 874903
Gerrit-PatchSet: 5
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920321 )
Change subject: [doc] Remove outdated basestring type hints
......................................................................
[doc] Remove outdated basestring type hints
Change-Id: I7f8b92b28a04fd9b56d8d25ca00d28eabd9f1c85
---
M pywikibot/textlib.py
1 file changed, 11 insertions(+), 9 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 7439253..ce63410 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -1,8 +1,4 @@
-"""Functions for manipulating wiki-text.
-
-Unless otherwise noted, all functions take a unicode string as the
-argument and return a unicode string.
-"""
+"""Functions for manipulating wiki-text."""
#
# (C) Pywikibot team, 2008-2023
#
@@ -655,8 +651,7 @@
If it's a string and the replacement was a sequence it converts it into a
Page instance. If the replacement is done via a callable it'll use it like
- unlinking and directly replace the link with the text itself. It only
- supports unicode when used by the callable and bytes are not allowed.
+ unlinking and directly replace the link with the text itself.
If either the section or label should be used the replacement can be a
function which returns a Link instance and copies the value which should
@@ -815,8 +810,6 @@
new_link = new_label
if isinstance(new_link, str):
- # Nothing good can come out of the fact that bytes is returned so
- # force unicode
text = text[:start] + new_link + text[end:]
# Make sure that next time around we will not find this same hit.
curpos = start + len(new_link)
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920321
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I7f8b92b28a04fd9b56d8d25ca00d28eabd9f1c85
Gerrit-Change-Number: 920321
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920316 )
Change subject: [IMPR] decrease nested flow statements in textlib.replaceExcept
......................................................................
[IMPR] decrease nested flow statements in textlib.replaceExcept
Also add type hints and update documentation
Change-Id: Iaf8e7927f574c188cf2cfff78365c0883b66aea3
---
M pywikibot/textlib.py
1 file changed, 87 insertions(+), 58 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 2ed436e..7439253 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -16,7 +16,14 @@
from typing import NamedTuple, Optional, Union
import pywikibot
-from pywikibot.backports import Container, Dict, Iterable, List
+from pywikibot.backports import (
+ Callable,
+ Container,
+ Dict,
+ Iterable,
+ Match,
+ List,
+)
from pywikibot.backports import OrderedDict as OrderedDictType
from pywikibot.backports import Pattern
from pywikibot.backports import Sequence as SequenceType
@@ -352,30 +359,38 @@
return result
-def replaceExcept(text: str, old, new, exceptions: list,
- caseInsensitive: bool = False, allowoverlap: bool = False,
- marker: str = '', site=None, count: int = 0) -> str:
+def replaceExcept(text: str,
+ old: Union[str, Pattern[str]],
+ new: Union[str, Callable[[Match[str]], str]],
+ exceptions: List[Union[str, Pattern[str]]],
+ caseInsensitive: bool = False,
+ allowoverlap: bool = False,
+ marker: str = '',
+ site: Optional['pywikibot.site.BaseSite'] = None,
+ count: int = 0) -> str:
"""
- Return text with 'old' replaced by 'new', ignoring specified types of text.
+ Return text with *old* replaced by *new*, ignoring specified types of text.
- Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
- HTML comments. If caseInsensitive is true, then use case insensitive
- regex matching. If allowoverlap is true, overlapping occurrences are all
- replaced (watch out when using this, it might lead to infinite loops!).
+ Skip occurrences of *old* within *exceptions*; e.g. within nowiki
+ tags or HTML comments. If *caseInsensitive* is true, then use case
+ insensitive regex matching. If *allowoverlap* is true, overlapping
+ occurrences are all replaced
+
+ .. caution:: Watch out when using *allowoverlap*, it might lead to
+ infinite loops!
:param text: text to be modified
:param old: a compiled or uncompiled regular expression
- :param new: a unicode string (which can contain regular
- expression references), or a function which takes
- a match object as parameter. See parameter repl of
- re.sub().
+ :param new: a string (which can contain regular expression
+ references), or a function which takes a match object as
+ parameter. See parameter *repl* of ``re.sub()``.
:param exceptions: a list of strings or already compiled regex
- objects which signal what to leave out. Strings might be like
- ['math', 'table', 'template'] for example.
+ objects which signal what to leave out. List of strings might be
+ like ``['math', 'table', 'template']`` for example.
:param marker: a string that will be added to the last replacement;
if nothing is changed, it is added at the end
:param count: how many replacements to do at most. See parameter
- count of re.sub().
+ *count* of ``re.sub()``.
"""
# if we got a string, compile it as a regular expression
if isinstance(old, str):
@@ -393,6 +408,7 @@
while not count or replaced < count:
if index > len(text):
break
+
match = old.search(text, index)
if not match:
# nothing left to replace
@@ -412,55 +428,57 @@
# an HTML comment or text in nowiki tags stands before the next
# valid match. Skip.
index = nextExceptionMatch.end()
+ continue
+
+ # We found a valid match. Replace it.
+ if callable(new):
+ # the parameter new can be a function which takes the match
+ # as a parameter.
+ replacement = new(match)
else:
- # We found a valid match. Replace it.
- if callable(new):
- # the parameter new can be a function which takes the match
- # as a parameter.
- replacement = new(match)
- else:
- # it is not a function, but a string.
+ # it is not a function, but a string.
- # it is a little hack to make \n work. It would be better
- # to fix it previously, but better than nothing.
- new = new.replace('\\n', '\n')
+ # it is a little hack to make \n work. It would be better
+ # to fix it previously, but better than nothing.
+ new = new.replace('\\n', '\n')
- # We cannot just insert the new string, as it may contain regex
- # group references such as \2 or \g<name>.
- # On the other hand, this approach does not work because it
- # can't handle lookahead or lookbehind (see bug T123185).
- # So we have to process the group references manually.
- replacement = ''
+ # We cannot just insert the new string, as it may contain regex
+ # group references such as \2 or \g<name>.
+ # On the other hand, this approach does not work because it
+ # can't handle lookahead or lookbehind (see bug T123185).
+ # So we have to process the group references manually.
+ replacement = ''
- group_regex = re.compile(r'\\(\d+)|\\g<(.+?)>')
- last = 0
- for group_match in group_regex.finditer(new):
- group_id = group_match[1] or group_match[2]
- with suppress(ValueError):
- group_id = int(group_id)
+ group_regex = re.compile(r'\\(\d+)|\\g<(.+?)>')
+ last = 0
+ for group_match in group_regex.finditer(new):
+ group_id = group_match[1] or group_match[2]
+ with suppress(ValueError):
+ group_id = int(group_id)
- try:
- replacement += new[last:group_match.start()]
- replacement += match[group_id] or ''
- except IndexError:
- raise IndexError('Invalid group reference: {}\n'
- 'Groups found: {}'
- .format(group_id, match.groups()))
- last = group_match.end()
- replacement += new[last:]
+ try:
+ replacement += new[last:group_match.start()]
+ replacement += match[group_id] or ''
+ except IndexError:
+ raise IndexError(f'Invalid group reference: {group_id}\n'
+ f'Groups found: {match.groups()}')
+ last = group_match.end()
+ replacement += new[last:]
- text = text[:match.start()] + replacement + text[match.end():]
+ text = text[:match.start()] + replacement + text[match.end():]
- # continue the search on the remaining text
- if allowoverlap:
- index = match.start() + 1
- else:
- index = match.start() + len(replacement)
- if not match.group():
- # When the regex allows to match nothing, shift by one char
- index += 1
- markerpos = match.start() + len(replacement)
- replaced += 1
+ # continue the search on the remaining text
+ if allowoverlap:
+ index = match.start() + 1
+ else:
+ index = match.start() + len(replacement)
+
+ if not match.group():
+ # When the regex allows to match nothing, shift by one char
+ index += 1
+
+ markerpos = match.start() + len(replacement)
+ replaced += 1
return text[:markerpos] + marker + text[markerpos:]
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920316
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Iaf8e7927f574c188cf2cfff78365c0883b66aea3
Gerrit-Change-Number: 920316
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920307 )
Change subject: [bugfix] Add missing f for ValueError in textlib._get_regexes
......................................................................
[bugfix] Add missing f for ValueError in textlib._get_regexes
Also decrease nested flow statements
Change-Id: I6d75ae4fb76fc068761fd753af0ac6818c5a6b7b
---
M pywikibot/textlib.py
1 file changed, 25 insertions(+), 16 deletions(-)
Approvals:
Xqt: Looks good to me, approved
Hashar: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 047628b..2ed436e 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -319,26 +319,24 @@
# assume the string is a reference to a standard regex above,
# which may not yet have a site specific re compiled.
- if exc in _regex_cache:
- if isinstance(_regex_cache[exc], tuple):
- if not site and exc in ('interwiki', 'property', 'invoke',
- 'category', 'file'):
- raise ValueError(
- 'Site cannot be None for the {exc!r} regex')
-
- if (exc, site) not in _regex_cache:
- re_text, re_var = _regex_cache[exc]
- _regex_cache[(exc, site)] = re.compile(
- re_text % re_var(site), re.VERBOSE)
-
- result.append(_regex_cache[(exc, site)])
- else:
- result.append(_regex_cache[exc])
- else:
+ if exc not in _regex_cache:
# nowiki, noinclude, includeonly, timeline, math and other
# extensions
_regex_cache[exc] = _tag_regex(exc)
result.append(_regex_cache[exc])
+ elif not isinstance(_regex_cache[exc], tuple):
+ result.append(_regex_cache[exc])
+ else:
+ if not site and exc in ('interwiki', 'property', 'invoke',
+ 'category', 'file'):
+ raise ValueError(f'Site cannot be None for the {exc!r} regex')
+
+ if (exc, site) not in _regex_cache:
+ re_text, re_var = _regex_cache[exc]
+ _regex_cache[(exc, site)] = re.compile(
+ re_text % re_var(site), re.VERBOSE)
+
+ result.append(_regex_cache[(exc, site)])
# handle aliases
if exc == 'source':
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/920307
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I6d75ae4fb76fc068761fd753af0ac6818c5a6b7b
Gerrit-Change-Number: 920307
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Hashar <hashar(a)free.fr>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged