jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/511334 )
Change subject: [IMPR] Weblinkchecker: throttle connections to the same host ......................................................................
[IMPR] Weblinkchecker: throttle connections to the same host
Bug: T152350 Change-Id: I894582d115013f5bf09e42bff6023c25bee6f02b --- M scripts/weblinkchecker.py 1 file changed, 27 insertions(+), 2 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py index 4c49ee9..7730707 100755 --- a/scripts/weblinkchecker.py +++ b/scripts/weblinkchecker.py @@ -114,6 +114,7 @@ import re import threading import time +import urllib.parse as urlparse from contextlib import suppress from functools import partial from http import HTTPStatus @@ -122,6 +123,7 @@
import pywikibot from pywikibot import comms, config, i18n, pagegenerators, textlib +from pywikibot.backports import Dict, removeprefix from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help from pywikibot.exceptions import ( IsRedirectPageError, @@ -289,6 +291,10 @@ After checking the page, it will die. """
+ #: Collecting start time of a thread for any host + hosts = {} # type: Dict[str, float] + lock = threading.Lock() + def __init__(self, page, url, history, http_ignores, day) -> None: """Initializer.""" self.page = page @@ -307,12 +313,28 @@ self._use_fake_user_agent = config.fake_user_agent_default.get( 'weblinkchecker', False) self.day = day + super().__init__()
- name = '{} - {}'.format(page.title(), url.encode('utf-8', 'replace')) - super().__init__(name=name) + @classmethod + def get_delay(cls, name: str) -> float: + """Determine delay from class attribute. + + Store the last call for a given hostname with an offset of + 6 seconds to ensure there are no more than 10 calls per minute + for the same host. Calculate the delay to start the run. + + :param name: The key for the hosts class attribute + :return: The calculated delay to start the run + """ + now = time.monotonic() + with cls.lock: + timestamp = cls.hosts.get(name, now) + cls.hosts[name] = max(now, timestamp) + 6 + return max(0, timestamp - now)
def run(self): """Run the bot.""" + time.sleep(self.get_delay(self.name)) try: header = self.header r = comms.http.fetch( @@ -599,6 +621,9 @@ self.http_ignores, self.day) # thread dies when program terminates thread.daemon = True + # use hostname as thread.name + thread.name = removeprefix( + urlparse.urlparse(url).hostname, 'www.') self.threads.append(thread)
pywikibot-commits@lists.wikimedia.org