jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR] Weblinkchecker: throttle connections to the same host

Bug: T152350
Change-Id: I894582d115013f5bf09e42bff6023c25bee6f02b
---
M scripts/weblinkchecker.py
1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 4c49ee9..7730707 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -114,6 +114,7 @@
import re
import threading
import time
+import urllib.parse as urlparse
from contextlib import suppress
from functools import partial
from http import HTTPStatus
@@ -122,6 +123,7 @@

import pywikibot
from pywikibot import comms, config, i18n, pagegenerators, textlib
+from pywikibot.backports import Dict, removeprefix
from pywikibot.bot import ExistingPageBot, SingleSiteBot, suggest_help
from pywikibot.exceptions import (
IsRedirectPageError,
@@ -289,6 +291,10 @@
After checking the page, it will die.
"""

+ #: Collecting start time of a thread for any host
+ hosts = {} # type: Dict[str, float]
+ lock = threading.Lock()
+
def __init__(self, page, url, history, http_ignores, day) -> None:
"""Initializer."""
self.page = page
@@ -307,12 +313,28 @@
self._use_fake_user_agent = config.fake_user_agent_default.get(
'weblinkchecker', False)
self.day = day
+ super().__init__()

- name = '{} - {}'.format(page.title(), url.encode('utf-8', 'replace'))
- super().__init__(name=name)
+ @classmethod
+ def get_delay(cls, name: str) -> float:
+ """Determine delay from class attribute.
+
+ Store the last call for a given hostname with an offset of
+ 6 seconds to ensure there are no more than 10 calls per minute
+ for the same host. Calculate the delay to start the run.
+
+ :param name: The key for the hosts class attribute
+ :return: The calculated delay to start the run
+ """
+ now = time.monotonic()
+ with cls.lock:
+ timestamp = cls.hosts.get(name, now)
+ cls.hosts[name] = max(now, timestamp) + 6
+ return max(0, timestamp - now)

def run(self):
"""Run the bot."""
+ time.sleep(self.get_delay(self.name))
try:
header = self.header
r = comms.http.fetch(
@@ -599,6 +621,9 @@
self.http_ignores, self.day)
# thread dies when program terminates
thread.daemon = True
+ # use hostname as thread.name
+ thread.name = removeprefix(
+ urlparse.urlparse(url).hostname, 'www.')
self.threads.append(thread)



To view, visit change 511334. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I894582d115013f5bf09e42bff6023c25bee6f02b
Gerrit-Change-Number: 511334
Gerrit-PatchSet: 6
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Dalba <dalba.wiki@gmail.com>
Gerrit-Reviewer: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: Huji <huji.huji@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw@arctus.nl>
Gerrit-Reviewer: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999@gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Rubin <rubin.happy@gmail.com>
Gerrit-MessageType: merged