jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789854 )
Change subject: [IMPR] Do not kill threads in weblinkchecker.py
......................................................................
[IMPR] Do not kill threads in weblinkchecker.py
- Make count_link_check_threads a staticmethod of WeblinkCheckerRobot
- call suggest_help only once to run the bot and check for
missing_generator and missing_dependencies
- move all code from finally clause into teardown method
- remove the wait_time and do not kill the threads but ask for
killing the jobs if KeyboardInterrupt is raised
Bug: T113139
Change-Id: I2f49f05c3622f1c1b21a4dd6d4da29996d0b7034
---
M scripts/weblinkchecker.py
1 file changed, 53 insertions(+), 56 deletions(-)
Approvals:
Rubin: Looks good to me, but someone else must approve
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 15838b1..1fd5f6b 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -137,8 +137,9 @@
try:
import memento_client
from memento_client.memento_client import MementoClientException
-except ImportError as e:
- memento_client = e
+ missing_dependencies = None
+except ImportError:
+ missing_dependencies = ['memento_client']
docuReplacements = {'&params;': pagegenerators.parameterHelp} # noqa: N816
@@ -173,9 +174,6 @@
def _get_closest_memento_url(url, when=None, timegate_uri=None):
"""Get most recent memento for url."""
- if isinstance(memento_client, ImportError):
- raise memento_client
-
if not when:
when = datetime.datetime.now()
@@ -606,6 +604,52 @@
thread.daemon = True
self.threads.append(thread)
+ def teardown(self) -> None:
+ """Finish remaining threads and save history file."""
+ num = self.count_link_check_threads()
+ if num:
+ pywikibot.info('<<lightblue>>Waiting for remaining {} threads '
+ 'to finish, please wait...'.format(num))
+
+ while self.count_link_check_threads():
+ try:
+ time.sleep(0.1)
+ except KeyboardInterrupt:
+ # Threads will die automatically because they are daemonic.
+ if pywikibot.input_yn('There are {} pages remaining in the '
+ 'queue. Really exit?'
+ .format(self.count_link_check_threads()),
+ default=False, automatic_quit=False):
+ break
+
+ num = self.count_link_check_threads()
+ if num:
+ pywikibot.info('<<yellow>>>Remaining {} threads will be killed.'
+ .format(num))
+
+ if self.history.report_thread:
+ self.history.report_thread.shutdown()
+ # wait until the report thread is shut down; the user can
+ # interrupt it by pressing CTRL-C.
+ try:
+ while self.history.report_thread.is_alive():
+ time.sleep(0.1)
+ except KeyboardInterrupt:
+ pywikibot.info('Report thread interrupted.')
+ self.history.report_thread.kill()
+
+ pywikibot.info('Saving history...')
+ self.history.save()
+
+ @staticmethod
+ def count_link_check_threads() -> int:
+ """Count LinkCheckThread threads.
+
+ :return: number of LinkCheckThread threads
+ """
+ return sum(isinstance(thread, LinkCheckThread)
+ for thread in threading.enumerate())
+
def RepeatPageGenerator(): # noqa: N802
"""Generator for pages in History."""
@@ -619,19 +663,6 @@
yield page
-def count_link_check_threads() -> int:
- """
- Count LinkCheckThread threads.
-
- :return: number of LinkCheckThread threads
- """
- i = 0
- for thread in threading.enumerate():
- if isinstance(thread, LinkCheckThread):
- i += 1
- return i
-
-
def main(*args: str) -> None:
"""
Process command line arguments and invoke bot.
@@ -683,46 +714,12 @@
if not gen:
gen = gen_factory.getCombinedGenerator()
- if gen:
+
+ if not suggest_help(missing_generator=not gen,
+ missing_dependencies=missing_dependencies):
bot = WeblinkCheckerRobot(http_ignores, config.weblink_dead_days,
generator=gen)
- try:
- bot.run()
- except ImportError:
- suggest_help(missing_dependencies=('memento_client',))
- return
- finally:
- wait_time = 0
- # Don't wait longer than 30 seconds for threads to finish.
- while count_link_check_threads() > 0 and wait_time < 30:
- try:
- pywikibot.output('Waiting for remaining {} threads to '
- 'finish, please wait...'
- .format(count_link_check_threads()))
- # wait 1 second
- time.sleep(1)
- wait_time += 1
- except KeyboardInterrupt:
- pywikibot.output('Interrupted.')
- break
- if count_link_check_threads() > 0:
- pywikibot.output('Remaining {} threads will be killed.'
- .format(count_link_check_threads()))
- # Threads will die automatically because they are daemonic.
- if bot.history.report_thread:
- bot.history.report_thread.shutdown()
- # wait until the report thread is shut down; the user can
- # interrupt it by pressing CTRL-C.
- try:
- while bot.history.report_thread.is_alive():
- time.sleep(0.1)
- except KeyboardInterrupt:
- pywikibot.output('Report thread interrupted.')
- bot.history.report_thread.kill()
- pywikibot.output('Saving history...')
- bot.history.save()
- else:
- suggest_help(missing_generator=True)
+ bot.run()
if __name__ == '__main__':
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789854
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I2f49f05c3622f1c1b21a4dd6d4da29996d0b7034
Gerrit-Change-Number: 789854
Gerrit-PatchSet: 3
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Rubin <rubin.happy(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789784 )
Change subject: [IMPR] Use Page.extlinks() to get external links
......................................................................
[IMPR] Use Page.extlinks() to get external links
- use Page.extlinks() to get external links instead of retrieving urls
from page.text; this also includes external links from templates.
- since page content is no longer necessary, do not preload pages anymore
- use new use_redirects filter attribute instead of RedirectFilterPageGenerator
- the "old" weblinks_from_text is kept as text_predicate for
XmlDumpPageGenerator
Bug: T60812
Change-Id: Iff098db4f8c31cabf06657fb833835bc22a35c6b
---
M scripts/weblinkchecker.py
1 file changed, 6 insertions(+), 7 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 4c49ee9..15838b1 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -235,6 +235,9 @@
"""
Yield web links from text.
+ Only used as text predicate for XmlDumpPageGenerator to speed up
+ generator.
+
TODO: move to textlib
"""
text = textlib.removeDisabledParts(text)
@@ -568,6 +571,8 @@
It uses several LinkCheckThreads at once to process pages from generator.
"""
+ use_redirects = False
+
def __init__(self, http_ignores=None, day: int = 7, **kwargs) -> None:
"""Initializer."""
super().__init__(**kwargs)
@@ -589,7 +594,7 @@
def treat_page(self) -> None:
"""Process one page."""
page = self.current_page
- for url in weblinks_from_text(page.text):
+ for url in page.extlinks():
for ignore_regex in ignorelist:
if ignore_regex.match(url):
break
@@ -679,12 +684,6 @@
if not gen:
gen = gen_factory.getCombinedGenerator()
if gen:
- if not gen_factory.nopreload:
- # fetch at least 240 pages simultaneously from the wiki, but more
- # if a high thread number is set.
- num_pages = max(240, config.max_external_links * 2)
- gen = pagegenerators.PreloadingGenerator(gen, groupsize=num_pages)
- gen = pagegenerators.RedirectFilterPageGenerator(gen)
bot = WeblinkCheckerRobot(http_ignores, config.weblink_dead_days,
generator=gen)
try:
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789784
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Iff098db4f8c31cabf06657fb833835bc22a35c6b
Gerrit-Change-Number: 789784
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Rubin <rubin.happy(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789587 )
Change subject: [IMPR] Increase minimum workers
......................................................................
[IMPR] Increase minimum workers
doubling the workers to have one free for each family is a good value
Change-Id: I5fa697c8959ed5c70e8a76cf684981ff53dac55f
---
M scripts/maintenance/preload_sites.py
1 file changed, 7 insertions(+), 4 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/maintenance/preload_sites.py b/scripts/maintenance/preload_sites.py
index 3c0afcd..5e0743b 100755
--- a/scripts/maintenance/preload_sites.py
+++ b/scripts/maintenance/preload_sites.py
@@ -73,15 +73,18 @@
.. versionchanged:: 7.3
Default of worker is calculated like for Python 3.8 but preserves
- at least one worker more than families_list elements to ensure a
- worker can be added in :func:`preload_family`.
+ at least one worker for each element in families_list for better
+ performance.
"""
start = datetime.now()
if worker is None:
# Python 3.8 default
worker = min(32, (os.cpu_count() or 1) + 4)
- # allow to add futures in preload_family
- worker = max(len(families) + 1, worker)
+ # to allow adding futures in preload_family the workers must be one
+ # more than families are handled
+ worker = max(len(families) * 2, worker)
+ pywikibot.output('Using {} workers to process {} families'
+ .format(worker, len(families)))
with ThreadPoolExecutor(worker) as executor:
futures = {executor.submit(preload_family, family, executor)
for family in families}
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789587
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I5fa697c8959ed5c70e8a76cf684981ff53dac55f
Gerrit-Change-Number: 789587
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789893 )
Change subject: [bugfix] Remove ThreadList.stop_all() method
......................................................................
[bugfix] Remove ThreadList.stop_all() method
ThreadList is a thread pool executor which collects a couple of Threads
and starts them running. Threads do not have a stop() method and
calling Thread.stop() leads to an AttributeError.
- remove ThreadList.stop_all() due to this bug. It is not used elsewhere.
There is no easy way to stop a running Thread.
Bug: T307830
Change-Id: I8512dc654aca0f6763b016957af4c1d56a9aa0f1
---
M pywikibot/tools/__init__.py
1 file changed, 0 insertions(+), 10 deletions(-)
Approvals:
Mpaa: Looks good to me, but someone else must approve
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/tools/__init__.py b/pywikibot/tools/__init__.py
index 04fd27d..acb91a8 100644
--- a/pywikibot/tools/__init__.py
+++ b/pywikibot/tools/__init__.py
@@ -809,16 +809,6 @@
pywikibot.logging.debug("thread {} ('{}') started"
.format(len(self), type(thd)))
- def stop_all(self) -> None:
- """Stop all threads the pool."""
- if self:
- pywikibot.logging.debug('EARLY QUIT: Threads: {}'
- .format(len(self)))
- for thd in self:
- thd.stop()
- pywikibot.logging.debug('EARLY QUIT: Queue size left in {}: {}'
- .format(thd, thd.queue.qsize()))
-
def intersect_generators(*iterables, allow_duplicates: bool = False):
"""Generator of intersect iterables.
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789893
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I8512dc654aca0f6763b016957af4c1d56a9aa0f1
Gerrit-Change-Number: 789893
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged