jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789784 )
Change subject: [IMPR] Use Page.extlinks() to get external links
......................................................................
[IMPR] Use Page.extlinks() to get external links
- use Page.extlinks() to get external links instead of retrieving urls
from page.text; this also includes external links from templates.
- since page content is no longer necessary, do not preload pages anymore
- use new use_redirects filter attribute instead of RedirectFilterPageGenerator
- the "old" weblinks_from_text is kept as text_predicate for
XmlDumpPageGenerator
Bug: T60812
Change-Id: Iff098db4f8c31cabf06657fb833835bc22a35c6b
---
M scripts/weblinkchecker.py
1 file changed, 6 insertions(+), 7 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index 4c49ee9..15838b1 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -235,6 +235,9 @@
"""
Yield web links from text.
+ Only used as text predicate for XmlDumpPageGenerator to speed up
+ generator.
+
TODO: move to textlib
"""
text = textlib.removeDisabledParts(text)
@@ -568,6 +571,8 @@
It uses several LinkCheckThreads at once to process pages from generator.
"""
+ use_redirects = False
+
def __init__(self, http_ignores=None, day: int = 7, **kwargs) -> None:
"""Initializer."""
super().__init__(**kwargs)
@@ -589,7 +594,7 @@
def treat_page(self) -> None:
"""Process one page."""
page = self.current_page
- for url in weblinks_from_text(page.text):
+ for url in page.extlinks():
for ignore_regex in ignorelist:
if ignore_regex.match(url):
break
@@ -679,12 +684,6 @@
if not gen:
gen = gen_factory.getCombinedGenerator()
if gen:
- if not gen_factory.nopreload:
- # fetch at least 240 pages simultaneously from the wiki, but more
- # if a high thread number is set.
- num_pages = max(240, config.max_external_links * 2)
- gen = pagegenerators.PreloadingGenerator(gen, groupsize=num_pages)
- gen = pagegenerators.RedirectFilterPageGenerator(gen)
bot = WeblinkCheckerRobot(http_ignores, config.weblink_dead_days,
generator=gen)
try:
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/789784
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Iff098db4f8c31cabf06657fb833835bc22a35c6b
Gerrit-Change-Number: 789784
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: Rubin <rubin.happy(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged