jenkins-bot submitted this change.



Approvals:
  Matěj Suchánek: Looks good to me, but someone else must approve
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
[IMPR] Speed up archivebot.py

- Lazy load the discussion page only when its archives, header or
threads attribute is needed (the pattern is sketched below)
- Preload archive pages to detect whether they exist, but only if
there are more than 25 pages to preload
- No longer sort archive pages by name, because lexicographic
order is confusing (Archive2 > Archive100)

archivebot is now up to 20 times faster, and 2-3 times faster on
average (measured on testwiki).

Change-Id: Ia8fe0efd0c74525f311220bf7e4175f3c7f89d7d
---
M scripts/archivebot.py
1 file changed, 55 insertions(+), 18 deletions(-)
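
For readers unfamiliar with the lazy-load idiom used in this change:
Python calls __getattr__ only when normal attribute lookup fails, so the
expensive load_page() runs at most once, on the first access to archives,
header or threads. Below is a minimal, self-contained sketch of the
pattern; the LazyPage class and its stubbed load_page() are illustrative
stand-ins, not the merged code.

class LazyPage:
    """Defer an expensive fetch until an attribute is first touched."""

    def __getattr__(self, name):
        # Invoked only when normal lookup fails, i.e. before load_page()
        # has created these instance attributes.
        if name in ('archives', 'header', 'threads'):
            self.load_page()
            return self.__getattribute__(name)
        raise AttributeError(name)

    def load_page(self):
        # Stand-in for the real fetch: populates all three attributes
        # in one go, so later accesses bypass __getattr__ entirely.
        self.header = ''
        self.threads = []
        self.archives = {}

page = LazyPage()  # no fetch happens here
page.threads       # first access triggers load_page() and returns []
page.threads       # found by normal lookup; __getattr__ is not called

Pages the bot only writes to, or skips entirely, no longer cost a
network round trip, which is presumably a large part of the measured
speedup.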

diff --git a/scripts/archivebot.py b/scripts/archivebot.py
index f499e02..05a36ef 100755
--- a/scripts/archivebot.py
+++ b/scripts/archivebot.py
@@ -316,29 +316,29 @@
     def __init__(self, source, archiver, params=None, keep=False) -> None:
         """Initializer."""
         super().__init__(source)
-        self.threads = []
-        self.full = False
         self.archiver = archiver
         # for testing purposes we allow archiver to be None and we are able
         # to create the a DiscussionPage in this way:
         # >>> import pywikibot as py
         # >>> from scripts.archivebot import DiscussionPage
         # >>> d = DiscussionPage(py.Page(py.Site(), <talk page name>), None)
+        self.params = params
+        self.keep = keep
+        self.full = False
+        self.archived_threads = 0
         if archiver is None:
             self.timestripper = TimeStripper(self.site)
         else:
             self.timestripper = self.archiver.timestripper
-        self.params = params
-        self.keep = keep
-        try:
+
+    def __getattr__(self, name):
+        """Lazy load page if archives, header or threads attribute is missing.
+
+        .. versionadded:: 8.1
+        """
+        if name in ('archives', 'header', 'threads'):
             self.load_page()
-        except NoPageError:
-            self.header = archiver.get_attr('archiveheader',
-                                            i18n.twtranslate(
-                                                self.site.code,
-                                                'archivebot-archiveheader'))
-            if self.params:
-                self.header = self.header % self.params
+        return self.__getattribute__(name)
 
     @staticmethod
     def max(
@@ -367,11 +367,19 @@
         self.header = ''
         self.threads = []
         self.archives = {}
-        self.archived_threads = 0
+
+        try:
+            text = self.get()
+        except NoPageError:
+            self.header = self.archiver.get_attr(
+                'archiveheader',
+                i18n.twtranslate(self.site.code, 'archivebot-archiveheader'))
+            if self.params:
+                self.header = self.header % self.params
+            return
 
         # Exclude unsupported headings (h1, h3, etc):
         # adding the marker will make them ignored by extract_sections()
-        text = self.get()
         marker = findmarker(text)
         text = re.sub(r'^((=|={3,})[^=])', marker + r'\1', text, flags=re.M)
 
@@ -382,6 +390,7 @@
             self.header = '\n\n'.join((header.rstrip(), footer, ''))
         else:
             self.header = header + footer
+
         for thread_heading, thread_content in threads:
             cur_thread = DiscussionThread(thread_heading.strip('= '),
                                           self.timestripper)
@@ -702,6 +711,16 @@
             archive = self.get_archive_page(pattern % params, params)
 
             if counter_matters:
+
+                # preload pages
+                if counter >= 25:
+                    for c in range(counter):
+                        params = self.get_params(thread.timestamp, c + 1)
+                        self.get_archive_page(pattern % params, params)
+                    list(self.site.preloadpages(
+                        self.archives.values(),
+                        groupsize=self.site.maxlimit))
+
                 while not counter_found and counter > 1 \
                         and not archive.exists():
                     # This may happen when either:
@@ -710,8 +729,6 @@
                     #     (number #3 above)
                     #  2. era changed between runs.
                     # Decrease the counter.
-                    # TODO: This can be VERY slow, use preloading
-                    # or binary search.
                     counter -= 1
                     params = self.get_params(thread.timestamp, counter)
                     archive = self.get_archive_page(
@@ -743,6 +760,7 @@
         """Process a single DiscussionPage object."""
         if not self.page.botMayEdit():
             return
+
         whys = self.analyze_page()
         mintoarchive = int(self.get_attr('minthreadstoarchive', 2))
         if self.archived_threads < mintoarchive:
@@ -751,6 +769,7 @@
             pywikibot.info(f'Only {self.archived_threads} (< {mintoarchive}) '
                            f'threads are old enough. Skipping')
             return
+
         if whys:
             # Search for the marker template
             rx = re.compile(r'\{\{%s\s*?\n.*?\n\}\}'
@@ -763,9 +782,9 @@
 
         pywikibot.info(f'Archiving {self.archived_threads} thread(s).')
         # Save the archives first (so that bugs don't cause a loss of data)
-        for _title, archive in sorted(self.archives.items()):
+        for archive in self.archives.values():
             count = archive.archived_threads
-            if count == 0:
+            if not count:
                 continue
             self.comment_params['count'] = count
             comment = i18n.twtranslate(self.site.code,

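A note on the preloading hunk above: APISite.preloadpages() fetches page
data in batched API requests, after which Page.exists() is answered from
the cached data instead of costing one request per check, so the
counter-decrement loop no longer hits the network on every iteration. A
hedged sketch of the idea, assuming a reachable site; the talk-page
titles are hypothetical, and in the real change the pages come from
get_archive_page() and self.archives.

import pywikibot

site = pywikibot.Site()
# Hypothetical archive titles; archivebot builds the real ones from the
# configured archive pattern.
pages = [pywikibot.Page(site, f'Talk:Example/Archive {i}')
         for i in range(1, 26)]

# One batched request per `groupsize` titles instead of up to 25 single
# fetches; preloadpages() is a generator, so list() drains it.
list(site.preloadpages(pages, groupsize=site.maxlimit))

# Each exists() call is now answered from the cache, no extra requests.
newest = max((i for i, p in enumerate(pages, start=1) if p.exists()),
             default=0)

The counter >= 25 guard in the hunk keeps the old behaviour for small
archives, where a few individual exists() checks are presumably cheaper
than one large preload query.
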
To view, visit change 904606.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ia8fe0efd0c74525f311220bf7e4175f3c7f89d7d
Gerrit-Change-Number: 904606
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged