jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/904606 )
Change subject: [IMPR] Speedup archivebot.py ......................................................................
[IMPR] Speedup archivebot.py
- Lazy load discussion page if the archives, header or threads attribute is needed - preload archive pages to detect whether they exist, but only if there are more than 25 pages to preload - no longer sort archive pages by their names, because that ordering is confusing (Archive2 > Archive100)
archivebot is up to 20 times faster, 2-3 times on average (testwiki).
Change-Id: Ia8fe0efd0c74525f311220bf7e4175f3c7f89d7d --- M scripts/archivebot.py 1 file changed, 55 insertions(+), 18 deletions(-)
Approvals: Matěj Suchánek: Looks good to me, but someone else must approve Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/scripts/archivebot.py b/scripts/archivebot.py index f499e02..05a36ef 100755 --- a/scripts/archivebot.py +++ b/scripts/archivebot.py @@ -316,29 +316,29 @@ def __init__(self, source, archiver, params=None, keep=False) -> None: """Initializer.""" super().__init__(source) - self.threads = [] - self.full = False self.archiver = archiver # for testing purposes we allow archiver to be None and we are able # to create the a DiscussionPage in this way: # >>> import pywikibot as py # >>> from scripts.archivebot import DiscussionPage # >>> d = DiscussionPage(py.Page(py.Site(), <talk page name>), None) + self.params = params + self.keep = keep + self.full = False + self.archived_threads = 0 if archiver is None: self.timestripper = TimeStripper(self.site) else: self.timestripper = self.archiver.timestripper - self.params = params - self.keep = keep - try: + + def __getattr__(self, name): + """Lazy load page if archives, header or threads attribute is missing. + + .. versionadded:: 8.1 + """ + if name in ('archives', 'header', 'threads'): self.load_page() - except NoPageError: - self.header = archiver.get_attr('archiveheader', - i18n.twtranslate( - self.site.code, - 'archivebot-archiveheader')) - if self.params: - self.header = self.header % self.params + return self.__getattribute__(name)
@staticmethod def max( @@ -367,11 +367,19 @@ self.header = '' self.threads = [] self.archives = {} - self.archived_threads = 0 + + try: + text = self.get() + except NoPageError: + self.header = self.archiver.get_attr( + 'archiveheader', + i18n.twtranslate(self.site.code, 'archivebot-archiveheader')) + if self.params: + self.header = self.header % self.params + return
# Exclude unsupported headings (h1, h3, etc): # adding the marker will make them ignored by extract_sections() - text = self.get() marker = findmarker(text) text = re.sub(r'^((=|={3,})[^=])', marker + r'\1', text, flags=re.M)
@@ -382,6 +390,7 @@ self.header = '\n\n'.join((header.rstrip(), footer, '')) else: self.header = header + footer + for thread_heading, thread_content in threads: cur_thread = DiscussionThread(thread_heading.strip('= '), self.timestripper) @@ -702,6 +711,16 @@ archive = self.get_archive_page(pattern % params, params)
if counter_matters: + + # preload pages + if counter >= 25: + for c in range(counter): + params = self.get_params(thread.timestamp, c + 1) + self.get_archive_page(pattern % params, params) + list(self.site.preloadpages( + self.archives.values(), + groupsize=self.site.maxlimit)) + while not counter_found and counter > 1 \ and not archive.exists(): # This may happen when either: @@ -710,8 +729,6 @@ # (number #3 above) # 2. era changed between runs. # Decrease the counter. - # TODO: This can be VERY slow, use preloading - # or binary search. counter -= 1 params = self.get_params(thread.timestamp, counter) archive = self.get_archive_page( @@ -743,6 +760,7 @@ """Process a single DiscussionPage object.""" if not self.page.botMayEdit(): return + whys = self.analyze_page() mintoarchive = int(self.get_attr('minthreadstoarchive', 2)) if self.archived_threads < mintoarchive: @@ -751,6 +769,7 @@ pywikibot.info(f'Only {self.archived_threads} (< {mintoarchive}) ' f'threads are old enough. Skipping') return + if whys: # Search for the marker template rx = re.compile(r'{{%s\s*?\n.*?\n}}' @@ -763,9 +782,9 @@
pywikibot.info(f'Archiving {self.archived_threads} thread(s).') # Save the archives first (so that bugs don't cause a loss of data) - for _title, archive in sorted(self.archives.items()): + for archive in self.archives.values(): count = archive.archived_threads - if count == 0: + if not count: continue self.comment_params['count'] = count comment = i18n.twtranslate(self.site.code,
pywikibot-commits@lists.wikimedia.org