jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/616719 )
Change subject: [IMPR] Reset counter when "era" changes
......................................................................
[IMPR] Reset counter when "era" changes
This splits the main loop into two ("divide-and-conquer algorithm"):
- The first loop scans the whole page for old threads to be archived
  and groups them by archive.
- The second loop iterates over the groups so that the archive with the
  oldest threads goes first; this is the one the saved counter applies
  to. Any other group resets the counter (if it matters). See the
  sketch below.
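For illustration only, a rough standalone sketch of the first loop and
the ordering step (`pattern`, `get_params` and the thread objects stand
in for the bot's own helpers; this is not code from the patch):

    from collections import defaultdict

    def group_by_era(threads, pattern, get_params, counter):
        """Group archivable threads by rendered archive title."""
        threads_per_archive = defaultdict(list)
        for i, thread in enumerate(threads):
            # With the counter held constant, two threads map to the
            # same key exactly when the era fields of the pattern (for
            # example %(year)d or %(month)d) render the same title.
            key = pattern % get_params(thread.timestamp, counter)
            threads_per_archive[key].append((i, thread))
        # Oldest group first: the saved counter applies to that archive.
        return sorted(threads_per_archive.values(),
                      key=lambda group: min(t.timestamp for _, t in group))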
Bug: T215247
Change-Id: I2ab39a39a2692bedc0f6de57fb4d06b108ae1164
---
M scripts/archivebot.py
1 file changed, 78 insertions(+), 26 deletions(-)
Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/scripts/archivebot.py b/scripts/archivebot.py
index b8c81e0..292f35d 100755
--- a/scripts/archivebot.py
+++ b/scripts/archivebot.py
@@ -101,7 +101,7 @@
 import re
 import time
 
-from collections import OrderedDict
+from collections import defaultdict, OrderedDict
 from hashlib import md5
 from math import ceil
 from typing import Any, List, Optional, Pattern, Set, Tuple
@@ -664,39 +664,89 @@
         max_arch_size = str2size(self.get_attr('maxarchivesize'))
         counter = int(self.get_attr('counter', '1'))
         pattern = self.get_attr('archive')
-        oldthreads = self.page.threads
-        self.page.threads = []
+
+        keep_threads = []
+        threads_per_archive = defaultdict(list)
         whys = set()
-        pywikibot.output('Processing {} threads'.format(len(oldthreads)))
-        for thread in oldthreads:
-            threads_left = len(oldthreads) - self.archived_threads
-            if threads_left <= int(self.get_attr('minthreadsleft', 5)):
-                self.page.threads.append(thread)
-                continue  # Because there's too little threads left.
+        pywikibot.output('Processing {} threads'
+                         .format(len(self.page.threads)))
+        for i, thread in enumerate(self.page.threads):
             # TODO: Make an option so that unstamped (unsigned) posts get
             # archived.
             why = self.should_archive_thread(thread)
             if not why or why[0] != 'duration':
-                self.page.threads.append(thread)
+                keep_threads.append(i)
                 continue
-
             params = self.get_params(thread.timestamp, counter)
-            archive = self.get_archive_page(pattern % params, params)
+            # this is actually just a dummy key to group the threads by
+            # "era" regardless of the counter and deal with it later
+            key = pattern % params
+            threads_per_archive[key].append((i, thread))
+            whys.add(why)  # xxx: we don't know if we ever archive anything
 
-            aux_params = self.get_params(thread.timestamp, counter + 1)
-            counter_matters = (pattern % params) != (pattern % aux_params)
-            del aux_params
-            while counter_matters and archive.is_full(max_arch_size):
-                counter += 1
+        # we need to start with the oldest archive since that is
+        # the one the saved counter applies to, so sort the groups
+        # by the oldest timestamp
+        groups = sorted(threads_per_archive.values(),
+                        key=lambda group: min(t.timestamp for _, t in group))
+
+        era_change = False
+        for group in groups:
+            # We will reset the counter IFF:
+            # 1. it matters (AND)
+            # 2. the "era" (year, month, etc.) changes (AND)
+            # 3. there is something to put into the new archive.
+            counter_matters = False
+            for i, thread in group:
+                threads_left = len(self.page.threads) - self.archived_threads
+                if threads_left <= int(self.get_attr('minthreadsleft', 5)):
+                    keep_threads.append(i)
+                    continue  # Because there are too few threads left.
+
+                if era_change:
+                    era_change = False
+                    counter = 1
+
                 params = self.get_params(thread.timestamp, counter)
                 archive = self.get_archive_page(pattern % params, params)
 
-            archive.feed_thread(thread, max_arch_size)
-            whys.add(why)
-            self.archived_threads += 1
+                aux_params = self.get_params(thread.timestamp, counter + 1)
+                # TODO: this variable does not change; figure out a way
+                # to only compute it once
+                counter_matters = (pattern % params) != (pattern % aux_params)
+                del aux_params
+                if counter_matters:
+                    while counter > 1 and not archive.exists():
+                        # This may happen when either:
+                        # 1. a previous version of the bot ran and reset
+                        #    the counter without archiving anything
+                        #    (number 3 above), or
+                        # 2. the era changed between runs.
+                        # Decrease the counter.
+                        # TODO: This can be VERY slow; use preloading
+                        # or binary search.
+                        counter -= 1
+                        params = self.get_params(thread.timestamp, counter)
+                        archive = self.get_archive_page(
+                            pattern % params, params)
+                    while archive.is_full(max_arch_size):
+                        counter += 1
+                        params = self.get_params(thread.timestamp, counter)
+                        archive = self.get_archive_page(
+                            pattern % params, params)
 
-        self.set_attr('counter', str(counter))
-        return whys
+                archive.feed_thread(thread, max_arch_size)
+                self.archived_threads += 1
+            if counter_matters:
+                era_change = True
+
+        if self.archived_threads:
+            self.page.threads = [self.page.threads[i]
+                                 for i in sorted(keep_threads)]
+            self.set_attr('counter', str(counter))
+            return whys
+        else:
+            return set()
 
     def run(self) -> None:
         """Process a single DiscussionPage object."""
@@ -723,13 +773,15 @@
         pywikibot.output('Archiving {0} thread(s).'
                          .format(self.archived_threads))
         # Save the archives first (so that bugs don't cause a loss of data)
-        for a in sorted(self.archives.keys()):
-            self.comment_params['count'] = self.archives[
-                a].archived_threads
+        for title, archive in sorted(self.archives.items()):
+            count = archive.archived_threads
+            if count == 0:
+                continue
+            self.comment_params['count'] = count
             comment = i18n.twtranslate(self.site.code,
                                        'archivebot-archive-summary',
                                        self.comment_params)
-            self.archives[a].update(comment)
+            archive.update(comment)
 
         # Save the page itself
         self.page.header = rx.sub(self.attr2text(), self.page.header)
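A side note on the counter_matters test in the hunk above: the pattern
is rendered with two consecutive counter values, and the results differ
only when the pattern actually contains a counter placeholder. A
minimal runnable sketch, using made-up pattern strings rather than any
real configuration:

    # Hypothetical archive name patterns, not taken from a real page.
    for pattern in ('Archive %(year)d/%(counter)d', 'Archive %(year)d'):
        params = {'year': 2020, 'counter': 1}
        aux_params = {'year': 2020, 'counter': 2}
        # Same test as in the diff: the counter "matters" only if
        # changing it changes the rendered archive title.
        counter_matters = (pattern % params) != (pattern % aux_params)
        print(pattern % params, '->', counter_matters)
    # Output:
    # Archive 2020/1 -> True
    # Archive 2020 -> False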