jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/616543 )
Change subject: [IMPR] Refactor PageArchiver's main loop ......................................................................
[IMPR] Refactor PageArchiver's main loop
- Avoid archiving to full archives. Previously the bot wouldn't check this. Create, split or deprecate methods where needed. - However, only bother checking if the archiving depends on the counter. Otherwise just archive the thread. - Improve type hints.
Change-Id: I5095d0811014ffbf56da37f1fa767931217b5317 --- M scripts/archivebot.py 1 file changed, 77 insertions(+), 43 deletions(-)
Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
diff --git a/scripts/archivebot.py b/scripts/archivebot.py index 7698709..b8c81e0 100755 --- a/scripts/archivebot.py +++ b/scripts/archivebot.py @@ -116,6 +116,7 @@
ShouldArchive = Optional[Tuple[str, str]] +Size = Tuple[int, str]
ZERO = datetime.timedelta(0)
@@ -245,7 +246,7 @@ return key, duration
-def str2size(string) -> Tuple[int, str]: +def str2size(string) -> Size: """ Return a size for a shorthand size.
@@ -371,7 +372,14 @@ self.timestamp = max(self.timestamp, timestamp)
def size(self) -> int: - """Return size of discussion thread.""" + """ + Return size of discussion thread. + + Note that the result is NOT equal to that of + len(self.to_text()). This method counts bytes, rather than + codepoints (characters). This corresponds to MediaWiki's + definition of page size. + """ return len(self.title.encode('utf-8')) + len( self.content.encode('utf-8')) + 12
@@ -455,18 +463,23 @@ pywikibot.output('{} thread(s) found on {}' .format(len(self.threads), self))
- def feed_thread(self, thread, max_archive_size=(250 * 1024, 'B')) -> bool: + def is_full(self, max_archive_size=(250 * 1024, 'B')) -> bool: """Check whether archive size exceeded.""" - self.threads.append(thread) - self.archived_threads += 1 if max_archive_size[1] == 'B': if self.size() >= max_archive_size[0]: - self.full = True + self.full = True # xxx: this is one-way flag elif max_archive_size[1] == 'T': if len(self.threads) >= max_archive_size[0]: self.full = True return self.full
+ def feed_thread(self, thread: DiscussionThread, + max_archive_size=(250 * 1024, 'B')) -> bool: + """Append a new thread to the archive.""" + self.threads.append(thread) + self.archived_threads += 1 + return self.is_full(max_archive_size) + def size(self) -> int: """Return size of talk page threads.""" return len(self.header.encode('utf-8')) + sum(t.size() @@ -596,16 +609,15 @@ # TODO: handle marked with template return None
- def feed_archive(self, archive, thread, max_archive_size, params=None - ) -> bool: + def get_archive_page(self, title: str, params=None) -> DiscussionPage: """ - Feed the thread to one of the archives. + Return the page for archiving.
- If it doesn't exist yet, create it. + If it doesn't exist yet, create and cache it. Also check for security violations. """ - title = archive.title() page_title = self.page.title() + archive = pywikibot.Page(self.site, title) if not (self.force or title.startswith(page_title + '/') or self.key_ok()): raise ArchiveSecurityError( @@ -613,12 +625,45 @@ .format(archive, page_title)) if title not in self.archives: self.archives[title] = DiscussionPage(archive, self, params) - return self.archives[title].feed_thread(thread, max_archive_size) + return self.archives[title] + + @deprecated(since='20200727', future_warning=True) + def feed_archive(self, archive: pywikibot.Page, thread: DiscussionThread, + max_archive_size: Size, params=None) -> bool: + """ + Feed the thread to one of the archives. + + Also check for security violations. + + @return: whether the archive is full + """ + archive_page = self.get_archive_page( + archive.title(with_ns=True), params) + return archive_page.feed_thread(thread, max_archive_size) + + def get_params(self, timestamp, counter: int) -> dict: + """Make params for archiving template.""" + lang = self.site.lang + return { + 'counter': to_local_digits(counter, lang), + 'year': to_local_digits(timestamp.year, lang), + 'isoyear': to_local_digits(timestamp.isocalendar()[0], lang), + 'isoweek': to_local_digits(timestamp.isocalendar()[1], lang), + 'semester': to_local_digits(int(ceil(timestamp.month / 6)), lang), + 'quarter': to_local_digits(int(ceil(timestamp.month / 3)), lang), + 'month': to_local_digits(timestamp.month, lang), + 'monthname': self.month_num2orig_names[timestamp.month]['long'], + 'monthnameshort': self.month_num2orig_names[ + timestamp.month]['short'], + 'week': to_local_digits( + int(time.strftime('%W', timestamp.timetuple())), lang), + }
def analyze_page(self) -> Set[ShouldArchive]: """Analyze DiscussionPage.""" max_arch_size = str2size(self.get_attr('maxarchivesize')) - arch_counter = int(self.get_attr('counter', '1')) + counter = int(self.get_attr('counter', '1')) + pattern = self.get_attr('archive') oldthreads = self.page.threads self.page.threads = [] whys = set() @@ -631,37 +676,26 @@ # TODO: Make an option so that unstamped (unsigned) posts get # archived. why = self.should_archive_thread(thread) - if why: - archive = self.get_attr('archive') - lang = self.site.lang - timestamp = thread.timestamp - params = { - 'counter': to_local_digits(arch_counter, lang), - 'year': to_local_digits(timestamp.year, lang), - 'isoyear': to_local_digits(timestamp.isocalendar()[0], - lang), - 'isoweek': to_local_digits(timestamp.isocalendar()[1], - lang), - 'semester': to_local_digits(int(ceil(timestamp.month / 6)), - lang), - 'quarter': to_local_digits(int(ceil(timestamp.month / 3)), - lang), - 'month': to_local_digits(timestamp.month, lang), - 'monthname': self.month_num2orig_names[ - timestamp.month]['long'], - 'monthnameshort': self.month_num2orig_names[ - timestamp.month]['short'], - 'week': to_local_digits( - int(time.strftime('%W', timestamp.timetuple())), lang), - } - archive = pywikibot.Page(self.site, archive % params) - if self.feed_archive(archive, thread, max_arch_size, params): - arch_counter += 1 - self.set_attr('counter', str(arch_counter)) - whys.add(why) - self.archived_threads += 1 - else: + if not why or why[0] != 'duration': self.page.threads.append(thread) + continue + + params = self.get_params(thread.timestamp, counter) + archive = self.get_archive_page(pattern % params, params) + + aux_params = self.get_params(thread.timestamp, counter + 1) + counter_matters = (pattern % params) != (pattern % aux_params) + del aux_params + while counter_matters and archive.is_full(max_arch_size): + counter += 1 + params = self.get_params(thread.timestamp, counter) + archive = 
self.get_archive_page(pattern % params, params) + + archive.feed_thread(thread, max_arch_size) + whys.add(why) + self.archived_threads += 1 + + self.set_attr('counter', str(counter)) return whys
def run(self) -> None:
pywikibot-commits@lists.wikimedia.org