jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/397803 )
Change subject: [bugfix] Make archivebot ignore headers in nowiki, pre, etc.
[bugfix] Make archivebot ignore headers in nowiki, pre, etc.
archivebot.py has been incorrectly detecting thread headers inside tags such as nowiki, pre, code and source, and inside comments; headers in these regions are now ignored.
Bug: T182496 Change-Id: I2e7183f431ce3bb4cdd1729e592b7509cbd37b94 --- M scripts/archivebot.py 1 file changed, 25 insertions(+), 5 deletions(-)
Approvals: jenkins-bot: Verified Xqt: Looks good to me, approved
diff --git a/scripts/archivebot.py b/scripts/archivebot.py index 52f1b31..930a8d9 100755 --- a/scripts/archivebot.py +++ b/scripts/archivebot.py @@ -109,7 +109,7 @@
from pywikibot.date import apply_month_delta from pywikibot import i18n -from pywikibot.textlib import TimeStripper +from pywikibot.textlib import TimeStripper, _get_regexes from pywikibot.textlib import to_local_digits from pywikibot.tools import issue_deprecation_warning, FrozenDict
@@ -453,12 +453,32 @@ self.threads = [] self.archives = {} self.archived_threads = 0 - lines = self.get().split('\n') + text = self.get() + # Replace text in following exceptions by spaces, but don't change line + # numbers + exceptions = ['comment', 'code', 'pre', 'source', 'nowiki'] + exc_regexes = _get_regexes(exceptions, self.site) + stripped_text = text + for regex in exc_regexes: + for match in re.finditer(regex, stripped_text): + before = stripped_text[:match.start()] + restricted = stripped_text[match.start():match.end()] + after = stripped_text[match.end():] + restricted = re.sub(r'[^\n]', r'', restricted) + stripped_text = before + restricted + after + # Find thread headers in stripped text and return their line numbers + stripped_lines = stripped_text.split('\n') + thread_headers = [] + for line_number, line in enumerate(stripped_lines, start=1): + if re.search(r'^== *[^=].*? *== *$', line): + thread_headers.append(line_number) + # Fill self by original thread headers on returned line numbers + lines = text.split('\n') found = False # Reading header cur_thread = None - for line in lines: - thread_header = re.search('^== *([^=].*?) *== *$', line) - if thread_header: + for line_number, line in enumerate(lines, start=1): + if line_number in thread_headers: + thread_header = re.search('^== *([^=].*?) *== *$', line) found = True # Reading threads now if cur_thread: self.threads.append(cur_thread)
pywikibot-commits@lists.wikimedia.org