jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/776176 )
Change subject: [IMPR] Port CommonsDelinker to core
......................................................................
[IMPR] Port CommonsDelinker to core
This is an initial rewrite of compat's CommonsDelinker. It reads the local deletion log and shared repository deletion log and delinks local references.
Also backport image_regex to image.py
Bug: T299563
Change-Id: Ib7b7405115b485d4f404aedecc0146bb30c21468
---
M docs/scripts/unsorted.rst
M docs/scripts_ref/scripts.rst
M scripts/README.rst
A scripts/delinker.py
M scripts/image.py
M tests/script_tests.py
6 files changed, 181 insertions(+), 2 deletions(-)
Approvals:
  Rubin: Looks good to me, but someone else must approve
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/docs/scripts/unsorted.rst b/docs/scripts/unsorted.rst index ec81ef4..f7686d2 100644 --- a/docs/scripts/unsorted.rst +++ b/docs/scripts/unsorted.rst @@ -19,6 +19,11 @@ .. automodule:: scripts.coordinate_import :no-members:
+delinker script +--------------- +.. automodule:: scripts.delinker + :no-members: + djvutext script --------------- .. automodule:: scripts.djvutext diff --git a/docs/scripts_ref/scripts.rst b/docs/scripts_ref/scripts.rst index b40ab91..d4c262d 100644 --- a/docs/scripts_ref/scripts.rst +++ b/docs/scripts_ref/scripts.rst @@ -97,6 +97,11 @@
.. automodule:: scripts.delete
+delinker script +--------------- + +.. automodule:: scripts.delinker + djvutext script --------------- .. automodule:: scripts.djvutext diff --git a/scripts/README.rst b/scripts/README.rst index d8311a4..7282cc6 100644 --- a/scripts/README.rst +++ b/scripts/README.rst @@ -58,6 +58,8 @@ +------------------------+---------------------------------------------------------+ | delete.py | This script can be used to delete pages en masse. | +------------------------+---------------------------------------------------------+ + | delinker.py | Delink file references of deleted images. | + +------------------------+---------------------------------------------------------+ | djvutext.py | Extracts OCR text from djvu files and uploads onto | | | pages in the "Page" namespace on Wikisource. | +------------------------+---------------------------------------------------------+ diff --git a/scripts/delinker.py b/scripts/delinker.py new file mode 100644 index 0000000..9ee7e59 --- /dev/null +++ b/scripts/delinker.py @@ -0,0 +1,165 @@ +#!/usr/bin/python3 +"""Delink removed files from wiki. + +This script keeps track of image deletions and delinks removed files +from current wiki in namespace 0. This script is suitable to delink +files from a image repository as well as for local images. + +The following parameters are supported: + +-exclude: If the deletion log contains this pattern, the file is not + delinked (default is 'no-delink'). + +-localonly Retrieve deleted File pages from local log only + +-since: Start the deletion log with this timestamp given in MediaWiki + timestamp format. If no `-since` option is given, the start + timestamp is read from setting file. If the option is empty, + the processing starts from the very beginning. If the script + stops, the last timestamp is written to the settings file and + the next script call starts there if no `-since` is given. + +.. note:: This sample script is a + :class:`ConfigParserBot <pywikibot.bot.ConfigParserBot>`. 
All + settings can be made either by giving option with the command line or + with a settings file which is scripts.ini by default. If you don't + want the default values you can add any option you want to change to + that settings file below the [delinker] section like. + +.. versionadded:: 7.2 + This script is completely rewriten from compat branch. +""" +# +# (C) Pywikibot team, 2006-2022 +# +# Distributed under the terms of the MIT license. +# +import configparser +import heapq +import re + +import pywikibot +from pywikibot.backports import removeprefix +from pywikibot.bot import ( + ConfigParserBot, + AutomaticTWSummaryBot, + SingleSiteBot, + calledModuleName, +) +from pywikibot.textlib import case_escape, ignore_case, replaceExcept +from pywikibot.tools.formatter import color_format + + +class CommonsDelinker(SingleSiteBot, ConfigParserBot, AutomaticTWSummaryBot): + + """Bot to delink deleted images.""" + + update_options = { + 'exclude': 'no-delink', + 'localonly': False, + 'since': '', + } + summary_key = 'delinker-delink' + + @property + def generator(self): + """Read deletion logs and yield the oldest entry first.""" + ts = (pywikibot.Timestamp.fromtimestampformat(self.opt.since) + if self.opt.since else None) + params = { + 'logtype': 'delete', + 'namespace': 6, + 'reverse': True, + 'start': ts, + } + + iterables = [self.site.logevents(**params)] + repo = self.site.image_repository() if not self.opt.localonly else None + if repo: + iterables.append(repo.logevents(**params)) + + for entry in heapq.merge(*iterables, + key=lambda event: event.timestamp()): + self.last_ts = entry.timestamp() + if entry['action'] == 'delete' \ + and self.opt.exclude not in entry.get('comment', ''): + yield entry + + def init_page(self, item) -> 'pywikibot.page.FilePage': + """Upcast logevent to FilePage and combine edit summary.""" + self.summary_parameters = dict(item) + return pywikibot.FilePage(self.site, item['title']) + + def skip_page(self, page) -> bool: + """Skip 
pages which neither exists locally nor on shared repository.""" + pywikibot.output('.', newline=False) + if page.file_is_shared() or page.exists(): + return True + return super().skip_page(page) + + def treat(self, file_page): + """Set page to current page and delink that page.""" + # use image_regex from image.py + namespace = file_page.site.namespaces[6] + escaped = case_escape(namespace.case, file_page.title(with_ns=False)) + # Be careful, spaces and _ have been converted to '\ ' and '_' + escaped = re.sub('\\[_ ]', '[_ ]', escaped) + self.image_regex = re.compile( + r'[[ *(?:{})\s*:\s*{} *(?P<parameters>|' + r'(?:[^[]]|[[[^]]+]]|[[^]]+])*|) *]]' + .format('|'.join(ignore_case(s) for s in namespace), escaped)) + + shown = False + for page in file_page.usingPages(content=True, namespaces=0): + if not shown: + pywikibot.output( + color_format('\n>>> {lightgreen}Delinking {}{default} <<<', + file_page.title())) + shown = True + super().treat(page) + + def treat_page(self): + """Delink a single page.""" + new = replaceExcept(self.current_page.text, self.image_regex, '', []) + self.put_current(new) + + def teardown(self): + """Save the last used logevent timestamp.""" + if not hasattr(self, 'last_ts'): + return + + pywikibot.output("\nUpdate 'since' to {} file".format(self.INI)) + conf = configparser.ConfigParser(inline_comment_prefixes=[';']) + conf.read(self.INI) + section = calledModuleName() + if not conf.has_section(section): + conf.add_section(section) + conf.set(section, 'since', self.last_ts.totimestampformat()) + with open(self.INI, 'w') as f: + conf.write(f) + + +def main(*args: str) -> None: + """ + Process command line arguments and invoke bot. + + If args is an empty list, sys.argv is used. 
+ + :param args: command line arguments + """ + options = {} + local_args = pywikibot.handle_args() + for arg in local_args: + opt, _, value = arg.partition(':') + opt = removeprefix(opt, '-') + if opt == 'localonly': + options[opt] = True + else: + options[opt] = value + + bot = CommonsDelinker(site=pywikibot.Site(), **options) + bot.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/image.py b/scripts/image.py index 84785e7..f29c5f7 100755 --- a/scripts/image.py +++ b/scripts/image.py @@ -91,7 +91,8 @@ escaped = re.sub('\\[_ ]', '[_ ]', escaped) if not self.opt.loose or not self.new_image: image_regex = re.compile( - r'[[ *(?:{})\s*:\s*{} *(?P<parameters>|[^\n]+?|) *]]' + r'[[ *(?:{})\s*:\s*{} *(?P<parameters>|' + r'(?:[^[]]|[[[^]]+]]|[[^]]+])*|) *]]' .format('|'.join(ignore_case(s) for s in namespace), escaped)) else: image_regex = re.compile(r'' + escaped) diff --git a/tests/script_tests.py b/tests/script_tests.py index 2ab29cd..5b7e1b4 100755 --- a/tests/script_tests.py +++ b/tests/script_tests.py @@ -79,13 +79,14 @@ 'category_redirect', 'checkimages', 'clean_sandbox', + 'delinker', 'login', 'misspelling', - 'revertbot', 'noreferences', 'nowcommons', 'parser_function_count', 'patrol', + 'revertbot', 'shell', 'unusedfiles', 'upload',
pywikibot-commits@lists.wikimedia.org