jenkins-bot submitted this change.

View Change

Approvals: Rubin: Looks good to me, but someone else must approve Xqt: Looks good to me, approved jenkins-bot: Verified
[IMPR] Port CommonsDelinker to core

This is an initial rewrite of compat's CommonsDelinker.
It reads the local deletion log and shared repository deletion log and
delinks local references.

Also backport image_regex to image.py

Bug: T299563
Change-Id: Ib7b7405115b485d4f404aedecc0146bb30c21468
---
M docs/scripts/unsorted.rst
M docs/scripts_ref/scripts.rst
M scripts/README.rst
A scripts/delinker.py
M scripts/image.py
M tests/script_tests.py
6 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/docs/scripts/unsorted.rst b/docs/scripts/unsorted.rst
index ec81ef4..f7686d2 100644
--- a/docs/scripts/unsorted.rst
+++ b/docs/scripts/unsorted.rst
@@ -19,6 +19,11 @@
.. automodule:: scripts.coordinate_import
:no-members:

+delinker script
+---------------
+.. automodule:: scripts.delinker
+ :no-members:
+
djvutext script
---------------
.. automodule:: scripts.djvutext
diff --git a/docs/scripts_ref/scripts.rst b/docs/scripts_ref/scripts.rst
index b40ab91..d4c262d 100644
--- a/docs/scripts_ref/scripts.rst
+++ b/docs/scripts_ref/scripts.rst
@@ -97,6 +97,11 @@

.. automodule:: scripts.delete

+delinker script
+---------------
+
+.. automodule:: scripts.delinker
+
djvutext script
---------------
.. automodule:: scripts.djvutext
diff --git a/scripts/README.rst b/scripts/README.rst
index d8311a4..7282cc6 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -58,6 +58,8 @@
+------------------------+---------------------------------------------------------+
| delete.py | This script can be used to delete pages en masse. |
+------------------------+---------------------------------------------------------+
+ | delinker.py | Delink file references of deleted images. |
+ +------------------------+---------------------------------------------------------+
| djvutext.py | Extracts OCR text from djvu files and uploads onto |
| | pages in the "Page" namespace on Wikisource. |
+------------------------+---------------------------------------------------------+
diff --git a/scripts/delinker.py b/scripts/delinker.py
new file mode 100644
index 0000000..9ee7e59
--- /dev/null
+++ b/scripts/delinker.py
@@ -0,0 +1,165 @@
+#!/usr/bin/python3
+"""Delink removed files from wiki.
+
+This script keeps track of image deletions and delinks removed files
+from the current wiki in namespace 0. This script is suitable to delink
+files from an image repository as well as local images.
+
+The following parameters are supported:
+
+-exclude: If the deletion log contains this pattern, the file is not
+ delinked (default is 'no-delink').
+
+-localonly Retrieve deleted File pages from local log only
+
+-since: Start the deletion log with this timestamp given in MediaWiki
+ timestamp format. If no `-since` option is given, the start
+ timestamp is read from the settings file. If the option is empty,
+ the processing starts from the very beginning. When the script
+ stops, the last timestamp is written to the settings file and
+ the next script call starts there if no `-since` is given.
+
+.. note:: This script is a
+ :class:`ConfigParserBot <pywikibot.bot.ConfigParserBot>`. All
+ settings can be made either by giving options on the command line or
+ in a settings file, which is scripts.ini by default. If you don't
+ want the default values, you can add any option you want to change to
+ that settings file below the [delinker] section.
+
+.. versionadded:: 7.2
+ This script is completely rewritten from the compat branch.
+"""
+#
+# (C) Pywikibot team, 2006-2022
+#
+# Distributed under the terms of the MIT license.
+#
+import configparser
+import heapq
+import re
+
+import pywikibot
+from pywikibot.backports import removeprefix
+from pywikibot.bot import (
+ ConfigParserBot,
+ AutomaticTWSummaryBot,
+ SingleSiteBot,
+ calledModuleName,
+)
+from pywikibot.textlib import case_escape, ignore_case, replaceExcept
+from pywikibot.tools.formatter import color_format
+
+
+class CommonsDelinker(SingleSiteBot, ConfigParserBot, AutomaticTWSummaryBot):
+
+ """Bot to delink deleted images.
+
+ File deletion log entries are read from the local site and, unless
+ the ``localonly`` option is set, from its image repository. For
+ every deleted file the links to it are removed from the pages in
+ namespace 0 which use it. On teardown the timestamp of the last
+ log entry read is stored as ``since`` in the settings file.
+ """
+
+ update_options = {
+ 'exclude': 'no-delink',
+ 'localonly': False,
+ 'since': '',
+ }
+ summary_key = 'delinker-delink'
+
+ @property
+ def generator(self):
+ """Read deletion logs and yield the oldest entry first.
+
+ The local deletion log and, unless ``localonly`` is set, the
+ deletion log of the image repository are merged by timestamp.
+ Only entries whose action is ``delete`` and whose comment does
+ not contain the ``exclude`` option are yielded.
+ """
+ # an empty 'since' option means no start timestamp is passed
+ ts = (pywikibot.Timestamp.fromtimestampformat(self.opt.since)
+ if self.opt.since else None)
+ params = {
+ 'logtype': 'delete',
+ 'namespace': 6,
+ # presumably oldest entries first; heapq.merge below needs each
+ # input sorted ascending by the merge key -- TODO confirm
+ 'reverse': True,
+ 'start': ts,
+ }
+
+ iterables = [self.site.logevents(**params)]
+ repo = self.site.image_repository() if not self.opt.localonly else None
+ if repo:
+ iterables.append(repo.logevents(**params))
+
+ for entry in heapq.merge(*iterables,
+ key=lambda event: event.timestamp()):
+ # remember every entry read, even those filtered out below;
+ # teardown() stores this value in the settings file
+ self.last_ts = entry.timestamp()
+ if entry['action'] == 'delete' \
+ and self.opt.exclude not in entry.get('comment', ''):
+ yield entry
+
+ def init_page(self, item) -> 'pywikibot.page.FilePage':
+ """Upcast logevent to FilePage and combine edit summary.
+
+ The log entry is copied into ``summary_parameters`` as a dict.
+
+ :param item: a log entry as yielded by :attr:`generator`
+ :return: the FilePage on the bot's site for the entry's title
+ """
+ self.summary_parameters = dict(item)
+ return pywikibot.FilePage(self.site, item['title'])
+
+ def skip_page(self, page) -> bool:
+ """Skip files which still exist locally or on the shared repository.
+
+ Only files which exist neither locally nor on the shared
+ repository are passed on to the inherited check.
+
+ :param page: the FilePage created by :meth:`init_page`
+ :return: True if the file must not be delinked
+ """
+ # print a dot per checked log entry without a line break
+ pywikibot.output('.', newline=False)
+ if page.file_is_shared() or page.exists():
+ return True
+ return super().skip_page(page)
+
+ def treat(self, file_page):
+ """Delink the given file from all namespace 0 pages using it.
+
+ A regex matching file links to *file_page* is compiled and stored
+ as ``image_regex``; every page using the file is then handed to
+ the inherited ``treat`` method.
+
+ :param file_page: the deleted file to be delinked
+ """
+ # use image_regex from image.py
+ namespace = file_page.site.namespaces[6]
+ escaped = case_escape(namespace.case, file_page.title(with_ns=False))
+ # Be careful, spaces and _ have been converted to '\ ' and '\_'
+ escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
+ self.image_regex = re.compile(
+ r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
+ r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
+ .format('|'.join(ignore_case(s) for s in namespace), escaped))
+
+ # print the heading once, and only if the file is used at all
+ shown = False
+ for page in file_page.usingPages(content=True, namespaces=0):
+ if not shown:
+ pywikibot.output(
+ color_format('\n>>> {lightgreen}Delinking {}{default} <<<',
+ file_page.title()))
+ shown = True
+ super().treat(page)
+
+ def treat_page(self):
+ """Delink a single page.
+
+ Every match of ``image_regex`` in the current page text is
+ replaced by an empty string and the result is saved.
+ """
+ new = replaceExcept(self.current_page.text, self.image_regex, '', [])
+ self.put_current(new)
+
+ def teardown(self):
+ """Save the last used logevent timestamp.
+
+ The timestamp is written as ``since`` to the section of the
+ settings file named after the called module. Nothing is written
+ if no log entry was read.
+ """
+ # last_ts is only set by the generator when an entry was read
+ if not hasattr(self, 'last_ts'):
+ return
+
+ pywikibot.output("\nUpdate 'since' to {} file".format(self.INI))
+ conf = configparser.ConfigParser(inline_comment_prefixes=[';'])
+ conf.read(self.INI)
+ section = calledModuleName()
+ if not conf.has_section(section):
+ conf.add_section(section)
+ conf.set(section, 'since', self.last_ts.totimestampformat())
+ with open(self.INI, 'w') as f:
+ conf.write(f)
+
+
+def main(*args: str) -> None:
+    """
+    Process command line arguments and invoke bot.
+
+    If args is an empty list, sys.argv is used.
+
+    Every local argument is split at the first colon into an option
+    name and its value. ``-localonly`` is a flag and is stored as True;
+    all other options are passed to the bot with their string value.
+
+    :param args: command line arguments
+    """
+    options = {}
+    # Pass args on to handle_args; calling it without arguments would
+    # silently ignore arguments given by a caller of main().
+    local_args = pywikibot.handle_args(args)
+    for arg in local_args:
+        opt, _, value = arg.partition(':')
+        opt = removeprefix(opt, '-')
+        options[opt] = True if opt == 'localonly' else value
+
+    bot = CommonsDelinker(site=pywikibot.Site(), **options)
+    bot.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/image.py b/scripts/image.py
index 84785e7..f29c5f7 100755
--- a/scripts/image.py
+++ b/scripts/image.py
@@ -91,7 +91,8 @@
escaped = re.sub('\\\\[_ ]', '[_ ]', escaped)
if not self.opt.loose or not self.new_image:
image_regex = re.compile(
- r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|[^\n]+?|) *\]\]'
+ r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
+ r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
.format('|'.join(ignore_case(s) for s in namespace), escaped))
else:
image_regex = re.compile(r'' + escaped)
diff --git a/tests/script_tests.py b/tests/script_tests.py
index 2ab29cd..5b7e1b4 100755
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -79,13 +79,14 @@
'category_redirect',
'checkimages',
'clean_sandbox',
+ 'delinker',
'login',
'misspelling',
- 'revertbot',
'noreferences',
'nowcommons',
'parser_function_count',
'patrol',
+ 'revertbot',
'shell',
'unusedfiles',
'upload',

To view, visit change 776176. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ib7b7405115b485d4f404aedecc0146bb30c21468
Gerrit-Change-Number: 776176
Gerrit-PatchSet: 20
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: MarcoAurelio <maurelio@toolforge.org>
Gerrit-Reviewer: Rubin <rubin.happy@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: Zabe <alexander.vorwerk@stud.uni-goettingen.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Matěj Suchánek <matejsuchanek97@gmail.com>
Gerrit-MessageType: merged