jenkins-bot submitted this change.

Approvals:
  Xqt: Looks good to me, approved
  jenkins-bot: Verified
[IMPR] Add unidata.py script to maintenance scripts

Script is copied from https://phabricator.wikimedia.org/P7450

Bug: T200357
Change-Id: I557933a6325dfe2859b13c725674ef2eb1cc4734
---
A scripts/maintenance/unidata.py
1 file changed, 163 insertions(+), 0 deletions(-)

diff --git a/scripts/maintenance/unidata.py b/scripts/maintenance/unidata.py
new file mode 100644
index 0000000..cf20dca
--- /dev/null
+++ b/scripts/maintenance/unidata.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""Script to update :mod:`pywikibot.tools._unidata`.
+
+This script is for updating ``_first_upper_exception_dict``. Run this
+module multiple times using different Python versions.
+
+.. note:: It seems that running under the latest version of Python gives
+   a superset of the older versions and should be enough, but this is
+   not tested completely.
+"""
+#
+# (C) Pywikibot team, 2018-2023
+#
+# Distributed under the terms of the MIT license.
+#
+from sys import maxunicode
+from re import findall
+from json import dump, load
+from queue import Queue
+from threading import Thread
+
+from scripts.maintenance.wikimedia_sites import families_list
+from pywikibot.family import Family
+from pywikibot import Site
+from pywikibot.comms.http import session
+
+
+NUMBER_OF_THREADS = 26
+FILEPATH = '/data/firstup_excepts.json'
+
+
+def chars_uppers_wikilinks():
+    """Collect chars with a distinct uppercase form and build wikilinks."""
+ n = 0
+ chars = []
+ uppers = []
+ wikilinks = ''
+ for i in range(0, maxunicode + 1):
+ c = chr(i)
+ uc = c.upper()
+ if uc != c:
+ n += 1
+ chars.append(c)
+ uppers.append(uc)
+            # the MediaWiki namespace is first-letter case, so parsing
+            # these links shows how MediaWiki uppercases the first char
+            wikilinks += '[[MediaWiki:' + c + ']]\n'
+ return chars, uppers, wikilinks
+
+
+def process_site(fam_name, site_code):
+    """Process titles for a single site and return its exceptions."""
+ j = session.post(
+ f'https://{site_code}.{fam_name}.org/w/api.php?'
+ f'action=parse&contentmodel=wikitext&prop=text'
+ f'&format=json&utf8',
+ data={'text': wikilinks},
+ timeout=10,
+ ).json()
+    parsed_text = j['parse']['text']['*']
+    # capture the first char after the namespace prefix in each rendered
+    # link title, i.e. MediaWiki's uppercased form of the original char
+    titles = findall(r'title="[^:]*:(.)', parsed_text)
+ site_excepts = {}
+ for i, original_char in enumerate(chars):
+ title_char = titles[i]
+ if uppers[i] != title_char:
+ site_excepts[original_char] = title_char
+ return site_excepts
+
+
+def threads_target(q):
+    """Worker thread processing sites from the queue."""
+ global families_excepts
+ while True:
+ try:
+ fam, code = q.get()
+ except TypeError: # non-iterable NoneType object
+ break
+ site_excepts = process_site(fam, code)
+ families_excepts[fam].setdefault(code, {}).update(site_excepts)
+ q.task_done()
+
+
+def spawn_threads(q):
+ """Prepare several threads."""
+ # TODO: use ThreadList instead
+ threads = []
+ for i in range(NUMBER_OF_THREADS):
+ t = Thread(target=threads_target, args=(q,))
+ t.start()
+ threads.append(t)
+ return threads
+
+
+def stop_threads(q, threads):
+ """Stop threads."""
+ for i in range(NUMBER_OF_THREADS):
+ q.put(None)
+ for t in threads:
+ t.join()
+
+
+def main():
+ """Main loop processing sites."""
+ global families_excepts
+ q = Queue()
+ threads = spawn_threads(q)
+ for fam_name in families_list:
+ family = Family.load(fam_name)
+ families_excepts.setdefault(fam_name, {})
+ for site_code in family.languages_by_size:
+ site = Site(site_code, family)
+ if site.namespaces[8].case != 'first-letter':
+ raise ValueError('MW namespace case is not first-letter')
+ fam_code = (fam_name, site_code)
+ if fam_code in {
+ ('wikisource', 'www'),
+ ('wikisource', 'mul'),
+ ('wikiversity', 'test'),
+ }:
+ continue # the API of these codes does not respond as expected
+ q.put(fam_code)
+ # block until all tasks are done
+ q.join()
+ stop_threads(q, threads)
+
+
+def save_json(obj, path):
+ """Save data to file."""
+ with open(path, 'w', encoding='utf8') as f:
+ dump(obj, f)
+
+
+def load_json(path):
+ """Load data from file."""
+ try:
+ with open(path, 'r', encoding='utf8') as f:
+ return load(f)
+ except OSError:
+ print('File not found:', path) # noqa: T001, T201
+ return {}
+
+
+if __name__ == '__main__':
+ chars, uppers, wikilinks = chars_uppers_wikilinks()
+ # save_json({'chars': chars, 'uppers': uppers, 'wikilinks': wikilinks},
+ # 'user-temp-save.json')
+ # j = load_json('user-temp-save.json')
+ # chars, uppers, wikilinks = j['chars'], j['uppers'], j['wikilinks']
+ # families_excepts = load_json(FILEPATH)
+ # main()
+ # save_json(families_excepts, FILEPATH)
+ print(process_site('wiktionary', 'fr')) # noqa: T001, T201
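
For reference, a full update run corresponds to the commented-out workflow in the __main__ block above rather than the single-site check that is currently executed. A minimal sketch of that run, assuming FILEPATH is writable on the machine executing the script and that merging the collected exceptions into pywikibot.tools._unidata remains a manual step:

    if __name__ == '__main__':
        chars, uppers, wikilinks = chars_uppers_wikilinks()
        families_excepts = load_json(FILEPATH)
        main()
        save_json(families_excepts, FILEPATH)

As the module docstring notes, such a run may be repeated under different Python versions, although the latest version appears to yield a superset of the older results.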

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I557933a6325dfe2859b13c725674ef2eb1cc4734
Gerrit-Change-Number: 950801
Gerrit-PatchSet: 5
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <dalangi-ctr@wikimedia.org>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged