jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/602984 )
Change subject: [maintenance] Add update_linktrails.py maintenance script ......................................................................
[maintenance] Add update_linktrails.py maintenance script
- Add update_linktrails.py maintenance script to update linktrail dict in family.py - add the script to docs
Change-Id: I5107e472b8959e146d1e6371efecf4bbb546c78e --- M docs/scripts/scripts.maintenance.rst M scripts/README.rst A scripts/maintenance/update_linktrails.py 3 files changed, 176 insertions(+), 1 deletion(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/docs/scripts/scripts.maintenance.rst b/docs/scripts/scripts.maintenance.rst index 7a31635..f3dd955 100644 --- a/docs/scripts/scripts.maintenance.rst +++ b/docs/scripts/scripts.maintenance.rst @@ -27,9 +27,14 @@
.. automodule:: scripts.maintenance.make_i18n_dict
+scripts.maintenance.update_linktrails script +--------------------------------------------- + +.. automodule:: scripts.maintenance.update_linktrails + + scripts.maintenance.wikimedia_sites script -------------------------------------------
.. automodule:: scripts.maintenance.wikimedia_sites
- diff --git a/scripts/README.rst b/scripts/README.rst index 5b009e5..de5014e 100644 --- a/scripts/README.rst +++ b/scripts/README.rst @@ -280,6 +280,8 @@ +------------------------+---------------------------------------------------------+ | make_i18n_dict.py | Generate a i18n file from a given script. | +------------------------+---------------------------------------------------------+ + | update_linktrails.py | Script that updates the linktrails in family.py file. | + +------------------------+---------------------------------------------------------+ | wikimedia_sites.py | Updates the language lists in Wikimedia family files. | +------------------------+---------------------------------------------------------+
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Script that updates the linktrails in family.py file.

linktrails contains a regex for each site code which holds letters that
can follow a wikilink and are regarded as part of this link. This depends
on the linktrail setting in LanguageXx.php. This maintenance script
retrieves the site settings from wikipedia family and updates the Family
linktrails dict.
"""
#
# (C) Pywikibot team, 2017-2020
#
# Distributed under the terms of the MIT license.
#
# Reconstructed from the diff hunk that adds
# scripts/maintenance/update_linktrails.py
# (new file mode 100644, index 0000000..bc50819).
#

import codecs
import re

from contextlib import closing
from functools import wraps
from os.path import join

import pywikibot

from pywikibot.family import CODE_CHARACTERS
from pywikibot.tools import suppress_warnings


# Extracts the character class and any trailing "|x|y" alternatives from a
# linktrail value such as "/^([a-z]+)(.*)$/sD" or
# "/^((?:[a-z]|x|y)+)(.*)$/sDu".
_LINKTRAIL_REGEX = re.compile(
    r'\((?:\:\?|\?\:)?\[(?P<pattern>.+?)\]'
    r'(?P<letters>(\|.)*)\)?\+\)')

# Matches a "\x{ABCD}" escape (four upper-case hex digits) inside a pattern.
_UNICODE_ESCAPE_REGEX = re.compile(r'\\x\{([A-F0-9]{4})\}')


def format_string(code: str, pattern: str) -> str:
    """Format a single pattern line.

    The result is one ``'code': 'pattern',`` dict entry indented by eight
    spaces and terminated by a newline. If the pattern is longer than
    ``63 - len(code)`` characters it is split in the middle into two
    adjacent string literals, the second one placed on a continuation
    line.

    :param code: site code used as the dict key
    :param pattern: linktrail regex used as the dict value
    :return: the formatted source line(s), each ending with a newline
    """
    fmt = ' ' * 8 + "'{}': '{}'"
    code_len = len(code)
    pattern_len = len(pattern)
    if pattern_len > 63 - code_len:
        # Too long for one line: split the pattern into two halves.
        index = pattern_len // 2
        result = fmt.format(code, pattern[:index]) + '\n'
        # 8 spaces indent + 2 quotes + ': ' puts the second literal
        # directly below the first one.
        result += ' ' * (code_len + 12) + "'{}',\n".format(pattern[index:])
    else:
        result = fmt.format(code, pattern) + ',\n'
    return result


def coroutine(func):
    """Decorator which starts coroutine.

    The decorated generator function is called and advanced to its first
    ``yield`` so that callers can use ``send()`` immediately.

    :param func: a generator function
    :return: a function returning the already primed generator
    """
    @wraps(func)  # keep name and docstring of the decorated function
    def start(*args, **kwargs):
        cr = func(*args, **kwargs)
        cr.send(None)
        return cr
    return start


@coroutine
def update_sites(fam):
    """Process linktrail for a given site code.

    Receives site codes via ``send()``. For each code the wikipedia site
    is instantiated, its ``linktrail`` is read from siteinfo and sent to
    the `update_line` coroutine together with the old linktrail found in
    ``fam.linktrails``. Removed sites and codes which resolve to a
    different site code are skipped.

    :param fam: family object providing the current ``linktrails`` dict
    """
    formatter = update_line()
    while True:
        code = yield

        # Creating a Site with an alias code emits a UserWarning which is
        # expected here and handled below by comparing the codes.
        with suppress_warnings(
                'Site wikipedia:[{}]+ instantiated using different code'
                .format(CODE_CHARACTERS),
                category=UserWarning,
                filename=r'.+pywikibot.tools.__init__.py'):
            site = pywikibot.Site(code, 'wikipedia')

        if isinstance(site, pywikibot.site.RemovedSite):
            continue

        if site.code != code:
            pywikibot.output('"{}" is redirected to "{}"; skipping.'
                             .format(code, site.code))
            continue

        linktrail = site.siteinfo.get('general', expiry=True)['linktrail']
        oldtrail = fam.linktrails.get(code)
        formatter.send((code, oldtrail, linktrail))


@coroutine
def update_line():
    """Format linktrail for family file.

    Receives ``(code, old, linktrail)`` tuples via ``send()``. If the
    site reports no linktrail, or no pattern can be extracted from it,
    the old entry (if any) is passed on unchanged. An empty linktrail
    yields an entry with an empty pattern. Otherwise the regex match is
    handed over to `update_matched_line`.
    """
    writer = update_family_file()
    matcher = update_matched_line(writer)
    while True:
        code, old, linktrail = yield
        # Fallback line: keep the existing entry, or nothing at all.
        line = format_string(code, old) if old else ''

        if not linktrail:
            writer.send(line)
            continue

        if linktrail == '/^()(.*)$/sD':  # empty linktrail
            line = format_string(code, '')
            writer.send(line)
            continue

        match = _LINKTRAIL_REGEX.search(linktrail)

        if not match:
            pywikibot.output('"{}": No pattern found in "{}"'
                             .format(code, linktrail))
            writer.send(line)
            continue

        matcher.send((code, old, match))


@coroutine
def update_matched_line(writer):
    """Update matched linktrail.

    Receives ``(code, old, match)`` tuples via ``send()`` where *match*
    provides the groups ``pattern`` and ``letters``. The default
    linktrail ``a-z`` is dropped; any other one is converted to a
    ``[...]*`` character class and sent to *writer* as formatted line.

    :param writer: coroutine which collects the formatted lines
    """
    while True:
        code, old, match = yield
        pattern = match.group('pattern')
        letters = match.group('letters')
        if pattern == 'a-z' and not letters:  # default
            if old:
                pywikibot.output('"{}" has default linktrail; '
                                 'removing {}'.format(code, old))
            continue

        if r'\x{' in pattern:
            # replace unicode escape string by corresponding char
            pattern = _UNICODE_ESCAPE_REGEX.sub(
                lambda escape: chr(int(escape.group(1), 16)),
                pattern)

        if letters:
            # "|x|y" alternatives are merged into the character class
            pattern += ''.join(letters.split('|'))

        new = '[{}]*'.format(pattern)
        line = format_string(code, new)
        writer.send(line)


@coroutine
def update_family_file():
    """Collect linktrails and write them to family.py.

    Receives formatted dict lines via ``send()`` and appends them to the
    ``linktrails`` dict source text. When the generator is closed, the
    existing ``linktrails`` dict in ``pywikibot/family.py`` (path relative
    to the current working directory) is replaced by the collected text.
    """
    # The dict is expected at class level (4 spaces) in family.py, see the
    # substitution regex below; its entries are indented by 8 spaces like
    # the lines created by format_string().
    class_indent = ' ' * 4
    entry_indent = ' ' * 8
    text = ("{}linktrails = {{\n{}'_default': '[a-z]*',\n"
            .format(class_indent, entry_indent))
    try:
        while True:
            text += yield
    except GeneratorExit:
        text += class_indent + '}'
        # write linktrails to family file
        pywikibot.output('Writing family file...')
        family_file_name = join('pywikibot', 'family.py')
        with codecs.open(family_file_name, 'r', 'utf8') as family_file:
            family_text = family_file.read()
        # NOTE(review): text is used as a replacement template here, so
        # backslash escapes within the collected patterns are processed
        # by re.sub - confirm this is intended.
        family_text = re.sub(r'(?msu)^ {4}linktrails.+?}',
                             text, family_text, count=1)
        with codecs.open(family_file_name, 'w', 'utf8') as family_file:
            family_file.write(family_text)


def update_linktrails(family):
    """Update linktrails for given family.

    Every code of ``family.langs`` is sent in sorted order to the
    `update_sites` coroutine, which is closed afterwards.

    :param family: family object providing ``langs`` and ``linktrails``
    """
    with closing(update_sites(family)) as updater:
        for code in sorted(family.langs):
            updater.send(code)


if __name__ == '__main__':
    site = pywikibot.Site('en', 'wikipedia')
    update_linktrails(site.family)