jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/579877 )
Change subject: [IMPR] use linktrail via siteinfo ......................................................................
[IMPR] use linktrail via siteinfo
- create linktrail regex from siteinfo['general']['linktrail'] - use tiny cache to cache the result - Previously Site.linktrail() was delegated to family.linktrail() by magic Site.__getattr__(). Deprecate this useless method now. - Also remove obsolete linktrails dict - Add TestLinktrails to site_tests.py - Update DrySite class with a default linktrail - Remove update_linktrails.py maintenance script - update documentation
Change-Id: Ie12ddb65f2ed9a9d520b39a4c372d3ee5d9f6309 --- M .codecov.yml M docs/scripts/maintenance.rst M docs/scripts_ref/scripts.maintenance.rst M pywikibot/family.py M pywikibot/scripts/generate_family_file.py M pywikibot/site/_apisite.py M scripts/README.rst D scripts/maintenance/update_linktrails.py M tests/site_tests.py M tests/utils.py 10 files changed, 117 insertions(+), 404 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/.codecov.yml b/.codecov.yml index 2ba30a4..f1a70ae 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -37,7 +37,6 @@ - scripts/maintenance/make_i18n_dict.py - scripts/maintenance/preload_sites.py - scripts/maintenance/sorting_order.py - - scripts/maintenance/update_linktrails.py - scripts/maintenance/wikimedia_sites.py - scripts/userscripts/ - tests/pwb/ diff --git a/docs/scripts/maintenance.rst b/docs/scripts/maintenance.rst index d096563..6f3ed7d 100644 --- a/docs/scripts/maintenance.rst +++ b/docs/scripts/maintenance.rst @@ -25,12 +25,6 @@ .. automodule:: scripts.maintenance.sorting_order :no-members:
-update_linktrails script description -------------------------------------- - -.. automodule:: scripts.maintenance.update_linktrails - :no-members: - wikimedia_sites script description -----------------------------------
diff --git a/docs/scripts_ref/scripts.maintenance.rst b/docs/scripts_ref/scripts.maintenance.rst index e7bb9c4..c331693 100644 --- a/docs/scripts_ref/scripts.maintenance.rst +++ b/docs/scripts_ref/scripts.maintenance.rst @@ -32,11 +32,6 @@
.. automodule:: scripts.maintenance.sorting_order
-scripts.maintenance.update_linktrails script ---------------------------------------------- - -.. automodule:: scripts.maintenance.update_linktrails - scripts.maintenance.wikimedia_sites script -------------------------------------------
diff --git a/pywikibot/family.py b/pywikibot/family.py index 871a36c..4f80f39 100644 --- a/pywikibot/family.py +++ b/pywikibot/family.py @@ -21,7 +21,7 @@ from pywikibot import config from pywikibot.backports import Dict, List, Set, Tuple # skipcq: PY-W2000 from pywikibot.exceptions import FamilyMaintenanceWarning, UnknownFamilyError -from pywikibot.tools import classproperty, deprecated +from pywikibot.tools import classproperty, deprecated, remove_last_args
logger = logging.getLogger('pywiki.wiki.family') @@ -29,7 +29,7 @@ # Legal characters for Family.name and Family.langs keys NAME_CHARACTERS = string.ascii_letters + string.digits # nds_nl code alias requires "_"n -# dash must be the last char to be reused as regex in update_linktrails +# dash must be the last char to be reused as regex CODE_CHARACTERS = string.ascii_lowercase + string.digits + '_-'
@@ -157,211 +157,6 @@ fyinterwiki.sort(key=lambda x: x.replace('y', 'i') + x.count('y') * '!')
- # Letters that can follow a wikilink and are regarded as part of - # this link. This depends on the linktrail setting in LanguageXx.php - # - # Do not use this dict directly but Site.linktrail or Family.linktrail - # methods instead - linktrails = { - '_default': '[a-z]*', - 'ab': '[a-zабвгӷҕдежзӡикқҟлмнопԥҧрстҭуфхҳцҵчҷҽҿшыҩџьә]*', - 'ady': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', - 'als': '[äöüßa-z]*', - 'alt': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяјҥӧӱ]*', - 'ami': '', - 'an': '[a-záéíóúñ]*', - 'ar': '[a-zء-يؐ-ًؚ-ٰٟۖ-ۜ۟-۪ۤۧۨ-ۭ]*', - 'ary': '[a-zء-يؐ-ًؚ-ٰٟۖ-ۜ۟-۪ۤۧۨ-ۭ]*', - 'arz': '[a-zء-يؐ-ًؚ-ٰٟۖ-ۜ۟-۪ۤۧۨ-ۭ]*', - 'ast': '[a-záéíóúñ]*', - 'atj': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'av': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', - 'avk': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'awa': '[a-zऀ-ॣ०-꣠-ꣿ]*', - 'ay': '[a-záéíóúñ]*', - 'az': '[a-zçəğıöşü]*', - 'azb': '[ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیآأئؤة]*', - 'ba': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяәөүғҡңҙҫһ“»]*', - 'bar': '[äöüßa-z]*', - 'bat-smg': '[a-ząčęėįšųūž]*', - 'be': '[абвгґджзеёжзійклмнопрстуўфхцчшыьэюяćčłńśšŭźža-z]*', - 'be-tarask': '[абвгґджзеёжзійклмнопрстуўфхцчшыьэюяćčłńśšŭźža-z]*', - 'bg': '[a-zабвгдежзийклмнопрстуфхцчшщъыьэюя]*', - 'bm': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'bn': '[ঀ-]*', - 'bpy': '[ঀ-]*', - 'br': "(?:[a-zA-ZàâçéèêîôûäëïöüùñÇÉÂÊÎÔÛÄËÏÖÜÀÈÙÑ]|[cC]['’]h|C['’]H)*", - 'bs': '[a-zćčžšđž]*', - 'bxr': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'ca': "(?:[a-zàèéíòóúç·ïü]|'(?!'))*", - 'cbk-zam': '[a-záéíóúñ]*', - 'ce': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', - 'ckb': '[ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆهھەیێ]*', - 'co': '[a-zàéèíîìóòúù]*', - 'crh': '[a-zâçğıñöşüа-яёʺʹ“»]*', - 'cs': '[a-záčďéěíňóřšťúůýž]*', - 'csb': '[a-zęóąśłżźćńĘÓĄŚŁŻŹĆŃ]*', - 'cu': '[a-zабвгдеєжѕзїіıићклмнопсстѹфхѡѿцчш' - 'щъыьѣюѥѧѩѫѭѯѱѳѷѵґѓђёјйљњќуўџэ҄я“»]*', - 'cv': '[a-zа-яĕçăӳ"»]*', - 'cy': '[àáâèéêìíîïòóôûŵŷa-z]*', - 'da': '[a-zæøå]*', - 'dag': '[ɛɣŋɔʒƐƔŊƆƷa-z]*', - 'de': '[äöüßa-z]*', - 'din': 
'[äëɛɛ̈éɣïŋöɔɔ̈óa-z]*', - 'dsb': '[äöüßa-z]*', - 'el': '[a-zαβγδεζηθικλμνξοπρστυφχψωςΑΒΓΔΕΖΗΘ' - 'ΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώϊϋΐΰΆΈΉΊΌΎΏΪΫ]*', - 'eml': '[a-zàéèíîìóòúù]*', - 'es': '[a-záéíóúñ]*', - 'et': '[äöõšüža-z]*', - 'ext': '[a-záéíóúñ]*', - 'fa': '[ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیآأئؤة]*', - 'ff': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'fi': '[a-zäö]*', - 'fiu-vro': '[äöõšüža-z]*', - 'fo': '[áðíóúýæøa-z]*', - 'fr': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'frp': '[a-zàâçéèêîœôû·’æäåāăëēïīòöōùü‘]*', - 'frr': '[a-zäöüßåāđē]*', - 'fur': '[a-zàéèíîìóòúù]*', - 'fy': '[a-zàáèéìíòóùúâêîôûäëïöü]*', - 'gag': '[a-zÇĞçğİıÖöŞşÜüÂâÎîÛû]*', - 'gan': '', - 'gcr': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'gl': '[áâãàéêẽçíòóôõq̃úüűũa-z]*', - 'glk': '[ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیآأئؤة]*', - 'gn': '[a-záéíóúñ]*', - 'gu': '[-૿]*', - 'guw': '[a-zàáǎèéěìíǐòóǒùúɛ̌ɔɖẹọ]*', - 'he': '[a-zא-ת]*', - 'hi': '[a-zऀ-ॣ०-꣠-ꣿ]*', - 'hr': '[čšžćđßa-z]*', - 'hsb': '[äöüßa-z]*', - 'ht': '[a-zàèòÀÈÒ]*', - 'hu': '[a-záéíóúöüőűÁÉÍÓÚÖÜŐŰ]*', - 'hy': '[a-zաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև«»]*', - 'hyw': '[a-zաբգդեզէըթժիլխծկհձղճմյնշոչպջռսվտրցւփքօֆև«»]*', - 'ii': '', - 'inh': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', - 'is': '[áðéíóúýþæöa-z-–]*', - 'it': '[a-zàéèíîìóòúù]*', - 'ka': '[a-zაბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ“»]*', - 'kaa': "(?:[a-zıʼ’“»]|'(?!'))*", - 'kab': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'kbd': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', - 'kbp': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'kk': '[a-zäçéğıïñöşüýʺʹа-яёәғіқңөұүһٴ' - 'ابپتجحدرزسشعفقكلمنڭەوۇۋۆىيچھ“»]*', - 'kl': '[a-zæøå]*', - 'koi': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'krc': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'ksh': '[äöüėëijßəğåůæœça-z]*', - 'ku': '[a-zçêîşûẍḧÇÊÎŞÛẌḦ]*', - 'kv': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'lad': '[a-záéíóúñ]*', - 'lb': '[äöüßa-z]*', - 'lbe': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ1“»]*', - 'lez': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', - 'li': 
'[a-zäöüïëéèà]*', - 'lij': '[a-zàéèíîìóòúù]*', - 'lld': '[a-zàéèíîìóòúù]*', - 'lmo': '[a-zàéèíîìóòúù]*', - 'ln': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'lrc': '[ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیآأئؤة]*', - 'lt': '[a-ząčęėįšųūž]*', - 'ltg': '[a-zA-ZĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž]*', - 'lv': '[a-zA-ZĀāČčĒēĢģĪīĶķĻļŅņŠšŪūŽž]*', - 'mai': '[a-zऀ-ॣ०-꣠-ꣿ]*', - 'mdf': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'mg': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'mhr': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'mk': '[a-zабвгдѓежзѕијклљмнњопрстќуфхцчџш]*', - 'ml': '[a-zം-ൿ]*', - 'mn': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя“»]*', - 'mr': '[ऀ-ॣॱ-ॿ]*', - 'mrj': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'mwl': '[áâãàéêẽçíòóôõq̃úüűũa-z]*', - 'myv': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'mzn': '[ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیآأئؤة]*', - 'nah': '[a-záéíóúñ]*', - 'nap': '[a-zàéèíîìóòúù]*', - 'nds': '[äöüßa-z]*', - 'nds-nl': '[a-zäöüïëéèà]*', - 'nl': '[a-zäöüïëéèà]*', - 'nn': '[æøåa-z]*', - 'no': '[æøåa-z]*', - 'nrm': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'oc': '[a-zàâçéèêîôû]*', - 'olo': '[a-zčČšŠžŽäÄöÖ]*', - 'or': '[a-z-]*', - 'os': '[a-zаæбвгдеёжзийклмнопрстуфхцчшщъыьэюя“»]*', - 'pa': '[ਁਂਃਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮ' - 'ਯਰਲਲ਼ਵਸ਼ਸਹ਼ਾਿੀੁੂੇੈੋੌ੍ਖ਼ਗ਼ਜ਼ੜਫ਼ੰੱੲੳa-z]*', - 'pcd': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'pdc': '[äöüßa-z]*', - 'pfl': '[äöüßa-z]*', - 'pl': '[a-zęóąśłżźćńĘÓĄŚŁŻŹĆŃ]*', - 'pms': '[a-zàéèíîìóòúù]*', - 'pnt': '[a-zαβγδεζηθικλμνξοπρστυφχψωςΑΒΓΔΕΖΗΘ' - 'ΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώϊϋΐΰΆΈΉΊΌΎΏΪΫ]*', - 'pt': '[áâãàéêẽçíòóôõq̃úüűũa-z]*', - 'pwn': '', - 'qu': '[a-záéíóúñ]*', - 'rmy': '[a-zăâîşţșțĂÂÎŞŢȘȚ]*', - 'ro': '[a-zăâîşţșțĂÂÎŞŢȘȚ]*', - 'roa-rup': '[a-zăâîşţșțĂÂÎŞŢȘȚ]*', - 'roa-tara': '[a-zàéèíîìóòúù]*', - 'ru': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'rue': '[a-zабвгґдеєжзиіїйклмнопрстуфхцчшщьєюяёъы“»]*', - 'sa': '[a-zऀ-ॣ०-꣠-ꣿ]*', - 'sah': '[a-zабвгҕдеёжзийклмнҥоөпрсһтуүфхцчшщъыьэюя]*', - 'scn': '[a-zàéèíîìóòúù]*', - 'se': 
'[a-zàáâçčʒǯđðéèêëǧǥȟíìîïıǩŋñóòôõßšŧúùûýÿüžþæøåäö]*', - 'sg': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'sh': '[a-zčćđžš]*', - 'shi': '[ⴰ-ⵯa-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙḍḥɛṛɣṣṭẓḌḤƐṚƔṢṬẒʷ]*', - 'sk': '[a-záäčďéíľĺňóôŕšťúýž]*', - 'skr': '[آابٻپتٹثجچڄحخدڈݙذرڑزژسشصضطظعغفقکگڳلمنݨوہھیےئأءۃڋڰںؤ]*', - 'sl': '[a-zčćđžš]*', - 'smn': '[a-zâčđŋšžäá]*', - 'sr': '[abvgdđežzijklljmnnjoprstćufhcčdž' - 'šабвгдђежзијклљмнњопрстћуфхцчџш]*', - 'srn': '[a-zäöüïëéèà]*', - 'stq': '[äöüßa-z]*', - 'sv': '[a-zåäöéÅÄÖÉ]*', - 'szl': '[a-zęóąśłżźćńĘÓĄŚŁŻŹĆŃ]*', - 'szy': '', - 'ta': '[-]*', - 'tay': '', - 'te': '[ఁ-౯]*', - 'tet': '[áâãàéêẽçíòóôõq̃úüűũa-z]*', - 'tg': '[a-zабвгдеёжзийклмнопрстуфхчшъэюяғӣқўҳҷцщыь]*', - 'tk': '[a-zÄäÇçĞğŇňÖöŞşÜüÝýŽž]*', - 'tr': '[a-zÇĞçğİıÖöŞşÜüÂâÎîÛû]*', - 'trv': '', - 'tt': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӘәӨөҮүҖҗҢңҺһ]*', - 'ty': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'tyv': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'udm': '[a-zа-яёӝӟӥӧӵ]*', - 'uk': '[a-zабвгґдеєжзиіїйклмнопрстуфхцчшщьєюяёъы“»]*', - 'ur': '[ابپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنںوؤہھیئےآأءۃ]*', - 'uz': '[a-zʻʼ“»]*', - 'vec': '[a-zàéèíîìóòúù]*', - 'vep': '[äöõšüža-z]*', - 'vi': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'vls': '[a-zäöüïëéèà]*', - 'wa': '[a-zåâêîôûçéè]*', - 'wo': '[a-zàâçéèêîôûäëïöüùÇÉÂÊÎÔÛÄËÏÖÜÀÈÙ]*', - 'wuu': '', - 'xal': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюя]*', - 'xmf': '[a-zაბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ“»]*', - 'yi': '[a-zא-ת]*', - 'za': '', - 'zea': '[a-zäöüïëéèà]*', - 'zh': '', - } - # A list of category redirect template names in different languages category_redirect_templates = { '_default': [] @@ -617,20 +412,17 @@ Family._families[fam] = cls return cls
- def linktrail(self, code, fallback: str = '_default'): + @deprecated('APISite.linktrail()', since='7.3.0') + @remove_last_args(['fallback']) + def linktrail(self, code: str) -> str: """Return regex for trailing chars displayed as part of a link.
Returns a string, not a compiled regular expression object. + + .. deprecated:: 7.3 """ - if code in self.linktrails: - return self.linktrails[code] - - if fallback: - return self.linktrails[fallback] - - raise KeyError( - 'ERROR: linktrail in language {language_code} unknown' - .format(language_code=code)) + site = pywikibot.Site(code, 'wikipedia') + return site.linktrail()
def category_redirects(self, code, fallback: str = '_default'): """Return list of category redirect templates.""" diff --git a/pywikibot/scripts/generate_family_file.py b/pywikibot/scripts/generate_family_file.py index 817409c..9b22ee3 100755 --- a/pywikibot/scripts/generate_family_file.py +++ b/pywikibot/scripts/generate_family_file.py @@ -43,7 +43,7 @@ # Legal characters for Family name and Family langs keys NAME_CHARACTERS = string.ascii_letters + string.digits # nds_nl code alias requires "_"n -# dash must be the last char to be reused as regex in update_linktrails +# dash must be the last char to be reused as regex CODE_CHARACTERS = string.ascii_lowercase + string.digits + '_-'
diff --git a/pywikibot/site/_apisite.py b/pywikibot/site/_apisite.py index 98c2c81..1d6e174 100644 --- a/pywikibot/site/_apisite.py +++ b/pywikibot/site/_apisite.py @@ -69,6 +69,7 @@ from pywikibot.site._tokenwallet import TokenWallet from pywikibot.site._upload import Uploader from pywikibot.tools import ( + cached, MediaWikiVersion, deprecated, merge_unique_dicts, @@ -671,6 +672,47 @@ assert '$1' in path, 'articlepath must contain "$1" placeholder' return path.replace('$1', '{}')
+ @cached + def linktrail(self) -> str: + """Build linktrail regex from siteinfo linktrail. + + Letters that can follow a wikilink and are regarded as part of + this link. This depends on the linktrail setting in LanguageXx.php + + .. versionadded:: 7.3 + + :return: The linktrail regex. + """ + unresolved_linktrails = { + 'br': '(?:[a-zA-ZàâçéèêîôûäëïöüùñÇÉÂÊÎÔÛÄËÏÖÜÀÈÙÑ]' + "|[cC]['’]h|C['’]H)*", + 'ca': "(?:[a-zàèéíòóúç·ïü]|'(?!'))*", + 'kaa': "(?:[a-zıʼ’“»]|'(?!'))*", + } + linktrail = self.siteinfo['general']['linktrail'] + if linktrail == '/^()(.*)$/sD': # empty linktrail + return '' + + match = re.search(r'\((?:\:\?|\?\:)?\[(?P<pattern>.+?)\]' + r'(?P<letters>(\|.)*)\)?\+\)', linktrail) + if not match: + with suppress(KeyError): + return unresolved_linktrails[self.code] + raise KeyError( + '"{}": No linktrail pattern extracted from "{}"' + .format(self.code, linktrail)) + + pattern = match.group('pattern') + letters = match.group('letters') + + if r'x{' in pattern: + pattern = re.sub(r'\\x\{([A-F0-9]{4})\}', + lambda match: chr(int(match.group(1), 16)), + pattern) + if letters: + pattern += ''.join(letters.split('|')) + return '[{}]*'.format(pattern) + @staticmethod def assert_valid_iter_params( msg_prefix: str, diff --git a/scripts/README.rst b/scripts/README.rst index 7282cc6..519b755 100644 --- a/scripts/README.rst +++ b/scripts/README.rst @@ -172,8 +172,6 @@ +------------------------+---------------------------------------------------------+ | sorting_order.py | Updates interwiki sorting order in family.py file. | +------------------------+---------------------------------------------------------+ - | update_linktrails.py | Script that updates the linktrails in family.py file. | - +------------------------+---------------------------------------------------------+ | wikimedia_sites.py | Updates the language lists in Wikimedia family files. | +------------------------+---------------------------------------------------------+
diff --git a/scripts/maintenance/update_linktrails.py b/scripts/maintenance/update_linktrails.py deleted file mode 100755 index abfeddd..0000000 --- a/scripts/maintenance/update_linktrails.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/python3 -"""Script that updates the linktrails in family.py file. - -linktrails contains a regex for each site code which holds letters that -can follow a wikilink and are regarded as part of this link. This depends -on the linktrail setting in LanguageXx.php. This maintenance script -retrieves the site settings from wikipedia family and updates the Family -linktrails dict. -""" -# -# (C) Pywikibot team, 2017-2021 -# -# Distributed under the terms of the MIT license. -# - -import codecs -import re -from contextlib import closing -from os.path import join - -import pywikibot -from pywikibot.family import CODE_CHARACTERS -from pywikibot.tools import suppress_warnings - - -def format_string(code: str, pattern: str) -> str: - """Format a single pattern line.""" - fmt = ' ' * 8 + "'{}': {!r}" - code_len = len(code) - pattern_len = len(pattern) - - if pattern_len > 64 - code_len: - index = pattern_len // 2 - result = fmt.format(code, pattern[:index]) + '\n' - result += ' ' * (code_len + 12) + repr(pattern[index:]) - else: - result = fmt.format(code, pattern) - - result += ',\n' - # convert escape sequences of unprintable characters to unicode - result = re.sub(r'\u([a-f0-9]{4})', - lambda match: chr(int(match.group(1), 16)), result) - - return result - - -def coroutine(func): - """Decorator which starts coroutine.""" - def start(*args, **kwargs): - cr = func(*args, **kwargs) - cr.send(None) - return cr - return start - - -@coroutine -def update_sites(fam): - """Process linktrail for a given site code.""" - formatter = update_line() - while True: - code = yield - - with suppress_warnings( - 'Site wikipedia:[{}]+ instantiated using different code' - .format(CODE_CHARACTERS), - category=UserWarning, - filename=r'.+update_linktrails.py'): - 
site = pywikibot.Site(code, 'wikipedia') - - if isinstance(site, pywikibot.site.RemovedSite): - continue - - if site.code != code: - pywikibot.output('"{}" is redirected to "{}"; skipping.' - .format(code, site.code)) - continue - - linktrail = site.siteinfo.get('general', expiry=True)['linktrail'] - oldtrail = fam.linktrails.get(code) - formatter.send((code, oldtrail, linktrail)) - - -@coroutine -def update_line(): - """Format linktrail for family file.""" - writer = update_family_file() - matcher = update_matched_line(writer) - while True: - code, old, linktrail = yield - line = format_string(code, old) if old else '' - - if not linktrail: - writer.send(line) - continue - - if linktrail == '/^()(.*)$/sD': # empty linktrail - line = format_string(code, '') - writer.send(line) - continue - - match = re.search( - r'\((?:\:\?|\?\:)?\[(?P<pattern>.+?)\]' - r'(?P<letters>(\|.)*)\)?\+\)', - linktrail) - - if not match: - pywikibot.output('"{}": No pattern found in "{}"' - .format(code, linktrail)) - writer.send(line) - continue - - matcher.send((code, old, match)) - - -@coroutine -def update_matched_line(writer): - """Update matched linktrail.""" - while True: - code, old, match = yield - pattern = match.group('pattern') - letters = match.group('letters') - if pattern == 'a-z' and not letters: # default - if old: - pywikibot.output('"{}" has default linktrail; ' - 'removing {}'.format(code, old)) - continue - - if r'x{' in pattern: - # replace unicode escape string by corresponding char - pattern = re.sub( - r'\\x\{([A-F0-9]{4})\}', - lambda match: chr(int(match.group(1), 16)), - pattern) - - if letters: - pattern += ''.join(letters.split('|')) - - new = '[{}]*'.format(pattern) - line = format_string(code, new) - writer.send(line) - - -@coroutine -def update_family_file(): - """Collect linktrails and write them to family.py.""" - text = " linktrails = {\n '_default': '[a-z]*',\n" - try: - while True: - text += yield - except GeneratorExit: - text += ' }' - # write linktrails to 
family file - pywikibot.output('Writing family file...') - family_file_name = join('pywikibot', 'family.py') - with codecs.open(family_file_name, 'r', 'utf8') as family_file: - family_text = family_file.read() - family_text = re.sub(r'(?ms)^ {4}linktrails.+?}', - text, family_text, 1) - with codecs.open(family_file_name, 'w', 'utf8') as family_file: - family_file.write(family_text) - - -def update_linktrails(family): - """Update linktrails for given family.""" - with closing(update_sites(family)) as updater: - for code in sorted(family.langs): - updater.send(code) - - -if __name__ == '__main__': - site = pywikibot.Site('en', 'wikipedia') - update_linktrails(site.family) diff --git a/tests/site_tests.py b/tests/site_tests.py index 2c0f0e3..4698341 100755 --- a/tests/site_tests.py +++ b/tests/site_tests.py @@ -3260,6 +3260,67 @@ self.assertFalse(site.sametitle('Invalid:Foo', 'Invalid:foo'))
+class TestLinktrails(TestCase): + + """Test linktrail method.""" + + family = 'wikipedia' + code = 'test' + + def test_has_linktrail(self): + """Verify that every code has a linktrail. + + Test all smallest wikis and the others randomly. + """ + size = 20 + small_wikis = self.site.family.languages_by_size[-size:] + great_wikis = self.site.family.languages_by_size[:-size] + random.shuffle(great_wikis) + great_wikis = great_wikis[:size] + for code in sorted(small_wikis + great_wikis): + site = pywikibot.Site(code, self.family) + with self.subTest(site=site): + self.assertIsInstance(site.linktrail(), str) + + def test_linktrails(self): + """Test special linktrails. + + This is a subset of the old `family.linktrails` dict. + """ + linktrails = { + 'ami': '', + 'bug': '[a-z]*', + 'ca': "(?:[a-zàèéíòóúç·ïü]|'(?!'))*", + 'da': '[a-zæøå]*', + 'ext': '[a-záéíóúñ]*', + 'fa': '[ابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیآأئؤة]*', + 'gu': '[-૿]*', + 'he': '[a-zא-ת]*', + 'ii': '', + 'jv': '[a-z]*', + 'kaa': "(?:[a-zıʼ’“»]|'(?!'))*", + 'lez': '[a-zабвгдеёжзийклмнопрстуфхцчшщъыьэюяӀ]*', + 'mai': '[a-zऀ-ॣ०-꣠-ꣿ]*', + 'nds-nl': '[a-zäöüïëéèà]*', + 'or': '[a-z-]*', + 'pt': '[áâãàéêẽçíòóôõq̃úüűũa-z]*', + 'qu': '[a-záéíóúñ]*', + 'roa-rup': '[a-zăâîşţșțĂÂÎŞŢȘȚ]*', + 'sa': '[a-zऀ-ॣ०-꣠-ꣿ]*', + 'te': '[ఁ-౯]*', + 'uz': '[a-zʻʼ“»]*', + 'vec': '[a-zàéèíîìóòúù]*', + 'wuu': '', + 'xmf': '[a-zაბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ“»]*', + 'yi': '[a-zא-ת]*', + 'zh-cn': '' + } + for code, linktrail in linktrails.items(): + site = pywikibot.Site(code, self.family) + with self.subTest(site=site): + self.assertEqual(site.linktrail(), linktrail) + + class TestObsoleteSite(DefaultSiteTestCase):
"""Test 'closed' and obsolete code sites.""" diff --git a/tests/utils.py b/tests/utils.py index 35f3372..160d703 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -342,6 +342,10 @@ author_ns, 'Author', case=self.siteinfo['case']) return ns_dict
+ def linktrail(self): + """Return default linkrail.""" + return '[a-z]*' + @property def userinfo(self): """Return dry data."""
pywikibot-commits@lists.wikimedia.org