jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/871240 )
Change subject: [FEAT] Add parser for <pages /> tag ......................................................................
[FEAT] Add parser for <pages /> tag
Add parser for <pages /> tag, defined in ProofreadPage Extension. This is used for Page transclusion in Wikisource.
See: https://www.mediawiki.org/wiki/Help:Extension:ProofreadPage/Pages_tag
Change-Id: I62f67ba7e77d3bc1322456be47164ef449f3e03f --- M tests/proofreadpage_tests.py M pywikibot/proofreadpage.py 2 files changed, 357 insertions(+), 1 deletion(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index 56181aa..eaec537 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -26,6 +26,7 @@ # # Distributed under the terms of the MIT license. # +import collections.abc import json import re import time @@ -33,6 +34,7 @@ from http import HTTPStatus from typing import Any, Optional, Union from urllib.parse import unquote +from weakref import WeakKeyDictionary
from requests.exceptions import ReadTimeout
@@ -46,6 +48,7 @@ Sequence, Set, Tuple, + pairwise, ) from pywikibot.comms import http from pywikibot.data.api import ListGenerator, Request @@ -76,6 +79,248 @@ _IndexType = Tuple[Optional['IndexPage'], List['IndexPage']]
+class TagAttr: + """Tag attribute of <pages />. + + Represent a single attribute. + It is used internally in PagesTagParser() and shall not be used + stand-alone. + + It manages string formatting output and conversion str <--> int and quotes. + Input value can only be srt or int and shall have quotes or nothing. + + >>> a = TagAttr('to', 3.0) + Traceback (most recent call last): + ... + TypeError: value=3.0 must be str or int. + + >>> a = TagAttr('to', 'A123"') + Traceback (most recent call last): + ... + ValueError: value=A123" has wrong quotes. + + >>> a = TagAttr('to', 3) + >>> a + TagAttr('to', 3) + >>> str(a) + 'to=3' + >>> a.attr + 'to' + >>> a.value + 3 + + >>> a = TagAttr('to', '3') + >>> a + TagAttr('to', '3') + >>> str(a) + 'to=3' + >>> a.attr + 'to' + >>> a.value + 3 + + >>> a = TagAttr('to', '"3"') + >>> a + TagAttr('to', '"3"') + >>> str(a) + 'to="3"' + >>> a.value + 3 + + >>> a = TagAttr('to', "'3'") + >>> a + TagAttr('to', "'3'") + >>> str(a) + "to='3'" + >>> a.value + 3 + + >>> a = TagAttr('to', 'A123') + >>> a + TagAttr('to', 'A123') + >>> str(a) + 'to=A123' + >>> a.value + 'A123' + """ + + def __init__(self, attr, value): + """Initializer.""" + self.attr = attr + self._value = self._convert(value) + + def _convert(self, value): + """Handle conversion from str to int and quotes.""" + if not isinstance(value, (str, int)): + raise TypeError(f'value={value} must be str or int.') + + self._orig_value = value + + if isinstance(value, str): + if (value.startswith('"') != value.endswith('"') + or value.startswith("'") != value.endswith("'")): + raise ValueError(f'value={value} has wrong quotes.') + value = value.strip('"'') + value = int(value) if value.isdigit() else value + + return value + + @property + def value(self): + """Attribute value.""" + return self._value + + @value.setter + def value(self, value): + self._value = self._convert(value) + + def __str__(self): + attr = 'from' if self.attr == 'ffrom' else self.attr + return 
f'{attr}={self._orig_value}' + + def __repr__(self): + attr = 'from' if self.attr == 'ffrom' else self.attr + return f"{self.__class__.__name__}('{attr}', {repr(self._orig_value)})" + + +class TagAttrDesc: + """A descriptor tag.""" + + def __init__(self): + """Initializer.""" + self.attrs = WeakKeyDictionary() + + def __set_name__(self, owner, name): + self.public_name = name + + def __get__(self, obj, objtype=None): + attr = self.attrs.get(obj) + return attr.value if attr is not None else None + + def __set__(self, obj, value): + attr = self.attrs.get(obj) + if attr is not None: + attr.value = value + else: + self.attrs[obj] = TagAttr(self.public_name, value) + + def __delete__(self, obj): + self.attrs.pop(obj, None) + + +class PagesTagParser(collections.abc.Container): + """Parser for tag <pages />. + + See https://www.mediawiki.org/wiki/Help:Extension:ProofreadPage/Pages_tag + + Parse text and extract the first <pages ... /> tag. + Individual attributes will be accessible with dot notation. + + >>> tp = PagesTagParser( + ... 'Text: <pages index="Index.pdf" from="first" to="last" />') + >>> tp + PagesTagParser('<pages index="Index.pdf" from="first" to="last" />') + + Atttributes can be modified via dot notation. + If an attribute is a number, it is converted to int. + Note: 'from' is represented as 'ffrom' due to conflict with keyword. + >>> tp.ffrom = 1; tp.to = '"3"' + >>> tp.ffrom + 1 + >>> tp.to + 3 + + Quotes are stripped in the value and added back in the str representation. + Note that quotes are not mandatory. + >>> tp + PagesTagParser('<pages index="Index.pdf" from=1 to="3" />') + + Atttributes can be added via dot notation. + Order is fixed (same order as attribute definition in the class). + >>> tp.fromsection = '"A"' + >>> tp.fromsection + 'A' + >>> tp + PagesTagParser('<pages index="Index.pdf" from=1 to="3" fromsection="A" />') + + Atttributes can be deleted. 
+ >>> del tp.fromsection + >>> tp + PagesTagParser('<pages index="Index.pdf" from=1 to="3" />') + + Attribute presence can be checked. + >>> 'to' in tp + True + + >>> 'step' in tp + False + """ + + pat_tag = re.compile(r'<pages (?P<attrs>[^/]*?)/>') + tokens = ( + 'index', + 'from', + 'to', + 'include', + 'exclude', + 'step', + 'header', + 'tosection', + 'fromsection', + 'onlysection', + ) + tokens = '(' + '=|'.join(tokens) + '=)' + pat_attr = re.compile(tokens) + + index = TagAttrDesc() + ffrom = TagAttrDesc() + to = TagAttrDesc() + include = TagAttrDesc() + exclude = TagAttrDesc() + step = TagAttrDesc() + header = TagAttrDesc() + tosection = TagAttrDesc() + fromsection = TagAttrDesc() + onlysection = TagAttrDesc() + + def __init__(self, text): + """Initializer.""" + m = self.pat_tag.search(text) + if m is None: + raise ValueError(f'Invalid text={text}') + + tag = m['attrs'] + matches = list(self.pat_attr.finditer(tag)) + positions = [m.span()[0] for m in matches] + [len(tag)] + + for begin, end in pairwise(positions): + attribute = tag[begin:end - 1] + attr, _, value = attribute.partition('=') + if attr == 'from': + attr = 'f' + attr + setattr(self, attr, value) + + @classmethod + def get_descriptors(cls): + """Get TagAttrDesc descriptors.""" + res = {k: v for k, v in cls.__dict__.items() + if isinstance(v, TagAttrDesc)} + return res + + def __contains__(self, attr): + return getattr(self, attr) is not None + + def __str__(self): + descriptors = self.get_descriptors().items() + attrs = [v.attrs.get(self) for k, v in descriptors + if v.attrs.get(self) is not None] + attrs = ' '.join(str(attr) for attr in attrs) + return f'<pages {attrs} />' if attrs else '<pages />' + + def __repr__(self): + return f"{self.__class__.__name__}('{self}')" + + def decompose(fn: Callable) -> Callable: # type: ignore """Decorator for ProofreadPage.
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index 47b6965..44d6ef0 100755 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -13,7 +13,12 @@ import pywikibot from pywikibot.data import api from pywikibot.exceptions import UnknownExtensionError -from pywikibot.proofreadpage import IndexPage, ProofreadPage +from pywikibot.proofreadpage import ( + IndexPage, + PagesTagParser, + ProofreadPage, + TagAttr, +) from pywikibot.tools import has_module from tests import unittest_print from tests.aspects import TestCase, require_modules @@ -24,6 +29,97 @@ from tests.utils import skipping
+class TestPagesTagParser(TestCase): + """Test TagAttr class.""" + + net = False + + def test_tag_attr_int(self): + """Test TagAttr for int values.""" + attr = TagAttr('to', 3) + self.assertEqual(repr(attr), "TagAttr('to', 3)") + self.assertEqual(str(attr), 'to=3') + self.assertEqual(attr.attr, 'to') + self.assertEqual(attr.value, 3) + + def test_tag_attr_srt_int(self): + """Test TagAttr for str values that can be converted to int.""" + attr = TagAttr('to', '3') + self.assertEqual(repr(attr), "TagAttr('to', '3')") + self.assertEqual(str(attr), 'to=3') + self.assertEqual(attr.attr, 'to') + self.assertEqual(attr.value, 3) + + attr.value = '"3"' + self.assertEqual(str(attr), 'to="3"') + self.assertEqual(repr(attr), """TagAttr('to', '"3"')""") + self.assertEqual(attr.value, 3) + + def test_tag_attr_str(self): + """Test TagAttr for str value.""" + attr = TagAttr('fromsection', 'A123') + self.assertEqual(repr(attr), "TagAttr('fromsection', 'A123')") + self.assertEqual(str(attr), 'fromsection=A123') + self.assertEqual(attr.attr, 'fromsection') + self.assertEqual(attr.value, 'A123') + + attr.value = '"A123"' + self.assertEqual(repr(attr), """TagAttr('fromsection', '"A123"')""") + self.assertEqual(str(attr), 'fromsection="A123"') + self.assertEqual(attr.value, 'A123') + + attr.value = "'A123'" + self.assertEqual(repr(attr), """TagAttr('fromsection', "'A123'")""") + self.assertEqual(str(attr), "fromsection='A123'") + self.assertEqual(attr.value, 'A123') + + def test_tag_attr_exceptions(self): + """Test TagAttr for Exceptions.""" + self.assertRaises(ValueError, TagAttr, 'fromsection', 'A123"') + self.assertRaises(TypeError, TagAttr, 'fromsection', 3.0) + + def test_pages_tag_parser(self): + """Test PagesTagParser.""" + tp = PagesTagParser('Text: <pages />') + self.assertEqual(repr(tp), "PagesTagParser('<pages />')") + + text = 'Text: <pages from="first" to="last" />' + tp = PagesTagParser(text) + self.assertEqual( + repr(tp), """PagesTagParser('<pages from="first" to="last" 
/>')""") + self.assertEqual(tp.ffrom, 'first') + self.assertEqual(tp.to, 'last') + + tp.index = '"Index.pdf"' + self.assertEqual(tp.index, 'Index.pdf') + + tp.ffrom, tp.to = 1, '"3"' + self.assertEqual(tp.ffrom, 1) + self.assertEqual(tp.to, 3) + self.assertEqual(str(tp), '<pages index="Index.pdf" from=1 to="3" />') + + del tp.index + self.assertNotIn('index', tp) + + tp.to = "'3'" + self.assertEqual(str(tp), """<pages from=1 to='3' />""") + + tp.step = 3 + self.assertEqual(str(tp), """<pages from=1 to='3' step=3 />""") + self.assertIn('step', tp) + + def test_pages_tag_parser_exceptions(self): + """Test PagesTagParser Exceptions.""" + text = """Text: <pages index="Index.pdf />""" + self.assertRaises(ValueError, PagesTagParser, text) + + text = """Text: <pages index="Index.pdf' />""" + self.assertRaises(ValueError, PagesTagParser, text) + + text = """Text: <pages index="Index.pdf from=C" />""" + self.assertRaises(ValueError, PagesTagParser, text) + + class TestProofreadPageInvalidSite(TestCase):
"""Test ProofreadPage class."""
pywikibot-commits@lists.wikimedia.org