jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/814822 )
Change subject: [FEAT] Support harvesting time values
[FEAT] Support harvesting time values
Parse the values using wbparsevalue API calls. Include special handling for dates formatted using wikilinks.
Bug: T66503 Change-Id: I5f059edddfe276fbad16dbbc7bf967018516bb70 --- M pywikibot/site/_datasite.py M scripts/harvest_template.py M tests/harvest_template_tests.py 3 files changed, 100 insertions(+), 7 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/site/_datasite.py b/pywikibot/site/_datasite.py index 2966300..dbe6ee1 100644 --- a/pywikibot/site/_datasite.py +++ b/pywikibot/site/_datasite.py @@ -712,6 +712,7 @@
def parsevalue(self, datatype: str, values: List[str], options: Optional[Dict[str, Any]] = None, + language: Optional[str] = None, validate: bool = False) -> List[Any]: """ Send data values to the wikibase parser for interpretation. @@ -725,19 +726,21 @@ :param values: list of values to be parsed :param options: any additional options for wikibase parser (for time, 'precision' should be specified) + :param language: code of the language to parse the value in :param validate: whether parser should provide data validation as well as parsing :return: list of parsed values :raises ValueError: parsing failed due to some invalid input values """ params = { + 'action': 'wbparsevalue', 'datatype': datatype, 'values': values, 'options': json.dumps(options or {}), 'validate': validate, - 'uselang': 'en', + 'uselang': language or 'en', } - req = self.simple_request(action='wbparsevalue', **params) + req = self.simple_request(**params) try: data = req.submit() except APIError as e: diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py index 6dd4677..48f6dc9 100755 --- a/scripts/harvest_template.py +++ b/scripts/harvest_template.py @@ -101,6 +101,7 @@ # # Distributed under the terms of MIT License. 
# +import re import signal import sys from typing import Any, Iterator, Optional @@ -108,9 +109,11 @@ import pywikibot from pywikibot import pagegenerators as pg from pywikibot import textlib +from pywikibot import WbTime from pywikibot.backports import List, Tuple from pywikibot.bot import ConfigParserBot, OptionHandler, WikidataBot from pywikibot.exceptions import ( + APIError, InvalidPageError, InvalidTitleError, NoPageError, @@ -192,13 +195,15 @@ self.fields[key] = value else: # backwards compatibility self.fields[key] = (value, PropertyOptionHandler()) - self.cacheSources() - # TODO: Make it a list including the redirects to the template - template_title = template_title.replace('_', ' ') - self.templateTitles = self.getTemplateSynonyms(template_title) + self.template_title = template_title.replace('_', ' ') self.linkR = textlib.compileLinkR() self.create_missing_item = self.opt.create
+ def setup(self): + """Cache some static data from wikis.""" + self.cacheSources() + self.templateTitles = self.getTemplateSynonyms(self.template_title) + def getTemplateSynonyms(self, title) -> List[str]: """Fetch redirects of the title, so we can check against them.""" temp = pywikibot.Page(pywikibot.Site(), title, ns=10) @@ -369,6 +374,7 @@
.. versionadded:: 7.5 """ + value = value.replace('{{!}}', '|') prop, options = self.fields[field] matched = False
@@ -393,6 +399,61 @@ if linked_item: yield linked_item
+ def handle_time(self, value: str, + site: pywikibot.site.BaseSite, + *args) -> Iterator[WbTime]: + """Handle 'time' claim type. + + .. versionadded:: 7.5 + """ + value = value.replace('{{!}}', '|') + value = value.replace('&nbsp;', ' ') + value = re.sub('</?sup>', '', value) + + # Some wikis format dates using wikilinks. We construct + # all possible texts, e.g., "[[A|B]] of [[C]]" becomes + # "A of C" and "B of C", and parse them using the API. + # If the result is same for all the values, we import + # the value. + to_parse = {''} + prev_end = 0 + for match in pywikibot.link_regex.finditer(value): + start, end = match.span() + since_prev_match = value[prev_end:start] + + title = match.group('title').strip() + text = match.group(2) + if text: + text = text[1:].strip() # remove '|' + + new_to_parse = set() + for fragment in to_parse: + fragment += since_prev_match + new_to_parse.add(fragment + title) + if text: + new_to_parse.add(fragment + text) + + to_parse = new_to_parse + prev_end = end + + rest = value[prev_end:] + to_parse = [text + rest for text in to_parse] + + try: + result = self.repo.parsevalue('time', to_parse, language=site.lang) + except (APIError, ValueError): + return + + out = None + for data in result: + if out is None: + out = data + elif out != data: + pywikibot.output('Found ambiguous date: "{}"'.format(value)) + return + + yield WbTime.fromWikibase(out, self.repo) + @staticmethod def handle_string(value, *args) -> Iterator[str]: """Handle 'string' and 'external-id' claim type. @@ -496,6 +557,10 @@ 'Please specify either -template or -transcludes argument') return
+ if not fields: + pywikibot.error('No template parameters to harvest specified.') + return + if not gen.gens: gen.handle_arg('-transcludes:' + template_title) generator = gen.getCombinedGenerator(preload=True) diff --git a/tests/harvest_template_tests.py b/tests/harvest_template_tests.py index 3909727..742b6f2 100644 --- a/tests/harvest_template_tests.py +++ b/tests/harvest_template_tests.py @@ -8,7 +8,7 @@ import unittest from contextlib import suppress
-from pywikibot import ItemPage +from pywikibot import ItemPage, WbTime from scripts.harvest_template import HarvestRobot
from tests.aspects import ScriptMainTestCase @@ -48,6 +48,31 @@ self.assertIsInstance(target, ItemPage) self.assertEqual(target.title(), item)
+ def test_handle_time(self): + """Test handle_time method.""" + bot = HarvestRobot('Foo', {}, site=self.site) + + day = WbTime(2022, 7, 18, precision=11, site=bot.repo) + tests = [ + ('Foo', None), + ('2022', WbTime(2022, 0, 0, precision=9, site=bot.repo)), + ('2022-07-18', day), + ('18. červenec 2022', day), + ('18. července [[2021|2022]]', None), + ('[[18. červenec]] 2022', day), + ('[[18. červenec|18. července]] [[2022]]', day), + ('[[17. červenec|18. července]] [[2022]]', None), + ('44 př. n. l.', + WbTime(-44, 0, 0, precision=9, + calendarmodel='http://www.wikidata.org/entity/Q1985786', + site=bot.repo)), + ] + for text, time in tests: + with self.subTest(text=text, time=time): + gen = bot.handle_time(text, self.site) + out = next(gen, None) + self.assertEqual(time, out) +
if __name__ == '__main__': # pragma: no cover with suppress(SystemExit):