jenkins-bot submitted this change.
[FEAT] Support harvesting time values
Parse the values using wbparsevalue API calls.
Include special handling for dates formatted
using wikilinks.
Bug: T66503
Change-Id: I5f059edddfe276fbad16dbbc7bf967018516bb70
---
M pywikibot/site/_datasite.py
M scripts/harvest_template.py
M tests/harvest_template_tests.py
3 files changed, 100 insertions(+), 7 deletions(-)
diff --git a/pywikibot/site/_datasite.py b/pywikibot/site/_datasite.py
index 2966300..dbe6ee1 100644
--- a/pywikibot/site/_datasite.py
+++ b/pywikibot/site/_datasite.py
@@ -712,6 +712,7 @@
def parsevalue(self, datatype: str, values: List[str],
options: Optional[Dict[str, Any]] = None,
+ language: Optional[str] = None,
validate: bool = False) -> List[Any]:
"""
Send data values to the wikibase parser for interpretation.
@@ -725,19 +726,21 @@
:param values: list of values to be parsed
:param options: any additional options for wikibase parser
(for time, 'precision' should be specified)
+ :param language: code of the language to parse the value in
:param validate: whether parser should provide data validation as well
as parsing
:return: list of parsed values
:raises ValueError: parsing failed due to some invalid input values
"""
params = {
+ 'action': 'wbparsevalue',
'datatype': datatype,
'values': values,
'options': json.dumps(options or {}),
'validate': validate,
- 'uselang': 'en',
+ 'uselang': language or 'en',
}
- req = self.simple_request(action='wbparsevalue', **params)
+ req = self.simple_request(**params)
try:
data = req.submit()
except APIError as e:
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 6dd4677..48f6dc9 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -101,6 +101,7 @@
#
# Distributed under the terms of MIT License.
#
+import re
import signal
import sys
from typing import Any, Iterator, Optional
@@ -108,9 +109,11 @@
import pywikibot
from pywikibot import pagegenerators as pg
from pywikibot import textlib
+from pywikibot import WbTime
from pywikibot.backports import List, Tuple
from pywikibot.bot import ConfigParserBot, OptionHandler, WikidataBot
from pywikibot.exceptions import (
+ APIError,
InvalidPageError,
InvalidTitleError,
NoPageError,
@@ -192,13 +195,15 @@
self.fields[key] = value
else: # backwards compatibility
self.fields[key] = (value, PropertyOptionHandler())
- self.cacheSources()
- # TODO: Make it a list including the redirects to the template
- template_title = template_title.replace('_', ' ')
- self.templateTitles = self.getTemplateSynonyms(template_title)
+ self.template_title = template_title.replace('_', ' ')
self.linkR = textlib.compileLinkR()
self.create_missing_item = self.opt.create
+ def setup(self):
+ """Cache some static data from wikis."""
+ self.cacheSources()
+ self.templateTitles = self.getTemplateSynonyms(self.template_title)
+
def getTemplateSynonyms(self, title) -> List[str]:
"""Fetch redirects of the title, so we can check against them."""
temp = pywikibot.Page(pywikibot.Site(), title, ns=10)
@@ -369,6 +374,7 @@
.. versionadded:: 7.5
"""
+ value = value.replace('{{!}}', '|')
prop, options = self.fields[field]
matched = False
@@ -393,6 +399,61 @@
if linked_item:
yield linked_item
+ def handle_time(self, value: str,
+ site: pywikibot.site.BaseSite,
+ *args) -> Iterator[WbTime]:
+ """Handle 'time' claim type.
+
+ .. versionadded:: 7.5
+ """
+ value = value.replace('{{!}}', '|')
+ value = value.replace('&nbsp;', ' ')
+ value = re.sub('</?sup>', '', value)
+
+ # Some wikis format dates using wikilinks. We construct
+ # all possible texts, e.g., "[[A|B]] of [[C]]" becomes
+ # "A of C" and "B of C", and parse them using the API.
+ # If the result is same for all the values, we import
+ # the value.
+ to_parse = {''}
+ prev_end = 0
+ for match in pywikibot.link_regex.finditer(value):
+ start, end = match.span()
+ since_prev_match = value[prev_end:start]
+
+ title = match.group('title').strip()
+ text = match.group(2)
+ if text:
+ text = text[1:].strip() # remove '|'
+
+ new_to_parse = set()
+ for fragment in to_parse:
+ fragment += since_prev_match
+ new_to_parse.add(fragment + title)
+ if text:
+ new_to_parse.add(fragment + text)
+
+ to_parse = new_to_parse
+ prev_end = end
+
+ rest = value[prev_end:]
+ to_parse = [text + rest for text in to_parse]
+
+ try:
+ result = self.repo.parsevalue('time', to_parse, language=site.lang)
+ except (APIError, ValueError):
+ return
+
+ out = None
+ for data in result:
+ if out is None:
+ out = data
+ elif out != data:
+ pywikibot.output('Found ambiguous date: "{}"'.format(value))
+ return
+
+ yield WbTime.fromWikibase(out, self.repo)
+
@staticmethod
def handle_string(value, *args) -> Iterator[str]:
"""Handle 'string' and 'external-id' claim type.
@@ -496,6 +557,10 @@
'Please specify either -template or -transcludes argument')
return
+ if not fields:
+ pywikibot.error('No template parameters to harvest specified.')
+ return
+
if not gen.gens:
gen.handle_arg('-transcludes:' + template_title)
generator = gen.getCombinedGenerator(preload=True)
diff --git a/tests/harvest_template_tests.py b/tests/harvest_template_tests.py
index 3909727..742b6f2 100644
--- a/tests/harvest_template_tests.py
+++ b/tests/harvest_template_tests.py
@@ -8,7 +8,7 @@
import unittest
from contextlib import suppress
-from pywikibot import ItemPage
+from pywikibot import ItemPage, WbTime
from scripts.harvest_template import HarvestRobot
from tests.aspects import ScriptMainTestCase
@@ -48,6 +48,31 @@
self.assertIsInstance(target, ItemPage)
self.assertEqual(target.title(), item)
+ def test_handle_time(self):
+ """Test handle_time method."""
+ bot = HarvestRobot('Foo', {}, site=self.site)
+
+ day = WbTime(2022, 7, 18, precision=11, site=bot.repo)
+ tests = [
+ ('Foo', None),
+ ('2022', WbTime(2022, 0, 0, precision=9, site=bot.repo)),
+ ('2022-07-18', day),
+ ('18. červenec 2022', day),
+ ('18. července [[2021|2022]]', None),
+ ('[[18. červenec]] 2022', day),
+ ('[[18. červenec|18. července]] [[2022]]', day),
+ ('[[17. červenec|18. července]] [[2022]]', None),
+ ('44 př. n. l.',
+ WbTime(-44, 0, 0, precision=9,
+ calendarmodel='http://www.wikidata.org/entity/Q1985786',
+ site=bot.repo)),
+ ]
+ for text, time in tests:
+ with self.subTest(text=text, time=time):
+ gen = bot.handle_time(text, self.site)
+ out = next(gen, None)
+ self.assertEqual(time, out)
+
if __name__ == '__main__': # pragma: no cover
with suppress(SystemExit):
To view, visit change 814822. To unsubscribe, or for help writing mail filters, visit settings.