jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/482496 )
Change subject: api.py: check if QueryGenerator supports namespaces
......................................................................
api.py: check if QueryGenerator supports namespaces
Check if 'query+module' supports the namespace parameter and set
namespace filtering accordingly in pagegenerators.
support_namespace() has been added for this purpose.
Note:
- issue FutureWarning to alert that set_namespace() will soon raise
a TypeError (and support_namespace() will then be removed).
- this will be a breaking change, so time is given to fix third-party code.
- a TODO is left in the code, so the planned future evolution is also clear.
set_namespace() now returns False when the module does not support a
namespace parameter, and None otherwise (it makes tests easier).
This should not be an issue, as until now nothing was returned anyhow.
Tests added for set_namespace() and support_namespace().
Bug: T198452
Change-Id: If612392ec122ec67d9a7897ec17ae8fb356fa56d
---
M pywikibot/data/api.py
M pywikibot/pagegenerators.py
M tests/api_tests.py
M tests/utils.py
4 files changed, 124 insertions(+), 13 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/data/api.py b/pywikibot/data/api.py
index 1b86cf5..16ded04 100644
--- a/pywikibot/data/api.py
+++ b/pywikibot/data/api.py
@@ -2811,6 +2811,21 @@
self.api_limit),
_logger)
+ def support_namespace(self):
+ """Check if namespace is a supported parameter on this query.
+
+ Note: this function will be removed when self.set_namespace() will
+ throw TypeError() instead of just giving a warning.
+ See T196619.
+
+ @return: True if yes, False otherwise
+ @rtype: bool
+ """
+ assert(self.limited_module) # some modules do not have a prefix
+ return bool(
+ self.site._paraminfo.parameter('query+' + self.limited_module,
+ 'namespace'))
+
def set_namespace(self, namespaces):
"""Set a namespace filter on this query.
@@ -2820,9 +2835,12 @@
list of namespace identifiers. An empty iterator clears any
namespace restriction.
@raises KeyError: a namespace identifier was not resolved
- @raises TypeError: a namespace identifier has an inappropriate
- type such as NoneType or bool, or more than one namespace
- if the API module does not support multiple namespaces
+
+ # TODO: T196619
+ # @raises TypeError: module does not support a namespace parameter
+ # or a namespace identifier has an inappropriate
+ # type such as NoneType or bool, or more than one namespace
+ # if the API module does not support multiple namespaces
"""
assert(self.limited_module) # some modules do not have a prefix
param = self.site._paraminfo.parameter('query+' + self.limited_module,
@@ -2830,7 +2848,16 @@
if not param:
pywikibot.warning('{0} module does not support a namespace '
'parameter'.format(self.limited_module))
- return
+ warn('set_namespace() will be modified to raise TypeError '
+ 'when namespace parameter is not supported. '
+ 'It will be a Breaking Change, please update your code '
+ 'ASAP, due date July, 31st 2019.', FutureWarning, 2)
+
+ # TODO: T196619
+ # raise TypeError('{0} module does not support a namespace '
+ # 'parameter'.format(self.limited_module))
+
+ return False
if isinstance(namespaces, basestring):
namespaces = namespaces.split('|')
@@ -2852,6 +2879,8 @@
elif self.prefix + 'namespace' in self.request:
del self.request[self.prefix + 'namespace']
+ return None
+
def _query_continue(self):
if all(key not in self.data[self.continue_name]
for key in self.continuekey):
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 9555c39..e24b626 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -14,7 +14,7 @@
&params;
"""
#
-# (C) Pywikibot team, 2008-2018
+# (C) Pywikibot team, 2008-2019
#
# Distributed under the terms of the MIT license.
#
@@ -504,8 +504,13 @@
for i in range(len(self.gens)):
if isinstance(self.gens[i], pywikibot.data.api.QueryGenerator):
- if self.namespaces:
- self.gens[i].set_namespace(self.namespaces)
+ if (self.namespaces
+ and self.gens[i].support_namespace()):
+ self.gens[i].set_namespace(self.namespaces)
+ # QueryGenerator does not support namespace param.
+ else:
+ self.gens[i] = NamespaceFilterPageGenerator(
+ self.gens[i], self.namespaces, self.site)
if self.limit:
self.gens[i].set_maximum_items(self.limit)
else:
@@ -1013,8 +1018,7 @@
def _handle_unconnectedpages(self, value):
"""Handle `-unconnectedpages` argument."""
- # T196619 don't use QueryGenerator due to namespace filtering
- return (p for p in self.site.unconnected_pages(total=_int_none(value)))
+ return self.site.unconnected_pages(total=_int_none(value))
def _handle_imagesused(self, value):
"""Handle `-imagesused` argument."""
diff --git a/tests/api_tests.py b/tests/api_tests.py
index 76d5fc3..e4d71c1 100644
--- a/tests/api_tests.py
+++ b/tests/api_tests.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""API test module."""
#
-# (C) Pywikibot team, 2007-2018
+# (C) Pywikibot team, 2007-2019
#
# Distributed under the terms of the MIT license.
#
@@ -852,6 +852,84 @@
self.assertEqual(len(links), count)
+class TestDryQueryGeneratorNamespaceParam(TestCase):
+
+ """Test setting of namespace param with ListGenerator.
+
+ Generators with different characteristics are used.
+ site._paraminfo is not always faithful to API, but serves the purpose
+ here.
+ """
+
+ family = 'wikipedia'
+ code = 'en'
+
+ dry = True
+
+ def setUp(self):
+ """Set up test case."""
+ super(TestDryQueryGeneratorNamespaceParam, self).setUp()
+ self.site = self.get_site()
+ self.site._paraminfo['query+querypage'] = {
+ 'prefix': 'qp',
+ 'limit': {'max': 10},
+ }
+ self.site._paraminfo['query+allpages'] = {
+ 'prefix': 'ap',
+ 'limit': {'max': 10},
+ 'namespace': {'multi': True}
+ }
+ self.site._paraminfo['query+alllinks'] = {
+ 'prefix': 'al',
+ 'limit': {'max': 10},
+ 'namespace': {'default': 0}
+ }
+ self.site._paraminfo['query+links'] = {
+ 'prefix': 'pl',
+ }
+ self.site._paraminfo.query_modules_with_limits = {'querypage',
+ 'allpages',
+ 'alllinks'}
+
+ def test_namespace_for_module_with_no_limit(self):
+ """Test PageGenerator set_namespace."""
+ self.gen = api.PageGenerator(site=self.site,
+ generator='links',
+ parameters={'titles': 'test'})
+ self.assertRaises(AssertionError, self.gen.set_namespace, 0)
+ self.assertRaises(AssertionError, self.gen.set_namespace, 1)
+ self.assertRaises(AssertionError, self.gen.set_namespace, None)
+
+ def test_namespace_param_is_not_settable(self):
+ """Test ListGenerator support_namespace."""
+ self.gen = api.ListGenerator(listaction='querypage', site=self.site)
+ self.assertFalse(self.gen.support_namespace())
+ self.assertFalse(self.gen.set_namespace([0, 1]))
+
+ def test_namespace_none(self):
+ """Test ListGenerator set_namespace with None."""
+ self.gen = api.ListGenerator(listaction='alllinks', site=self.site)
+ self.assertRaises(TypeError, self.gen.set_namespace, None)
+
+ def test_namespace_non_multi(self):
+ """Test ListGenerator set_namespace when non multi."""
+ self.gen = api.ListGenerator(listaction='alllinks', site=self.site)
+ self.assertRaises(TypeError, self.gen.set_namespace, [0, 1])
+ self.assertIsNone(self.gen.set_namespace(0))
+
+ def test_namespace_multi(self):
+ """Test ListGenerator set_namespace when multi."""
+ self.gen = api.ListGenerator(listaction='allpages', site=self.site)
+ self.assertTrue(self.gen.support_namespace())
+ self.assertIsNone(self.gen.set_namespace([0, 1]))
+
+ def test_namespace_resolve_failed(self):
+ """Test ListGenerator set_namespace when resolve fails."""
+ self.gen = api.ListGenerator(listaction='allpages', site=self.site)
+ self.assertTrue(self.gen.support_namespace())
+ self.assertRaises(KeyError, self.gen.set_namespace, 10000)
+
+
class TestDryListGenerator(TestCase):
"""Test ListGenerator."""
@@ -879,7 +957,7 @@
def test_namespace_zero(self):
"""Test ListGenerator set_namespace with 0."""
- self.gen.set_namespace(0)
+ self.assertIsNone(self.gen.set_namespace(0))
class TestCachedRequest(DefaultSiteTestCase):
diff --git a/tests/utils.py b/tests/utils.py
index c0ce35b..5819f8d 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""Test utilities."""
#
-# (C) Pywikibot team, 2013-2018
+# (C) Pywikibot team, 2013-2019
#
# Distributed under the terms of the MIT license.
#
@@ -317,7 +317,7 @@
def parameter(self, module, param_name):
"""Load dry data."""
- return self[module][param_name]
+ return self[module].get(param_name)
def __getitem__(self, name):
"""Return dry data or a dummy parameter block."""
--
To view, visit https://gerrit.wikimedia.org/r/482496
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: If612392ec122ec67d9a7897ec17ae8fb356fa56d
Gerrit-Change-Number: 482496
Gerrit-PatchSet: 11
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Framawiki <framawiki(a)tools.wmflabs.org>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Multichill <maarten(a)mdammers.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: Zhuyifei1999 <zhuyifei1999(a)gmail.com>
Gerrit-Reviewer: jenkins-bot (75)
jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/371586 )
Change subject: Harvest multiple values from one parameter
......................................................................
Harvest multiple values from one parameter
The new modifier is '-multi'.
Doesn't work with claim types besides wikibase-item.
Bug: T87689
Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0
---
M scripts/harvest_template.py
1 file changed, 50 insertions(+), 17 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/harvest_template.py b/scripts/harvest_template.py
index 6c6fd07..80718d0 100755
--- a/scripts/harvest_template.py
+++ b/scripts/harvest_template.py
@@ -40,6 +40,8 @@
has the imported property with the imported value and
some qualifiers.
+-multi If set, try to match multiple values from parameter.
+
Examples:
python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
@@ -75,10 +77,18 @@
page won't be skipped if the item already has that property but there is
not the new value.
+ python pwb.py harvest_template -lang:en -family:wikipedia -namespace:0 \
+ -template:"Infobox musical artist" current_members P527 -exists:p \
+ -multi
+
+ will import band members from the "current_members" parameter of "Infobox
+ musical artist" on English Wikipedia as Wikidata property "P527" (has
+ part). This will only extract multiple band members if each is linked, and
+ will not add duplicate claims for the same member.
"""
#
# (C) Multichill, Amir, 2013
-# (C) Pywikibot team, 2013-2018
+# (C) Pywikibot team, 2013-2019
#
# Distributed under the terms of MIT License.
#
@@ -114,8 +124,9 @@
"""Class holding options for a param-property pair."""
availableOptions = {
- 'islink': False,
'exists': '',
+ 'islink': False,
+ 'multi': False,
}
@@ -140,12 +151,16 @@
@keyword exists: pattern for merging existing claims with harvested
values
@type exists: str
+ @keyword multi: Whether multiple values should be extracted from a
+ single parameter
+ @type multi: bool
"""
self.availableOptions.update({
'always': True,
'create': False,
'exists': '',
'islink': False,
+ 'multi': False,
})
super(HarvestRobot, self).__init__(**kwargs)
self.generator = generator
@@ -261,22 +276,42 @@
# This field contains something useful for us
prop, options = self.fields[field]
claim = pywikibot.Claim(self.repo, prop)
+ exists_arg = self._get_option_with_fallback(options, 'exists')
if claim.type == 'wikibase-item':
+ do_multi = self._get_option_with_fallback(
+ options, 'multi')
+ matched = False
# Try to extract a valid page
- match = pywikibot.link_regex.search(value)
- if match:
+ for match in pywikibot.link_regex.finditer(value):
+ matched = True
link_text = match.group(1)
- else:
- if self._get_option_with_fallback(options, 'islink'):
- link_text = value
- else:
- pywikibot.output(
- '{} field {} value {} is not a wikilink. '
- 'Skipping.'
- .format(claim.getID(), field, value))
- continue
+ linked_item = self._template_link_target(
+ item, link_text)
+ added = False
+ if linked_item:
+ claim.setTarget(linked_item)
+ added = self.user_add_claim_unless_exists(
+ item, claim, exists_arg, page.site,
+ pywikibot.output)
+ claim = pywikibot.Claim(self.repo, prop)
+ # stop after the first match if not supposed to add
+ # multiple values
+ if not do_multi:
+ break
+ # update exists_arg, so we can add more values
+ if 'p' not in exists_arg and added:
+ exists_arg += 'p'
- linked_item = self._template_link_target(item, link_text)
+ if matched:
+ continue
+
+ if not self._get_option_with_fallback(options, 'islink'):
+ pywikibot.output(
+ '{} field {} value {} is not a wikilink. Skipping.'
+ .format(claim.getID(), field, value))
+ continue
+
+ linked_item = self._template_link_target(item, value)
if not linked_item:
continue
@@ -308,9 +343,7 @@
# A generator might yield pages from multiple sites
self.user_add_claim_unless_exists(
- item, claim, self._get_option_with_fallback(
- options, 'exists'),
- page.site, pywikibot.output)
+ item, claim, exists_arg, page.site, pywikibot.output)
def main(*args):
--
To view, visit https://gerrit.wikimedia.org/r/371586
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: Ied808405a21213e165d51b3fe3d79dfd883e58c0
Gerrit-Change-Number: 371586
Gerrit-PatchSet: 8
Gerrit-Owner: Ejegg <ejegg(a)ejegg.com>
Gerrit-Reviewer: Ejegg <ejegg(a)ejegg.com>
Gerrit-Reviewer: Jgleeson <jgleeson(a)wikimedia.org>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot (75)