jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators.py: allow filtering by quality level
......................................................................
pagegenerators.py: allow filtering by quality level
Allow filtering by quality when page site has ProofreadPage extension
and requested pages are ProofreadPage pages.
Add related tests in page_generators_tests.py.
Add ProofreadPage.quality_level property for pages and related tests.
Bug: T122047
Change-Id: I93ff113f0fa3701b830a8589a955e0f35814b2a2
---
M pywikibot/pagegenerators.py
M pywikibot/proofreadpage.py
M tests/pagegenerators_tests.py
M tests/proofreadpage_tests.py
4 files changed, 123 insertions(+), 3 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index e205ff6..d3a15a9 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -48,7 +48,8 @@
from pywikibot import date, config, i18n, xmlreader
from pywikibot.comms import http
-from pywikibot.exceptions import ArgumentDeprecationWarning
+from pywikibot.exceptions import ArgumentDeprecationWarning, UnknownExtension
+from pywikibot.proofreadpage import ProofreadPage
if sys.version_info[0] > 2:
basestring = (str, )
@@ -278,6 +279,12 @@
Case insensitive regular expressions will be used and
dot matches any character, including a newline.
+-ql Filter pages based on page quality.
+ This is only applicable if contentmodel equals
+ 'proofread-page', otherwise has no effects.
+ Valid values are in range 0-4.
+ Multiple values can be comma-separated.
+
-onlyif A claim the page needs to contain, otherwise the item won't
be returned.
The format is property=value,qualifier=value. Multiple (or
@@ -339,6 +346,7 @@
self._namespaces = []
self.step = None
self.limit = None
+ self.qualityfilter_list = []
self.articlefilter_list = []
self.titlefilter_list = []
self.claimfilter_list = []
@@ -411,8 +419,11 @@
if self.limit:
self.gens[i] = itertools.islice(self.gens[i], self.limit)
if len(self.gens) == 0:
- if self.titlefilter_list or self.articlefilter_list or \
- self.claimfilter_list or self.subpage_max_depth is not None:
+ if (self.titlefilter_list or
+ self.articlefilter_list or
+ self.claimfilter_list or
+ self.subpage_max_depth is not None or
+ self.qualityfilter_list):
pywikibot.warning(
'filter(s) specified but no generators.')
return None
@@ -442,6 +453,10 @@
dupfiltergen = ItemClaimFilterPageGenerator(dupfiltergen,
claim[0], claim[1],
claim[2], claim[3])
+
+ if self.qualityfilter_list:
+ dupfiltergen = QualityFilterPageGenerator(
+ dupfiltergen, self.qualityfilter_list)
if self.titlefilter_list:
dupfiltergen = RegexFilterPageGenerator(
@@ -777,6 +792,19 @@
u'Which pattern do you want to grep?'))
else:
self.articlefilter_list.append(arg[6:])
+ return True
+ elif arg.startswith('-ql:'):
+ if not self.site.has_extension('ProofreadPage'):
+ raise UnknownExtension(
+ 'Ql filtering needs a site with ProofreadPage extension.')
+ value = map(int, arg[4:].split(','))
+ if min(value) < 0 or max(value) > 4: # Invalid input ql.
+ valid_ql = ['{0}: {1}'.format(*i) for
+ i in self.site.proofread_levels.items()]
+ valid_ql = ', '.join(valid_ql)
+ pywikibot.warning('Acceptable values for -ql are:\n %s'
+ % valid_ql)
+ self.qualityfilter_list = value
return True
elif arg.startswith('-onlyif') or arg.startswith('-onlyifnot'):
ifnot = arg.startswith('-onlyifnot')
@@ -1501,6 +1529,27 @@
return (page for page in generator
if cls.__filter_match(reg, page.text, quantifier))
+
+def QualityFilterPageGenerator(generator, quality):
+ """
+ Wrap a generator to filter pages according to quality levels.
+
+ This is possible only for pages with content_model 'proofread-page'.
+ In all the other cases, no filter is applied.
+
+ @param generator: A generator object
+ @param quality: proofread-page quality levels (valid range 0-4)
+ @type quality: list of int
+
+ """
+ for page in generator:
+ if page.namespace() == page.site.proofread_page_ns:
+ page = ProofreadPage(page)
+ if page.quality_level in quality:
+ yield page
+ else:
+ yield page
+
# name the generator methods
RegexFilterPageGenerator = RegexFilter.titlefilter
RegexBodyFilterPageGenerator = RegexFilter.contentfilter
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 5d0e9ba..5996ee7 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -161,6 +161,26 @@
if hasattr(self, "_index"):
del self._index
+ @property
+ def quality_level(self):
+ """Return the quality level of this page when it is retrieved from API.
+
+ This is only applicable if contentmodel equals 'proofread-page'.
+ None is returned otherwise.
+
+ This property is read-only and is applicable only when page is loaded.
+ If quality level is overwritten during page processing, this property
+ is no longer necessarily aligned with the new value.
+
+ In this way, no text parsing is necessary to check quality level when
+ fetching a page.
+ # TODO: align this value with ProofreadPage.ql
+
+ """
+ if self.content_model == 'proofread-page' and hasattr(self, '_quality'):
+ return self._quality
+ return self.ql
+
def decompose(fn):
"""Decorator.
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 3936fca..ed38066 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -19,6 +19,8 @@
import pywikibot
from pywikibot import pagegenerators, date
+from pywikibot.exceptions import UnknownExtension
+
from pywikibot.pagegenerators import (
PagesFromTitlesGenerator,
PreloadingGenerator,
@@ -179,6 +181,33 @@
gen = pagegenerators.RegexBodyFilterPageGenerator(iter(pages), 'talk',
quantifier='none')
self.assertEqual(len(tuple(gen)), 9)
+
+
+class TestQualityFilterPageGenerator(TestCase):
+
+ """Test QualityFilterPageGenerator methods."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ cached = True
+
+ base_title = 'Page:Popular Science Monthly Volume 1.djvu/%s'
+
+ def setUp(self):
+ super(TestQualityFilterPageGenerator, self).setUp()
+ self.site = self.get_site()
+ self.titles = [self.base_title % i for i in range(1, 11)]
+
+ def test_QualityFilterPageGenerator(self):
+ site = self.site
+ gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site)
+ gen = pagegenerators.QualityFilterPageGenerator(gen, [0])
+ self.assertEqual(len(tuple(gen)), 7)
+ gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site)
+ gen = pagegenerators.NamespaceFilterPageGenerator(gen, [4])
+ gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site)
+ self.assertEqual(len(tuple(gen)), 10)
class EdittimeFilterPageGeneratorTestCase(TestCase):
@@ -511,6 +540,10 @@
gf.handleArg('-ns:0')
self.assertEqual(gf.namespaces, set([1, 6]))
+ def test_unsupported_quality_level_filter(self):
+ gf = pagegenerators.GeneratorFactory(site=self.get_site())
+ self.assertRaises(UnknownExtension, gf.handleArg, '-ql:2')
+
class TestItemClaimFilterPageGenerator(WikidataTestCase):
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 6e517cd..39b464b 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -207,6 +207,24 @@
self.assertEqual(json.loads(page_text), json.loads(loaded_text))
+class TestPageQuality(TestCase):
+
+ """Test page quality."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ cached = True
+
+ def test_applicable_quality_level(self):
+ """Test Page.quality_level when applicable."""
+ site = self.get_site()
+ title = 'Page:Popular Science Monthly Volume 49.djvu/1'
+ page = ProofreadPage(site, title)
+ self.assertEqual(page.content_model, 'proofread-page')
+ self.assertEqual(page.quality_level, 0)
+
+
@require_modules('bs4')
class TestProofreadPageIndexProperty(TestCase):
--
To view, visit
https://gerrit.wikimedia.org/r/250221
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I93ff113f0fa3701b830a8589a955e0f35814b2a2
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>