jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators.py: allow filtering by quality level
pagegenerators.py: allow filtering by quality level
Allow filtering by quality level when the page's site has the ProofreadPage extension and the requested pages are ProofreadPage pages.
Add related tests in pagegenerators_tests.py.
Add ProofreadPage.quality_level property for pages and related tests.
Bug: T122047
Change-Id: I93ff113f0fa3701b830a8589a955e0f35814b2a2
---
M pywikibot/pagegenerators.py
M pywikibot/proofreadpage.py
M tests/pagegenerators_tests.py
M tests/proofreadpage_tests.py
4 files changed, 123 insertions(+), 3 deletions(-)
Approvals:
  John Vandenberg: Looks good to me, approved
  jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index e205ff6..d3a15a9 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -48,7 +48,8 @@
from pywikibot import date, config, i18n, xmlreader from pywikibot.comms import http -from pywikibot.exceptions import ArgumentDeprecationWarning +from pywikibot.exceptions import ArgumentDeprecationWarning, UnknownExtension +from pywikibot.proofreadpage import ProofreadPage
if sys.version_info[0] > 2: basestring = (str, ) @@ -278,6 +279,12 @@ Case insensitive regular expressions will be used and dot matches any character, including a newline.
+-ql Filter pages based on page quality. + This is only applicable if contentmodel equals + 'proofread-page', otherwise has no effects. + Valid values are in range 0-4. + Multiple values can be comma-separated. + -onlyif A claim the page needs to contain, otherwise the item won't be returned. The format is property=value,qualifier=value. Multiple (or @@ -339,6 +346,7 @@ self._namespaces = [] self.step = None self.limit = None + self.qualityfilter_list = [] self.articlefilter_list = [] self.titlefilter_list = [] self.claimfilter_list = [] @@ -411,8 +419,11 @@ if self.limit: self.gens[i] = itertools.islice(self.gens[i], self.limit) if len(self.gens) == 0: - if self.titlefilter_list or self.articlefilter_list or \ - self.claimfilter_list or self.subpage_max_depth is not None: + if (self.titlefilter_list or + self.articlefilter_list or + self.claimfilter_list or + self.subpage_max_depth is not None or + self.qualityfilter_list): pywikibot.warning( 'filter(s) specified but no generators.') return None @@ -442,6 +453,10 @@ dupfiltergen = ItemClaimFilterPageGenerator(dupfiltergen, claim[0], claim[1], claim[2], claim[3]) + + if self.qualityfilter_list: + dupfiltergen = QualityFilterPageGenerator( + dupfiltergen, self.qualityfilter_list)
if self.titlefilter_list: dupfiltergen = RegexFilterPageGenerator( @@ -777,6 +792,19 @@ u'Which pattern do you want to grep?')) else: self.articlefilter_list.append(arg[6:]) + return True + elif arg.startswith('-ql:'): + if not self.site.has_extension('ProofreadPage'): + raise UnknownExtension( + 'Ql filtering needs a site with ProofreadPage extension.') + value = map(int, arg[4:].split(',')) + if min(value) < 0 or max(value) > 4: # Invalid input ql. + valid_ql = ['{0}: {1}'.format(*i) for + i in self.site.proofread_levels.items()] + valid_ql = ', '.join(valid_ql) + pywikibot.warning('Acceptable values for -ql are:\n %s' + % valid_ql) + self.qualityfilter_list = value return True elif arg.startswith('-onlyif') or arg.startswith('-onlyifnot'): ifnot = arg.startswith('-onlyifnot') @@ -1501,6 +1529,27 @@ return (page for page in generator if cls.__filter_match(reg, page.text, quantifier))
+ +def QualityFilterPageGenerator(generator, quality): + """ + Wrap a generator to filter pages according to quality levels. + + This is possible only for pages with content_model 'proofread-page'. + In all the other cases, no filter is applied. + + @param generator: A generator object + @param quality: proofread-page quality levels (valid range 0-4) + @type quality: list of int + + """ + for page in generator: + if page.namespace() == page.site.proofread_page_ns: + page = ProofreadPage(page) + if page.quality_level in quality: + yield page + else: + yield page + # name the generator methods RegexFilterPageGenerator = RegexFilter.titlefilter RegexBodyFilterPageGenerator = RegexFilter.contentfilter diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index 5d0e9ba..5996ee7 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -161,6 +161,26 @@ if hasattr(self, "_index"): del self._index
+ @property + def quality_level(self): + """Return the quality level of this page when it is retrieved from API. + + This is only applicable if contentmodel equals 'proofread-page'. + None is returned otherwise. + + This property is read-only and is applicable only when page is loaded. + If quality level is overwritten during page processing, this property + is no longer necessarily aligned with the new value. + + In this way, no text parsing is necessary to check quality level when + fetching a page. + # TODO: align this value with ProofreadPage.ql + + """ + if self.content_model == 'proofread-page' and hasattr(self, '_quality'): + return self._quality + return self.ql + def decompose(fn): """Decorator.
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index 3936fca..ed38066 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -19,6 +19,8 @@ import pywikibot from pywikibot import pagegenerators, date
+from pywikibot.exceptions import UnknownExtension + from pywikibot.pagegenerators import ( PagesFromTitlesGenerator, PreloadingGenerator, @@ -179,6 +181,33 @@ gen = pagegenerators.RegexBodyFilterPageGenerator(iter(pages), 'talk', quantifier='none') self.assertEqual(len(tuple(gen)), 9) + + +class TestQualityFilterPageGenerator(TestCase): + + """Test QualityFilterPageGenerator methods.""" + + family = 'wikisource' + code = 'en' + + cached = True + + base_title = 'Page:Popular Science Monthly Volume 1.djvu/%s' + + def setUp(self): + super(TestQualityFilterPageGenerator, self).setUp() + self.site = self.get_site() + self.titles = [self.base_title % i for i in range(1, 11)] + + def test_QualityFilterPageGenerator(self): + site = self.site + gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site) + gen = pagegenerators.QualityFilterPageGenerator(gen, [0]) + self.assertEqual(len(tuple(gen)), 7) + gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site) + gen = pagegenerators.NamespaceFilterPageGenerator(gen, [4]) + gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site) + self.assertEqual(len(tuple(gen)), 10)
class EdittimeFilterPageGeneratorTestCase(TestCase): @@ -511,6 +540,10 @@ gf.handleArg('-ns:0') self.assertEqual(gf.namespaces, set([1, 6]))
+ def test_unsupported_quality_level_filter(self): + gf = pagegenerators.GeneratorFactory(site=self.get_site()) + self.assertRaises(UnknownExtension, gf.handleArg, '-ql:2') +
class TestItemClaimFilterPageGenerator(WikidataTestCase):
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index 6e517cd..39b464b 100644 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -207,6 +207,24 @@ self.assertEqual(json.loads(page_text), json.loads(loaded_text))
+class TestPageQuality(TestCase): + + """Test page quality.""" + + family = 'wikisource' + code = 'en' + + cached = True + + def test_applicable_quality_level(self): + """Test Page.quality_level when applicable.""" + site = self.get_site() + title = 'Page:Popular Science Monthly Volume 49.djvu/1' + page = ProofreadPage(site, title) + self.assertEqual(page.content_model, 'proofread-page') + self.assertEqual(page.quality_level, 0) + + @require_modules('bs4') class TestProofreadPageIndexProperty(TestCase):