jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators.py: Allow filtering by category ......................................................................
pagegenerators.py: Allow filtering by category
Allow generated pages to be filtered by category. A category may have a large number of pages, so filtering the pages from other generators is more efficient than using -intersect, which requires getting all pages in the category. Add related tests in pagegenerators_tests.py.
Bug: T122392 Change-Id: Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870 --- M pywikibot/pagegenerators.py M tests/pagegenerators_tests.py 2 files changed, 61 insertions(+), 3 deletions(-)
Approvals: Mpaa: Looks good to me, approved Xqt: Looks good to me, but someone else must approve jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index d3a15a9..f198766 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -62,6 +62,9 @@
parameterHelp = u"""\
+-catfilter Filter the page generator to only yield pages in the + specified category. See -cat for argument format. + -cat Work on all pages which are in a specific category. Argument can also be given as "-cat:categoryname" or as "-cat:categoryname|fromtitle" (using # instead of | @@ -350,6 +353,7 @@ self.articlefilter_list = [] self.titlefilter_list = [] self.claimfilter_list = [] + self.catfilter_list = [] self.intersect = False self.subpage_max_depth = None self._site = site @@ -422,6 +426,7 @@ if (self.titlefilter_list or self.articlefilter_list or self.claimfilter_list or + self.catfilter_list or self.subpage_max_depth is not None or self.qualityfilter_list): pywikibot.warning( @@ -466,11 +471,14 @@ dupfiltergen = RegexBodyFilterPageGenerator( PreloadingGenerator(dupfiltergen), self.articlefilter_list)
+ if self.catfilter_list: + dupfiltergen = CategoryFilterPageGenerator( + dupfiltergen, self.catfilter_list, self.site) + return dupfiltergen
- def getCategoryGen(self, arg, recurse=False, content=False, - gen_func=None): - """Return generator based on Category defined by arg and gen_func.""" + def getCategory(self, arg): + """Return Category and start as defined by arg.""" categoryname = arg.partition(':')[2] if not categoryname: categoryname = i18n.input( @@ -491,6 +499,12 @@ categoryname) cat = pywikibot.Category(pywikibot.Link(categoryname, defaultNamespace=14)) + return cat, startfrom + + def getCategoryGen(self, arg, recurse=False, content=False, + gen_func=None): + """Return generator based on Category defined by arg and gen_func.""" + cat, startfrom = self.getCategory(arg)
return gen_func(cat, start=startfrom, @@ -651,6 +665,10 @@ elif arg.startswith('-catr'): gen = self.getCategoryGen(arg, recurse=True, gen_func=CategorizedPageGenerator) + elif arg.startswith('-catfilter'): + cat, _ = self.getCategory(arg) + self.catfilter_list.append(cat) + return True elif arg.startswith('-category'): gen = self.getCategoryGen(arg, gen_func=CategorizedPageGenerator) elif arg.startswith('-cat'): @@ -1550,6 +1568,23 @@ else: yield page
+ +def CategoryFilterPageGenerator(generator, category_list, site=None): + """ + Wrap a generator to filter pages by categories specified. + + @param generator: A generator object + @param category_list: categories used to filter generated pages + @type category_list: list of category objects + + """ + if site is None: + site = pywikibot.Site() + for page in generator: + if all(x in site.pagecategories(page) for x in category_list): + yield page + + # name the generator methods RegexFilterPageGenerator = RegexFilter.titlefilter RegexBodyFilterPageGenerator = RegexFilter.contentfilter diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index ed38066..76d1c28 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -183,6 +183,29 @@ self.assertEqual(len(tuple(gen)), 9)
+class TestCategoryFilterPageGenerator(TestCase): + + """Test CategoryFilterPageGenerator method.""" + + family = 'wikisource' + code = 'en' + + base_title = 'Page:06-24-1920 -The Story of the Jones County Calf Case.pdf/%s' + category_list = ['Category:Validated'] + + def setUp(self): + super(TestCategoryFilterPageGenerator, self).setUp() + self.site = self.get_site() + self.titles = [self.base_title % i for i in range(1, 11)] + self.catfilter_list = [pywikibot.Category(self.site, cat) for cat in self.category_list] + + def test_CategoryFilterPageGenerator(self): + site = self.site + gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site) + gen = pagegenerators.CategoryFilterPageGenerator(gen, self.catfilter_list, site) + self.assertEqual(len(tuple(gen)), 7) + + class TestQualityFilterPageGenerator(TestCase):
"""Test QualityFilterPageGenerator methods."""
pywikibot-commits@lists.wikimedia.org