jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators.py: Allow filtering by category
......................................................................
pagegenerators.py: Allow filtering by category
Allow generated pages to be filtered by category.
A category may have a large number of pages, while using -intersect filtering
pages from other generators is more efficient than getting all pages in
the category.
Add related tests in page_generators_tests.py.
Bug: T122392
Change-Id: Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870
---
M pywikibot/pagegenerators.py
M tests/pagegenerators_tests.py
2 files changed, 61 insertions(+), 3 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
Xqt: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d3a15a9..f198766 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -62,6 +62,9 @@
parameterHelp = u"""\
+-catfilter Filter the page generator to only yield pages in the
+ specified category. See -cat for argument format.
+
-cat Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname" or
as "-cat:categoryname|fromtitle" (using # instead of |
@@ -350,6 +353,7 @@
self.articlefilter_list = []
self.titlefilter_list = []
self.claimfilter_list = []
+ self.catfilter_list = []
self.intersect = False
self.subpage_max_depth = None
self._site = site
@@ -422,6 +426,7 @@
if (self.titlefilter_list or
self.articlefilter_list or
self.claimfilter_list or
+ self.catfilter_list or
self.subpage_max_depth is not None or
self.qualityfilter_list):
pywikibot.warning(
@@ -466,11 +471,14 @@
dupfiltergen = RegexBodyFilterPageGenerator(
PreloadingGenerator(dupfiltergen), self.articlefilter_list)
+ if self.catfilter_list:
+ dupfiltergen = CategoryFilterPageGenerator(
+ dupfiltergen, self.catfilter_list, self.site)
+
return dupfiltergen
- def getCategoryGen(self, arg, recurse=False, content=False,
- gen_func=None):
- """Return generator based on Category defined by arg and gen_func."""
+ def getCategory(self, arg):
+ """Return Category and start as defined by arg."""
categoryname = arg.partition(':')[2]
if not categoryname:
categoryname = i18n.input(
@@ -491,6 +499,12 @@
categoryname)
cat = pywikibot.Category(pywikibot.Link(categoryname,
defaultNamespace=14))
+ return cat, startfrom
+
+ def getCategoryGen(self, arg, recurse=False, content=False,
+ gen_func=None):
+ """Return generator based on Category defined by arg and gen_func."""
+ cat, startfrom = self.getCategory(arg)
return gen_func(cat,
start=startfrom,
@@ -651,6 +665,10 @@
elif arg.startswith('-catr'):
gen = self.getCategoryGen(arg, recurse=True,
gen_func=CategorizedPageGenerator)
+ elif arg.startswith('-catfilter'):
+ cat, _ = self.getCategory(arg)
+ self.catfilter_list.append(cat)
+ return True
elif arg.startswith('-category'):
gen = self.getCategoryGen(arg, gen_func=CategorizedPageGenerator)
elif arg.startswith('-cat'):
@@ -1550,6 +1568,23 @@
else:
yield page
+
+def CategoryFilterPageGenerator(generator, category_list, site=None):
+ """
+ Wrap a generator to filter pages by categories specified.
+
+ @param generator: A generator object
+ @param category_list: categories used to filter generated pages
+ @type category_list: list of category objects
+
+ """
+ if site is None:
+ site = pywikibot.Site()
+ for page in generator:
+ if all(x in site.pagecategories(page) for x in category_list):
+ yield page
+
+
# name the generator methods
RegexFilterPageGenerator = RegexFilter.titlefilter
RegexBodyFilterPageGenerator = RegexFilter.contentfilter
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index ed38066..76d1c28 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -183,6 +183,29 @@
self.assertEqual(len(tuple(gen)), 9)
+class TestCategoryFilterPageGenerator(TestCase):
+
+ """Test CategoryFilterPageGenerator method."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ base_title = 'Page:06-24-1920 -The Story of the Jones County Calf Case.pdf/%s'
+ category_list = ['Category:Validated']
+
+ def setUp(self):
+ super(TestCategoryFilterPageGenerator, self).setUp()
+ self.site = self.get_site()
+ self.titles = [self.base_title % i for i in range(1, 11)]
+ self.catfilter_list = [pywikibot.Category(self.site, cat) for cat in self.category_list]
+
+ def test_CategoryFilterPageGenerator(self):
+ site = self.site
+ gen = pagegenerators.PagesFromTitlesGenerator(self.titles, site)
+ gen = pagegenerators.CategoryFilterPageGenerator(gen, self.catfilter_list, site)
+ self.assertEqual(len(tuple(gen)), 7)
+
+
class TestQualityFilterPageGenerator(TestCase):
"""Test QualityFilterPageGenerator methods."""
--
To view, visit
https://gerrit.wikimedia.org/r/261877
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ib760712be4b5acc84b09b80e7eaf6b9c11e4c870
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Vadiraja.k <vadi.fedx(a)gmail.com>
Gerrit-Reviewer: Billinghurst <billinghurstwiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Siebrand <siebrand(a)kitano.nl>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>