jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators.py: titleregex as filter of other generators ......................................................................
pagegenerators.py: titleregex as filter of other generators
Implement -titleregex as a filter, applying a regex to titles of pages returned by the other page generators.
-start now defaults to ! if no value is specified, instead of prompting for a start page as it did before this patch.
The behaviour that -titleregex had before this patch can now be obtained using: -start -titleregex:my_regex
A warning is emitted if grep/titleregex filters are specified but no generators are requested.
Bug:T114015 Change-Id: I249c4ee61b89ea4042b08fff0a3dc4557170e6f4 --- M pywikibot/pagegenerators.py M tests/pagegenerators_tests.py 2 files changed, 40 insertions(+), 30 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved XZise: Looks good to me, but someone else must approve jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index eb6f583..ecffb9a 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -134,8 +134,9 @@ before -newpages. If used with -recentchanges, efficiency is improved if -namepace/ns is provided before -recentchanges. - If used with -titleregex, -namepace/ns must be provided - before -titleregex and shall contain only one value. + + If used with -start, -namepace/ns shall contain only one + value.
-interwiki Work on the given page and all equivalent pages in other languages. This can, for example, be used to fight @@ -181,13 +182,21 @@ "-start:Template:!" will make the bot work on all pages in the template namespace.
+ default value is start:! + -prefixindex Work on pages commencing with a common prefix.
-step:n When used with any other argument that specifies a set of pages, only retrieve n pages at a time from the wiki server.
--titleregex Work on titles that match the given regular expression. +-titleregex A regular expression that needs to match the article title + otherwise the page won't be returned. + Multiple -titleregex:regexpr can be provided and the page will + be returned if title is matched by any of the regexpr + provided. + Case insensitive regular expressions will be used and + dot matches any character.
-transcludes Work on all pages that use a certain template. Argument can also be given as "-transcludes:Title". @@ -327,6 +336,7 @@ self.step = None self.limit = None self.articlefilter_list = [] + self.titlefilter_list = [] self.claimfilter_list = [] self.intersect = False self._site = site @@ -396,6 +406,9 @@ if self.limit: self.gens[i] = itertools.islice(self.gens[i], self.limit) if len(self.gens) == 0: + if self.titlefilter_list or self.articlefilter_list: + pywikibot.warning( + 'grep/titleregex filters specified but no generators.') return None elif len(self.gens) == 1: gensList = self.gens[0] @@ -419,11 +432,15 @@ claim[0], claim[1], claim[2], claim[3])
+ if self.titlefilter_list: + dupfiltergen = RegexFilterPageGenerator( + dupfiltergen, self.titlefilter_list) + if self.articlefilter_list: - return RegexBodyFilterPageGenerator( + dupfiltergen = RegexBodyFilterPageGenerator( PreloadingGenerator(dupfiltergen), self.articlefilter_list) - else: - return dupfiltergen + + return dupfiltergen
def getCategoryGen(self, arg, recurse=False, content=False, gen_func=None): @@ -672,8 +689,7 @@ elif arg.startswith('-start'): firstPageTitle = arg[7:] if not firstPageTitle: - firstPageTitle = pywikibot.input( - u'At which page do you want to start?') + firstPageTitle = '!' firstpagelink = pywikibot.Link(firstPageTitle, self.site) namespace = firstpagelink.namespace @@ -739,18 +755,11 @@ gen = GoogleSearchPageGenerator(arg[8:]) elif arg.startswith('-titleregex'): if len(arg) == 11: - regex = pywikibot.input(u'What page names are you looking for?') + self.titlefilter_list.append(pywikibot.input( + 'What page names are you looking for?')) else: - regex = arg[12:] - # partial workaround for bug T85389 - # to use -namespace/ns with -newpages, -ns must be given - # before -titleregex, otherwise default namespace is 0. - # allpages only accepts a single namespace, and will raise a - # TypeError if self.namespaces contains more than one namespace. - namespaces = self.namespaces or 0 - gen = RegexFilterPageGenerator( - self.site.allpages(namespace=namespaces), - regex) + self.titlefilter_list.append(arg[12:]) + return True elif arg.startswith('-grep'): if len(arg) == 5: self.articlefilter_list.append(pywikibot.input( diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index 1e8cd20..3f9cb4f 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -592,6 +592,7 @@ def test_regexfilter_default(self): gf = pagegenerators.GeneratorFactory() # Matches titles with the same two or more continous characters + self.assertTrue(gf.handleArg('-start')) self.assertTrue(gf.handleArg('-titleregex:(.)\1+')) gf.handleArg('-limit:10') gen = gf.getCombinedGenerator() @@ -603,39 +604,39 @@ self.assertRegex(page.title().lower(), '(.)\1+')
def test_regexfilter_ns_after(self): - """Bug: T85389: -ns after -titleregex is ignored with a warning.""" gf = pagegenerators.GeneratorFactory() + self.assertTrue(gf.handleArg('-start')) self.assertTrue(gf.handleArg('-titleregex:.*')) gf.handleArg('-ns:1') gf.handleArg('-limit:10') gen = gf.getCombinedGenerator() pages = list(gen) - self.assertGreater(len(pages), 0) self.assertLessEqual(len(pages), 10) - self.assertPagesInNamespaces(pages, 0) + self.assertPagesInNamespaces(pages, 1)
- def test_regexfilter_ns_first(self): + def test_regexfilter_ns_before(self): gf = pagegenerators.GeneratorFactory() - # Workaround for Bug: T85389 - # Give -ns before -titleregex (as for -newpages) + self.assertTrue(gf.handleArg('-start')) gf.handleArg('-ns:1') self.assertTrue(gf.handleArg('-titleregex:.*')) gf.handleArg('-limit:10') gen = gf.getCombinedGenerator() self.assertIsNotNone(gen) pages = list(gen) - self.assertGreater(len(pages), 0) self.assertLessEqual(len(pages), 10) self.assertPagesInNamespaces(pages, 1)
- def test_regexfilter_two_ns_first(self): + def test_allpages_with_two_ns(self): + """Test that allpages fails with two ns as parameter.""" gf = pagegenerators.GeneratorFactory() + self.assertTrue(gf.handleArg('-start')) gf.handleArg('-ns:3,1') - self.assertRaisesRegex( + # allpages only accepts a single namespace, and will raise a + # TypeError if self.namespaces contains more than one namespace. + self.assertRaises( TypeError, 'allpages module does not support multiple namespaces', - gf.handleArg, - '-titleregex:.*') + gf.getCombinedGenerator)
def test_prefixing_default(self): gf = pagegenerators.GeneratorFactory()
pywikibot-commits@lists.wikimedia.org