jenkins-bot has submitted this change and it was merged.
Change subject: Subpage filter generator
......................................................................
Subpage filter generator
* Filter that excludes subpages that have a high depth
i.e. how many parents
* Siteinfo converts boolean properties (that we use) into actual
boolean values
* Page.depth returns the subpage depth
* Page._namespace_obj returns the Namespace object, rather than
the int Namespace id
* Update interwiki script to use 'content' property of Namespace
(which is now a boolean) for page_empty_check()
* Unit tests for above
Bug: T121323
Change-Id: Ia53580cf8ad7387c14d6ca3bf4fcf5b35f53edd4
---
M pywikibot/page.py
M pywikibot/pagegenerators.py
M pywikibot/site.py
M scripts/interwiki.py
M tests/page_tests.py
M tests/pagegenerators_tests.py
M tests/site_tests.py
7 files changed, 161 insertions(+), 3 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/page.py b/pywikibot/page.py
index aa90752..c35c6cf 100644
--- a/pywikibot/page.py
+++ b/pywikibot/page.py
@@ -176,6 +176,12 @@
return self._link.namespace
@property
+ def _namespace_obj(self):
+ """Return the namespace object of the page."""
+ # TODO: T104864: Temporary until Page.namespace() is consistent
+ return self.site.namespaces[self.namespace()]
+
+ @property
def content_model(self):
"""Return the content model for this page.
@@ -187,6 +193,20 @@
self.site.loadpageinfo(self)
return self._contentmodel
+ @property
+ def depth(self):
+ """Return the depth/subpage level of the page."""
+ if not hasattr(self, '_depth'):
+ # Check if the namespace allows subpages
+ if self._namespace_obj.subpages:
+ # Count how many '/'s we have in the title
+ self._depth = len(list(re.finditer('/', self.title())))
+ else:
+ # Does not allow subpages, which means depth is always 0
+ self._depth = 0
+
+ return self._depth
+
@deprecated_args(decode=None, savetitle="asUrl")
def title(self, underscore=False, withNamespace=True,
withSection=True, asUrl=False, asLink=False,
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 9198dca..e205ff6 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -190,6 +190,10 @@
of pages, only retrieve n pages at a time from the wiki
server.
+-subpage:n Filters pages to only those that have a depth of at most n,
+ i.e. a depth of 0 filters out all pages that are subpages, and
+ a depth of 1 filters out all pages that are subpages of subpages.
+
-titleregex A regular expression that needs to match the article title
otherwise the page won't be returned.
Multiple -titleregex:regexpr can be provided and the page will
@@ -339,6 +343,7 @@
self.titlefilter_list = []
self.claimfilter_list = []
self.intersect = False
+ self.subpage_max_depth = None
self._site = site
@property
@@ -406,9 +411,10 @@
if self.limit:
self.gens[i] = itertools.islice(self.gens[i], self.limit)
if len(self.gens) == 0:
- if self.titlefilter_list or self.articlefilter_list:
+ if self.titlefilter_list or self.articlefilter_list or \
+ self.claimfilter_list or self.subpage_max_depth is not None:
pywikibot.warning(
- 'grep/titleregex filters specified but no generators.')
+ 'filter(s) specified but no generators.')
return None
elif len(self.gens) == 1:
gensList = self.gens[0]
@@ -424,6 +430,11 @@
else:
gensList = CombinedPageGenerator(self.gens)
dupfiltergen = self._filter_unique(gensList)
+
+ # Add on subpage filter generator
+ if self.subpage_max_depth is not None:
+ dupfiltergen = SubpageFilterGenerator(
+ dupfiltergen, self.subpage_max_depth)
if self.claimfilter_list:
dupfiltergen = PreloadingItemGenerator(dupfiltergen)
@@ -799,6 +810,13 @@
gen = MySQLPageGenerator(query, site=self.site)
elif arg.startswith('-intersect'):
self.intersect = True
+ return True
+ elif arg.startswith('-subpage'):
+ max_depth = arg[len('-subpage:'):]
+ if not max_depth:
+ max_depth = pywikibot.input(
+ 'Maximum subpage depth:')
+ self.subpage_max_depth = int(max_depth)
return True
elif arg.startswith('-logevents:'):
gen = self._parse_log_events(*arg[len('-logevents:'):].split(','))
@@ -1376,6 +1394,32 @@
ItemClaimFilterPageGenerator = ItemClaimFilter.filter
+def SubpageFilterGenerator(generator, max_depth=0, show_filtered=False):
+ """
+ Generator which filters out subpages based on depth.
+
+ It looks at the namespace of each page and checks if that namespace has
+ subpages enabled. If so, pages with forward slashes ('/') are excluded.
+
+ @param generator: A generator object
+ @type generator: any generator or iterator
+ @param max_depth: Max depth of subpages to yield, at least zero
+ @type max_depth: int
+ @param show_filtered: Output a message for each page not yielded
+ @type show_filtered: bool
+ """
+ assert max_depth >= 0, 'Max subpage depth must be at least 0'
+
+ for page in generator:
+ if page.depth <= max_depth:
+ yield page
+ else:
+ if show_filtered:
+ pywikibot.output(
+ 'Page %s is a subpage that is too deep. Skipping.'
+ % page)
+
+
class RegexFilter(object):
"""Regex filter."""
diff --git a/pywikibot/site.py b/pywikibot/site.py
index 0c486d7..0d7623d 100644
--- a/pywikibot/site.py
+++ b/pywikibot/site.py
@@ -1372,6 +1372,29 @@
WARNING_REGEX = re.compile(u"^Unrecognized values? for parameter "
u"'siprop': ([^,]+(?:, [^,]+)*)$")
+ # Until we get formatversion=2, we have to convert empty-string properties
+ # into booleans so they are easier to use.
+ BOOLEAN_PROPS = {
+ 'general': [
+ 'imagewhitelistenabled',
+ 'langconversion',
+ 'titleconversion',
+ 'rtl',
+ 'readonly',
+ 'writeapi',
+ 'variantarticlepath',
+ 'misermode',
+ ],
+ 'namespaces': [ # for each namespace
+ 'subpages',
+ 'content',
+ 'nonincludable',
+ ],
+ 'magicwords': [ # for each magicword
+ 'case-sensitive',
+ ],
+ }
+
def __init__(self, site):
"""Initialise it with an empty cache."""
self._site = site
@@ -1412,12 +1435,32 @@
"""Do some default handling of data. Directly modifies data."""
# Be careful with version tests inside this here as it might need to
# query this method to actually get the version number
+
if prop == 'general':
if 'articlepath' not in data: # Introduced in 1.16.0
# Old version of MediaWiki, extract from base
path = urlparse(data['base'])[2].rsplit('/', 1)[0] + '/$1'
data['articlepath'] = path
+ # Convert boolean props from empty strings to actual boolean values
+ if prop in Siteinfo.BOOLEAN_PROPS.keys():
+ # siprop=namespaces and magicwords has properties per item in result
+ if prop == 'namespaces' or prop == 'magicwords':
+ for index, value in enumerate(data):
+ # namespaces uses a dict, while magicwords uses a list
+ key = index if type(data) is list else value
+ for p in Siteinfo.BOOLEAN_PROPS[prop]:
+ if p in data[key]:
+ data[key][p] = True
+ else:
+ data[key][p] = False
+ else:
+ for p in Siteinfo.BOOLEAN_PROPS[prop]:
+ if p in data:
+ data[p] = True
+ else:
+ data[p] = False
+
def _get_siteinfo(self, prop, expiry):
"""
Retrieve a siteinfo property.
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index dcd2095..5bf3dee 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -2466,7 +2466,7 @@
@rtype: bool
"""
# Check if the page is in content namespace
- if page.namespace() == 0:
+ if page._namespace_obj.content:
# Check if the page contains at least 50 characters
return len(page.text) < 50
else:
diff --git a/tests/page_tests.py b/tests/page_tests.py
index 73bb2d3..1cc6248 100644
--- a/tests/page_tests.py
+++ b/tests/page_tests.py
@@ -516,6 +516,21 @@
page_copy.isDisambig()
self.assertTrue(page_copy.isRedirectPage())
+ def test_depth(self):
+ """Test page depth calculation."""
+ site = self.get_site()
+ page_d0 = pywikibot.Page(site, '/home/test/')
+ if site.namespaces[0].subpages:
+ self.assertEqual(page_d0.depth, 3)
+ else:
+ self.assertEqual(page_d0.depth, 0)
+
+ page_user_d0 = pywikibot.Page(site, 'User:Sn1per')
+ self.assertEqual(page_user_d0.depth, 0)
+
+ page_d3 = pywikibot.Page(site, 'User:Sn1per/ProtectTest1/test/test')
+ self.assertEqual(page_d3.depth, 3)
+
class TestPageDeprecation(DefaultSiteTestCase, DeprecationTestCase):
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 5c5e207..3936fca 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -22,6 +22,7 @@
from pywikibot.pagegenerators import (
PagesFromTitlesGenerator,
PreloadingGenerator,
+ CategorizedPageGenerator
)
from tests import join_data_path
@@ -233,6 +234,33 @@
self.assertEqual(len(list(gen)), 0)
+class SubpageFilterGeneratorTestCase(TestCase):
+
+ """Test SubpageFilterGenerator."""
+
+ family = 'test'
+ code = 'test'
+
+ def test_subpage_filter(self):
+ site = self.get_site()
+ test_cat = pywikibot.Category(site, 'Subpage testing')
+
+ gen = CategorizedPageGenerator(test_cat)
+ gen = pagegenerators.SubpageFilterGenerator(gen, 0)
+ expect_0 = ('/home/test',)
+ self.assertPagelistTitles(gen, titles=expect_0, site=site)
+
+ gen = CategorizedPageGenerator(test_cat)
+ gen = pagegenerators.SubpageFilterGenerator(gen, 3)
+ expect_3 = (
+ '/home/test',
+ 'User:Sn1per/ProtectTest1/test',
+ 'User:Sn1per/ProtectTest1/test/test',
+ 'User:Sn1per/sandbox',
+ )
+ self.assertPagelistTitles(gen, titles=expect_3, site=site)
+
+
class TestRepeatingGenerator(RecentChangesTestCase):
"""Test RepeatingGenerator."""
diff --git a/tests/site_tests.py b/tests/site_tests.py
index ab547f7..d2cfcd1 100644
--- a/tests/site_tests.py
+++ b/tests/site_tests.py
@@ -2015,6 +2015,14 @@
self.assertRegex(mysite.siteinfo['timezone'],
"([A-Z]{3,4}|[A-Z][a-z]+/[A-Z][a-z]+)")
self.assertIn(mysite.siteinfo['case'], ["first-letter", "case-sensitive"])
+ def test_siteinfo_boolean(self):
+ """Test conversion of boolean properties from empty strings to True/False."""
+ mysite = self.get_site()
+ self.assertIsInstance(mysite.siteinfo['titleconversion'], bool)
+
+ self.assertIsInstance(mysite.namespaces[0].subpages, bool)
+ self.assertIsInstance(mysite.namespaces[0].content, bool)
+
def test_siteinfo_v1_16(self):
"""Test v.16+ siteinfo values."""
if MediaWikiVersion(self.site.version()) < MediaWikiVersion('1.16'):
--
To view, visit
https://gerrit.wikimedia.org/r/258422
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia53580cf8ad7387c14d6ca3bf4fcf5b35f53edd4
Gerrit-PatchSet: 20
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Sn1per <geofbot(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>