jenkins-bot has submitted this change and it was merged.
Change subject: proofreadpage.py: also manage non-existing links in Index page
......................................................................
proofreadpage.py: also manage non-existing links in Index page
Add the possibility to handle non-existing pages when querying
labels/numbers.
The namespace is no longer forced in ProofreadPage/IndexPage.__init__(),
so the ns check is now effective.
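
For illustration, a minimal sketch (not part of the diff below) of how the
new _parse_redlink() helper extracts the title from the href that MediaWiki
renders for a red link; the example href is made up:

    import re

    # Redlink hrefs have the form:
    #   /w/index.php?title=<title>&action=edit&redlink=1
    p_href = re.compile('/w/index\.php\?title=(.+?)&action=edit&redlink=1')

    href = '/w/index.php?title=Page:xxx.djvu/12&action=edit&redlink=1'
    m = p_href.search(href)
    title = m.group(1) if m else None  # -> 'Page:xxx.djvu/12'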
Change-Id: I8dea8731de76a56572fb8dd5865848b7a2d910d0
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 99 insertions(+), 28 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 29458b3..d27b08b 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -91,11 +91,10 @@
site = source.site
else:
site = source
- ns = site.proofread_page_ns
- super(ProofreadPage, self).__init__(source, title, ns=ns)
+ super(ProofreadPage, self).__init__(source, title)
if self.namespace() != site.proofread_page_ns:
raise ValueError('Page %s must belong to %s namespace'
- % (self.title(), ns))
+ % (self.title(), site.proofread_page_ns))
# Ensure that constants are in line with Extension values.
if list(self.site.proofread_levels.keys()) != self.PROOFREAD_LEVELS:
raise ValueError('QLs do not match site values: %s != %s'
@@ -401,8 +400,6 @@
"""Index Page page used in Mediawiki ProofreadPage
extension."""
- # TODO: handle not existing pages when quering labels/nubers?
- # Currently APIError is thrown.
def __init__(self, source, title=''):
"""Instantiate a IndexPage object.
@@ -433,14 +430,13 @@
site = source.site
else:
site = source
- ns = site.proofread_index_ns
- super(IndexPage, self).__init__(source, title, ns=site.proofread_index_ns)
+ super(IndexPage, self).__init__(source, title)
if self.namespace() != site.proofread_index_ns:
raise ValueError('Page %s must belong to %s namespace'
- % (self.title(), ns))
+ % (self.title(), site.proofread_index_ns))
self._all_page_links = set(
- self.site.pagelinks(self, namespaces=self.site.proofread_page_ns))
+ self.site.pagelinks(self, namespaces=site.proofread_page_ns))
self._cached = False
@@ -451,6 +447,15 @@
self._get_page_mappings()
return fn(self, *args, **kwargs)
return wrapper
+
+ def _parse_redlink(self, href):
+ """Parse page title when link in Index is a
redlink."""
+ p_href =
re.compile('/w/index\.php\?title=(.+?)&action=edit&redlink=1')
+ title = p_href.search(href)
+ if title:
+ return title.group(1)
+ else:
+ return None
def _get_page_mappings(self):
"""Associate label and number for each page linked to the
index."""
@@ -466,13 +471,20 @@
self._parsed_text = self._get_parsed_page()
self._soup = BeautifulSoup(self._parsed_text, 'html.parser')
- attrs = {'class': re.compile('prp-pagequality')}
+ attrs = {'class': re.compile('prp-pagequality|new')}
- # Search for attribute "prp-pagequality" in tags like:
- # <a class="quality1 prp-pagequality-1"
- # href="/wiki/Page:xxx.djvu/n"
+ # Search for attribute "prp-pagequality" in tags:
+ # Existing pages:
+ # <a href="/wiki/Page:xxx.djvu/n"
# title="Page:xxx.djvu/n">m
+ # class="quality1 prp-pagequality-1"
+ # </a> or
+ # Non-existing pages:
+ # <a href="/w/index.php?title=xxx&action=edit&redlink=1"
+ # class="new"
+ # title="Page:xxx.djvu/n (page does not exist)">m
# </a>
+
# Try to purge or raise ValueError.
if not self._soup.find_all('a', attrs=attrs):
self.purge()
@@ -481,17 +493,30 @@
self._soup = BeautifulSoup(self._parsed_text, 'html.parser')
if not self._soup.find_all('a', attrs=attrs):
raise ValueError(
- 'Missing class="qualityN prp-pagequality-N" in: %s.'
+ 'Missing class="qualityN prp-pagequality-N" or '
+ 'class="new" in: %s.'
% self)
page_cnt = 0
for a_tag in self._soup.find_all('a', attrs=attrs):
- page_cnt += 1
label = a_tag.text.lstrip('0') # Label is not converted to int.
- title = a_tag.get('title')
+ class_ = a_tag.get('class')
+ href = a_tag.get('href')
- page = ProofreadPage(self.site, title)
- page.index = self # set index property for page
+ if 'new' in class_:
+ title = self._parse_redlink(href) # non-existing page
+ if title is None: # title not conforming to required format
+ continue
+ else:
+ title = a_tag.get('title') # existing page
+ try:
+ page = ProofreadPage(self.site, title)
+ page.index = self # set index property for page
+ page_cnt += 1
+ except ValueError:
+ # title is not in site.proofread_page_ns; do not consider it
+ continue
+
if page not in self._all_page_links:
raise pywikibot.Error('Page %s not recognised.' % page)
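
A minimal usage sketch of the resulting behaviour (assuming an English
Wikisource site and the test pages referenced in the tests below; the
snippet itself is illustrative, not part of the change):

    import pywikibot
    from pywikibot.proofreadpage import IndexPage, ProofreadPage

    site = pywikibot.Site('en', 'wikisource')
    index = IndexPage(site, 'Index:Pywikibot test page 1')

    # Before this change a red-linked page in the Index raised APIError
    # when querying labels/numbers; now it is mapped like any other page.
    red = ProofreadPage(site, 'Page:Pywikibot test page 2/2')
    n = index.get_number(red)
    assert index.get_page(n) == red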
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index e28d9a7..67927fd 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -108,7 +108,7 @@
def test_valid_site_source(self):
"""Test ProofreadPage from valid Site as
source."""
- page = ProofreadPage(self.site, 'title')
+ page = ProofreadPage(self.site, 'Page:dummy test page')
self.assertEqual(page.namespace(), self.site.proofread_page_ns)
def test_invalid_existing_page_source(self):
@@ -166,7 +166,7 @@
def test_preload_from_not_existing_page(self):
"""Test ProofreadPage page decomposing/composing
text."""
- page = ProofreadPage(self.site, 'dummy test page')
+ page = ProofreadPage(self.site, 'Page:dummy test page')
self.assertEqual(page.text,
'<noinclude><pagequality level="1" user="%s" />'
'<div class="pagetext">\n\n\n</noinclude>'
@@ -175,7 +175,7 @@
def test_preload_from_empty_text(self):
"""Test ProofreadPage page decomposing/composing
text."""
- page = ProofreadPage(self.site, 'dummy test page')
+ page = ProofreadPage(self.site, 'Page:dummy test page')
page.text = ''
self.assertEqual(page.text,
'<noinclude><pagequality level="1" user="%s" />'
@@ -300,7 +300,7 @@
def test_valid_site_as_source(self):
"""Test IndexPage from valid Site as source."""
- page = IndexPage(self.site, 'title')
+ page = IndexPage(self.site, 'Index:dummy test page')
self.assertEqual(page.namespace(), self.site.proofread_index_ns)
def test_invalid_existing_page_as_source(self):
@@ -377,9 +377,9 @@
'enws': {
'family': 'wikisource',
'code': 'en',
- 'index': 'Popular Science Monthly Volume 1.djvu',
+ 'index': 'Index:Popular Science Monthly Volume 1.djvu',
'num_pages': 804,
- 'page': 'Popular Science Monthly Volume 1.djvu/{0}',
+ 'page': 'Page:Popular Science Monthly Volume 1.djvu/{0}',
'get_label': [11, 11, '1'],
'get_number': [[1, set([11])],
['Cvr', set([1, 9, 10, 804])],
@@ -389,9 +389,9 @@
'dews': { # dews does not use page convention name/number.
'family': 'wikisource',
'code': 'de',
- 'index': 'Musen-Almanach für das Jahr 1799',
+ 'index': 'Index:Musen-Almanach für das Jahr 1799',
'num_pages': 272,
- 'page': 'Schiller_Musenalmanach_1799_{0:3d}.jpg',
+ 'page': 'Seite:Schiller_Musenalmanach_1799_{0:3d}.jpg',
'get_label': [120, 120, '120'], # page no, title no, label
'get_number': [[120, set([120])],
],
@@ -400,9 +400,9 @@
'frws': {
'family': 'wikisource',
'code': 'fr',
- 'index': 'Segard - Hymnes profanes, 1894.djvu',
+ 'index': 'Index:Segard - Hymnes profanes, 1894.djvu',
'num_pages': 107,
- 'page': 'Segard - Hymnes profanes, 1894.djvu/{0}',
+ 'page': 'Page:Segard - Hymnes profanes, 1894.djvu/{0}',
'get_label': [11, 11, '8'],
'get_number': [[8, set([11])],
['-', set(range(1, 4)) | set(range(101, 108))],
@@ -532,6 +532,52 @@
self.assertEqual(list(gen), [])
+class TestIndexPageMappingsRedlinks(IndexPageTestCase):
+
+ """Test IndexPage mappings with redlinks."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ cached = True
+
+ with_redlink = {
+ 'title': {'blue': 'Page:Pywikibot test page 1/1',
+ 'red': 'Page:Pywikibot test page 2/2',
+ },
+ 'index': 'Index:Pywikibot test page 1'
+ }
+
+ def test_index_redlink(self):
+ """Test index property with redlink."""
+ page = ProofreadPage(self.site, self.with_redlink['title']['red'])
+ index_page = IndexPage(self.site, self.with_redlink['index'])
+ self.assertEqual(page.index, index_page)
+
+ def test_get_page_and_number_redlink(self):
+ """Test IndexPage page get_page_number functions with
redlinks."""
+ index_page = IndexPage(self.site, self.with_redlink['index'])
+
+ for title in self.with_redlink['title'].values():
+ p = ProofreadPage(self.site, title)
+ n = index_page.get_number(p)
+ self.assertEqual(index_page.get_page(n), p)
+
+ def test_page_gen_redlink(self):
+ """Test Index page generator with redlinks."""
+ index_page = IndexPage(self.site, self.with_redlink['index'])
+ proofread_pages = [ProofreadPage(self.site, page_title) for
+ page_title in self.with_redlink['title'].values()]
+
+ # Check start/end limits.
+ self.assertRaises(ValueError, index_page.page_gen, -1, 2)
+ self.assertRaises(ValueError, index_page.page_gen, 1, -1)
+ self.assertRaises(ValueError, index_page.page_gen, 2, 1)
+
+ gen = index_page.page_gen(1, None, filter_ql=range(5))
+ self.assertEqual(list(gen), proofread_pages)
+
+
if __name__ == '__main__':
try:
unittest.main()
--
To view, visit https://gerrit.wikimedia.org/r/244807
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8dea8731de76a56572fb8dd5865848b7a2d910d0
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb@gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup@gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus@gmx.de>
Gerrit-Reviewer: jenkins-bot <>