jenkins-bot has submitted this change and it was merged.
Change subject: proofreadpage.py: manage also non existing links in Index page ......................................................................
proofreadpage.py: manage also non existing links in Index page
Add the possibility to handle non-existing pages when querying labels/numbers.
Do not force the namespace in ProofreadPage/IndexPage.__init__(), so that the namespace check is now effective.
Change-Id: I8dea8731de76a56572fb8dd5865848b7a2d910d0 --- M pywikibot/proofreadpage.py M tests/proofreadpage_tests.py 2 files changed, 99 insertions(+), 28 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index 29458b3..d27b08b 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -91,11 +91,10 @@ site = source.site else: site = source - ns = site.proofread_page_ns - super(ProofreadPage, self).__init__(source, title, ns=ns) + super(ProofreadPage, self).__init__(source, title) if self.namespace() != site.proofread_page_ns: raise ValueError('Page %s must belong to %s namespace' - % (self.title(), ns)) + % (self.title(), site.proofread_page_ns)) # Ensure that constants are in line with Extension values. if list(self.site.proofread_levels.keys()) != self.PROOFREAD_LEVELS: raise ValueError('QLs do not match site values: %s != %s' @@ -401,8 +400,6 @@
"""Index Page page used in Mediawiki ProofreadPage extension."""
- # TODO: handle not existing pages when quering labels/nubers? - # Currently APIError is thrown. def __init__(self, source, title=''): """Instantiate a IndexPage object.
@@ -433,14 +430,13 @@ site = source.site else: site = source - ns = site.proofread_index_ns - super(IndexPage, self).__init__(source, title, ns=site.proofread_index_ns) + super(IndexPage, self).__init__(source, title) if self.namespace() != site.proofread_index_ns: raise ValueError('Page %s must belong to %s namespace' - % (self.title(), ns)) + % (self.title(), site.proofread_index_ns))
self._all_page_links = set( - self.site.pagelinks(self, namespaces=self.site.proofread_page_ns)) + self.site.pagelinks(self, namespaces=site.proofread_page_ns))
self._cached = False
@@ -451,6 +447,15 @@ self._get_page_mappings() return fn(self, *args, **kwargs) return wrapper + + def _parse_redlink(self, href): + """Parse page title when link in Index is a redlink.""" + p_href = re.compile('/w/index.php?title=(.+?)&action=edit&redlink=1') + title = p_href.search(href) + if title: + return title.group(1) + else: + return None
def _get_page_mappings(self): """Associate label and number for each page linked to the index.""" @@ -466,13 +471,20 @@
self._parsed_text = self._get_parsed_page() self._soup = BeautifulSoup(self._parsed_text, 'html.parser') - attrs = {'class': re.compile('prp-pagequality')} + attrs = {'class': re.compile('prp-pagequality|new')}
- # Search for attribute "prp-pagequality" in tags like: - # <a class="quality1 prp-pagequality-1" - # href="/wiki/Page:xxx.djvu/n" + # Search for attribute "prp-pagequality" in tags: + # Existing pages: + # <a href="/wiki/Page:xxx.djvu/n" # title="Page:xxx.djvu/n">m + # class="quality1 prp-pagequality-1" + # </a> or + # Non-existing pages: + # <a href="/w/index.php?title=xxx&action=edit&redlink=1" + # class="new" + # title="Page:xxx.djvu/n (page does not exist)">m # </a> + # Try to purge or raise ValueError. if not self._soup.find_all('a', attrs=attrs): self.purge() @@ -481,17 +493,30 @@ self._soup = BeautifulSoup(self._parsed_text, 'html.parser') if not self._soup.find_all('a', attrs=attrs): raise ValueError( - 'Missing class="qualityN prp-pagequality-N" in: %s.' + 'Missing class="qualityN prp-pagequality-N" or' + 'class="new" in: %s.' % self)
page_cnt = 0 for a_tag in self._soup.find_all('a', attrs=attrs): - page_cnt += 1 label = a_tag.text.lstrip('0') # Label is not converted to int. - title = a_tag.get('title') + class_ = a_tag.get('class') + href = a_tag.get('href')
- page = ProofreadPage(self.site, title) - page.index = self # set index property for page + if 'new' in class_: + title = self._parse_redlink(href) # non-existing page + if title is None: # title not conforming to required format + continue + else: + title = a_tag.get('title') # existing page + try: + page = ProofreadPage(self.site, title) + page.index = self # set index property for page + page_cnt += 1 + except ValueError: + # title is not in site.proofread_page_ns; do not consider it + continue + if page not in self._all_page_links: raise pywikibot.Error('Page %s not recognised.' % page)
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index e28d9a7..67927fd 100644 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -108,7 +108,7 @@
def test_valid_site_source(self): """Test ProofreadPage from valid Site as source.""" - page = ProofreadPage(self.site, 'title') + page = ProofreadPage(self.site, 'Page:dummy test page') self.assertEqual(page.namespace(), self.site.proofread_page_ns)
def test_invalid_existing_page_source(self): @@ -166,7 +166,7 @@
def test_preload_from_not_existing_page(self): """Test ProofreadPage page decomposing/composing text.""" - page = ProofreadPage(self.site, 'dummy test page') + page = ProofreadPage(self.site, 'Page:dummy test page') self.assertEqual(page.text, '<noinclude><pagequality level="1" user="%s" />' '<div class="pagetext">\n\n\n</noinclude>' @@ -175,7 +175,7 @@
def test_preload_from_empty_text(self): """Test ProofreadPage page decomposing/composing text.""" - page = ProofreadPage(self.site, 'dummy test page') + page = ProofreadPage(self.site, 'Page:dummy test page') page.text = '' self.assertEqual(page.text, '<noinclude><pagequality level="1" user="%s" />' @@ -300,7 +300,7 @@
def test_valid_site_as_source(self): """Test IndexPage from valid Site as source.""" - page = IndexPage(self.site, 'title') + page = IndexPage(self.site, 'Index:dummy test page') self.assertEqual(page.namespace(), self.site.proofread_index_ns)
def test_invalid_existing_page_as_source(self): @@ -377,9 +377,9 @@ 'enws': { 'family': 'wikisource', 'code': 'en', - 'index': 'Popular Science Monthly Volume 1.djvu', + 'index': 'Index:Popular Science Monthly Volume 1.djvu', 'num_pages': 804, - 'page': 'Popular Science Monthly Volume 1.djvu/{0}', + 'page': 'Page:Popular Science Monthly Volume 1.djvu/{0}', 'get_label': [11, 11, '1'], 'get_number': [[1, set([11])], ['Cvr', set([1, 9, 10, 804])], @@ -389,9 +389,9 @@ 'dews': { # dews does not use page convention name/number. 'family': 'wikisource', 'code': 'de', - 'index': 'Musen-Almanach für das Jahr 1799', + 'index': 'Index:Musen-Almanach für das Jahr 1799', 'num_pages': 272, - 'page': 'Schiller_Musenalmanach_1799_{0:3d}.jpg', + 'page': 'Seite:Schiller_Musenalmanach_1799_{0:3d}.jpg', 'get_label': [120, 120, '120'], # page no, title no, label 'get_number': [[120, set([120])], ], @@ -400,9 +400,9 @@ 'frws': { 'family': 'wikisource', 'code': 'fr', - 'index': 'Segard - Hymnes profanes, 1894.djvu', + 'index': 'Index:Segard - Hymnes profanes, 1894.djvu', 'num_pages': 107, - 'page': 'Segard - Hymnes profanes, 1894.djvu/{0}', + 'page': 'Page:Segard - Hymnes profanes, 1894.djvu/{0}', 'get_label': [11, 11, '8'], 'get_number': [[8, set([11])], ['-', set(range(1, 4)) | set(range(101, 108))], @@ -532,6 +532,52 @@ self.assertEqual(list(gen), [])
+class TestIndexPageMappingsRedlinks(IndexPageTestCase): + + """Test IndexPage mappings with redlinks.""" + + family = 'wikisource' + code = 'en' + + cached = True + + with_redlink = { + 'title': {'blue': 'Page:Pywikibot test page 1/1', + 'red': 'Page:Pywikibot test page 2/2', + }, + 'index': 'Index:Pywikibot test page 1' + } + + def test_index_redlink(self): + """Test index property with redlink.""" + page = ProofreadPage(self.site, self.with_redlink['title']['red']) + index_page = IndexPage(self.site, self.with_redlink['index']) + self.assertEqual(page.index, index_page) + + def test_get_page_and_number_redlink(self): + """Test IndexPage page get_page_number functions with redlinks.""" + index_page = IndexPage(self.site, self.with_redlink['index']) + + for title in self.with_redlink['title'].values(): + p = ProofreadPage(self.site, title) + n = index_page.get_number(p) + self.assertEqual(index_page.get_page(n), p) + + def test_page_gen_redlink(self): + """Test Index page generator with redlinks.""" + index_page = IndexPage(self.site, self.with_redlink['index']) + proofread_pages = [ProofreadPage(self.site, page_title) for + page_title in self.with_redlink['title'].values()] + + # Check start/end limits. + self.assertRaises(ValueError, index_page.page_gen, -1, 2) + self.assertRaises(ValueError, index_page.page_gen, 1, -1) + self.assertRaises(ValueError, index_page.page_gen, 2, 1) + + gen = index_page.page_gen(1, None, filter_ql=range(5)) + self.assertEqual(list(gen), proofread_pages) + + if __name__ == '__main__': try: unittest.main()