jenkins-bot has submitted this change and it was merged.
Change subject: proofreadpage.py: add ProofreadPage.index property and other methods ......................................................................
proofreadpage.py: add ProofreadPage.index property and other methods
Add: - ProofreadPage.index property to get Index page containing the page - IndexPage.page_gen() to load pages related to an Index page in specified page range (filters are available for quality level and page existence) - IndexPage.get_number() to get page number of a page - IndexPage.pages() to get the list of pages in Index
Rename: - IndexPage.get_page_from_number() to get_page()
bs4 is now mandatory for ProofreadPage if ProofreadPage.index is used.
Added and cleaned up docstrings.
Added related tests.
Change-Id: I9dab8c2e75dc27fe87500eac3202f14553525a82 --- M pywikibot/proofreadpage.py M tests/proofreadpage_tests.py 2 files changed, 238 insertions(+), 22 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index c04951c..29458b3 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -7,6 +7,7 @@ This module includes objects: * ProofreadPage(Page) * FullHeader +* IndexPage(Page)
""" # @@ -69,6 +70,12 @@ PROBLEMATIC = 2 PROOFREAD = 3 VALIDATED = 4 + PROOFREAD_LEVELS = [WITHOUT_TEXT, + NOT_PROOFREAD, + PROBLEMATIC, + PROOFREAD, + VALIDATED, + ]
open_tag = '<noinclude>' close_tag = '</noinclude>' @@ -78,7 +85,7 @@ def __init__(self, source, title=''): """Instantiate a ProofreadPage object.
- Raises UnknownExtension if source Site has no ProofreadPage Extension. + @raise UnknownExtension: source Site has no ProofreadPage Extension. """ if not isinstance(source, pywikibot.site.BaseSite): site = source.site @@ -89,6 +96,67 @@ if self.namespace() != site.proofread_page_ns: raise ValueError('Page %s must belong to %s namespace' % (self.title(), ns)) + # Ensure that constants are in line with Extension values. + if list(self.site.proofread_levels.keys()) != self.PROOFREAD_LEVELS: + raise ValueError('QLs do not match site values: %s != %s' + % (self.site.proofread_levels.keys(), + self.PROOFREAD_LEVELS)) + + @property + def index(self): + """Get the Index page which contains ProofreadPage. + + To force reload, delete index and call it again. + + Returns: + None: if ProofreadPage is linked to no or several Index pages + and no inference can be done from titles. + IndexPage: if ProofreadPage is linked to one Index page. + """ + if not hasattr(self, '_index'): + index_ns = self.site.proofread_index_ns + what_links_here = [IndexPage(page) for + page in self.getReferences(namespaces=index_ns)] + + if not what_links_here: + self._index = (None, []) + elif len(what_links_here) == 1: + self._index = (what_links_here[0], []) + else: + self._index = (None, what_links_here) + # Try to infer names from page titles. + base, sep, num = self.title(withNamespace=False).rpartition('/') + if sep == '/': + for page in what_links_here: + if page.title(withNamespace=False) == base: + what_links_here.remove(page) + self._index = (page, what_links_here) + break + + page, others = self._index + if others: + pywikibot.warning('Page %s is linked to several Index pages: %s.' + % (self, others)) + if page: + pywikibot.warning(' %s selected as Index.' % page) + pywikibot.warning(' %s remaining.' % others) + elif not page: + pywikibot.warning('Page %s is not linked to any Index page.' 
+ % self) + + return page + + @index.setter + def index(self, value): + if not isinstance(value, IndexPage): + raise ValueError('value %s must be a IndexPage object.' + % value) + self._index = (value, None) + + @index.deleter + def index(self): + if hasattr(self, "_index"): + del self._index
def decompose(fn): """Decorator. @@ -347,7 +415,15 @@ on de wikisource). page label is the label associated with a page in the Index page.
- Raises UnknownExtension if source Site has no ProofreadPage Extension. + This class provides methods to get pages contained in Index page, + and relative page numbers and labels by means of several helper + functions. + + It also provides a generator to pages contained in Index page, with + possibility to define range, filter by quality levels and page existence. + + @raise UnknownExtension: source Site has no ProofreadPage Extension. + @raise ImportError: bs4 is not installed. """ # Check if BeautifulSoup is imported. if isinstance(BeautifulSoup, ImportError): @@ -415,6 +491,7 @@ title = a_tag.get('title')
page = ProofreadPage(self.site, title) + page.index = self # set index property for page if page not in self._all_page_links: raise pywikibot.Error('Page %s not recognised.' % page)
@@ -458,6 +535,55 @@ """ return len(self._page_from_numbers)
+ def page_gen(self, start=1, end=None, filter_ql=None, + only_existing=False, content=True): + """Return a page generator which yields pages contained in Index page. + + Range is [start ... end], extremes included. + + @param start: first page, defaults to 1 + @type start: int + @param end: num_pages if end is None + @type end: int + @param filter_ql: filters quality levels + if None: all but 'Without Text'. + @type filter_ql: list of ints (corresponding to ql constants + defined in ProofreadPage). + @param only_existing: yields only existing pages. + @type only_existing: bool + @param content: preload content. + @type content: bool + """ + if end is None: + end = self.num_pages + + if not ((1 <= start <= self.num_pages) and + (1 <= end <= self.num_pages) and + (start <= end)): + raise ValueError('start=%s, end=%s are not in valid range (%s, %s)' + % (start, end, 1, self.num_pages)) + + # All but 'Without Text' + if filter_ql is None: + filter_ql = list(self.site.proofread_levels.keys()) + filter_ql.remove(ProofreadPage.WITHOUT_TEXT) + + gen = (self.get_page(i) for i in range(start, end + 1)) + if content: + gen = self.site.preloadpages(gen) + # Decorate and sort by page number because preloadpages does not + # guarantee order. + # TODO: remove if preloadpages will guarantee order. + gen = ((p, self.get_number(p)) for p in gen) + gen = (p[0] for p in sorted(gen, key=lambda x: x[1])) + # Filter by QL. + gen = (p for p in gen if p.ql in filter_ql) + # Yield only existing. + if only_existing: + gen = (p for p in gen if p.exists()) + + return gen + @check_if_cached def get_label_from_page(self, page): """Return 'page label' for page. @@ -486,7 +612,7 @@ try: return self._labels_from_page_number[page_number] except KeyError: - raise KeyError('Page number ".../%s" not range.' + raise KeyError('Page number ".../%s" not in range.' % page_number)
def _get_from_label(self, mapping_dict, label): @@ -523,14 +649,26 @@ return self._get_from_label(self._pages_from_label, label)
@check_if_cached - def get_page_from_number(self, page_number): - """Return a page object from page number. - - @param page_number: int - @return: page - @rtype: page object - """ + def get_page(self, page_number): + """Return a page object from page number.""" try: return self._page_from_numbers[page_number] except KeyError: raise KeyError('Invalid page number: %s.' % page_number) + + @check_if_cached + def pages(self): + """Return the list of pages in Index, sorted by page number. + + @return: list of pages + @rtype: list + """ + return [self._page_from_numbers[i] for i in range(1, self.num_pages + 1)] + + @check_if_cached + def get_number(self, page): + """Return a page number from page object.""" + try: + return self._numbers_from_page[page] + except KeyError: + raise KeyError('Invalid page: %s.' % page) diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py index 28dfb5f..e28d9a7 100644 --- a/tests/proofreadpage_tests.py +++ b/tests/proofreadpage_tests.py @@ -86,6 +86,7 @@
valid = { 'title': 'Page:Popular Science Monthly Volume 1.djvu/12', + 'index': 'Index:Popular Science Monthly Volume 1.djvu', 'ql': 4, 'user': 'T. Mazzei', 'header': u"{{rh|2|''THE POPULAR SCIENCE MONTHLY.''}}", @@ -94,6 +95,10 @@
existing_invalid = { 'title': 'Main Page', + } + + existing_unlinked = { + 'title': 'Page:Pywikibot unlinked test page', }
not_existing_invalid = { @@ -203,6 +208,61 @@
@require_modules('bs4') +class TestProofreadPageIndexProperty(TestCase): + + """Test ProofreadPage index property.""" + + family = 'wikisource' + code = 'en' + + cached = True + + valid = { + 'title': 'Page:Popular Science Monthly Volume 1.djvu/12', + 'index': 'Index:Popular Science Monthly Volume 1.djvu', + } + + existing_multilinked = { + 'title': 'Page:Pywikibot test page 1/1', + 'index_1': 'Index:Pywikibot test page 1', + 'index_2': 'Index:Pywikibot test page 2', + } + + existing_unlinked = { + 'title': 'Page:Pywikibot unlinked test page', + } + + def test_index(self): + """Test index property.""" + # Page with Index. + page = ProofreadPage(self.site, self.valid['title']) + index_page = IndexPage(self.site, self.valid['index']) + + # Test propery. + self.assertEqual(page.index, index_page) + + # Test deleter + del page.index + self.assertFalse(hasattr(page, '_index')) + # Test setter + page.index = index_page + self.assertEqual(page.index, index_page) + + # Page without Index. + page = ProofreadPage(self.site, self.existing_multilinked['title']) + index_page_1 = IndexPage(self.site, self.existing_multilinked['index_1']) + index_page_2 = IndexPage(self.site, self.existing_multilinked['index_2']) + self.assertEqual(page.index, index_page_1) + self.assertNotEqual(page.index, index_page_2) + self.assertEqual(page._index, (index_page_1, [index_page_2])) + + # Page without Index. + page = ProofreadPage(self.site, self.existing_unlinked['title']) + self.assertIs(page.index, None) + self.assertEqual(page._index, (None, [])) + + +@require_modules('bs4') class IndexPageTestCase(TestCase):
"""Run tests related to IndexPage ProofreadPage extension.""" @@ -270,7 +330,8 @@ self.assertEqual(page.namespace(), source.namespace)
-class TestBasePageMethodsIndexPage(IndexPageTestCase, BasePageMethodsTestBase): +@require_modules('bs4') +class TestBasePageMethodsIndexPage(BasePageMethodsTestBase):
"""Test behavior of ProofreadPage methods inherited from BasePage."""
@@ -439,19 +500,36 @@ # Error if label does not exists. self.assertRaises(KeyError, index_page.get_page_from_label, 'dummy label')
- # Test consistency of page <-> numbers mapping on last page_set and - # num_set used. - for p in page_set: - n = index_page._numbers_from_page[p] - self.assertEqual(index_page._page_from_numbers[n], p) + # Test get_page. for n in num_set: - p = index_page._page_from_numbers[n] - self.assertEqual(index_page._numbers_from_page[p], n) + p = index_page.get_page(n) + self.assertEqual(index_page.get_number(p), n)
- # Test get_page_from_number. - for n in num_set: - p = index_page.get_page_from_number(n) - self.assertEqual(index_page._numbers_from_page[p], n) + # Test get_number. + for p in page_set: + n = index_page.get_number(p) + self.assertEqual(index_page.get_page(n), p) + + def test_page_gen(self, key): + """Test Index page generator.""" + data = self.sites[key] + num, title_num, label = data['get_label'] + + index_page = IndexPage(self.site, self.sites[key]['index']) + page_title = self.sites[key]['page'].format(title_num) + proofread_page = ProofreadPage(self.site, page_title) + + # Check start/end limits. + self.assertRaises(ValueError, index_page.page_gen, -1, 2) + self.assertRaises(ValueError, index_page.page_gen, 1, -1) + self.assertRaises(ValueError, index_page.page_gen, 2, 1) + + # Check quality filters. + gen = index_page.page_gen(num, num, filter_ql=range(5)) + self.assertEqual(list(gen), [proofread_page]) + + gen = index_page.page_gen(num, num, filter_ql=[0]) + self.assertEqual(list(gen), [])
if __name__ == '__main__':
pywikibot-commits@lists.wikimedia.org