jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR]: Use proofreadpagesinindex query module

Use proofreadpagesinindex query module to get pages in IndexPage.

See:
- https://www.mediawiki.org/wiki/Extension:ProofreadPage/Index_pagination_API

Now self._all_page_links is a dict that contains all pages included in the
IndexPage. When pages are fetched via self.page_gen(), no new Page objects
are created; they are retrieved from self._all_page_links instead.

This also makes IndexPage more robust when getting links in the Page
namespace; see bug T307280.

Change-Id: I1d36dbde0ff12078c45c3e80c69912bbe4436039
---
M pywikibot/proofreadpage.py
1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 9b0650b..758ed41 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -47,7 +47,7 @@
Tuple,
)
from pywikibot.comms import http
-from pywikibot.data.api import Request
+from pywikibot.data.api import ListGenerator, Request
from pywikibot.exceptions import Error, OtherPageSaveError
from pywikibot.page import PageSourceType
from pywikibot.tools import cached
@@ -824,14 +824,30 @@
raise ValueError('Page {} must belong to {} namespace'
.format(self.title(), site.proofread_index_ns))

- self._all_page_links = set(
- self.site.pagelinks(self, namespaces=site.proofread_page_ns))
- # bug T307280
- self._all_page_links |= set(
- self.site.pagetemplates(self, namespaces=site.proofread_page_ns))
+ self._all_page_links = {}
+
+ for page in self._get_prp_index_pagelist():
+ self._all_page_links[page.title()] = page

self._cached = False

+ def _get_prp_index_pagelist(self):
+ """Get all pages in an IndexPage page list."""
+ site = self.site
+ ppi_args = {}
+ if hasattr(self, '_pageid'):
+ ppi_args['prppiipageid'] = str(self._pageid)
+ else:
+ ppi_args['prppiititle'] = self.title().encode(site.encoding())
+
+ ppi_gen = site._generator(ListGenerator, 'proofreadpagesinindex',
+ **ppi_args)
+ for item in ppi_gen:
+ page = ProofreadPage(site, item['title'])
+ page.page_offset = item['pageoffset']
+ page.index = self
+ yield page
+
@staticmethod
def _parse_redlink(href: str) -> Optional[str]:
"""Parse page title when link in Index is a redlink."""
@@ -839,7 +855,7 @@
r'/w/index\.php\?title=(.+?)&action=edit&redlink=1')
title = p_href.search(href)
if title:
- return title.group(1)
+ return title.group(1).replace('_', ' ')
return None

def save(self, *args: Any, **kwargs: Any) -> None: # See Page.save().
@@ -907,23 +923,27 @@
self._soup = _bs4_soup(self.get_parsed_page(True)) # type: ignore
# Do not search for "new" here, to avoid to skip purging if links
# to non-existing pages are present.
- attrs = {'class': re.compile('prp-pagequality')}
+ attrs = {'class': re.compile('prp-pagequality-[0-4]')}

# Search for attribute "prp-pagequality" in tags:
# Existing pages:
# <a href="/wiki/Page:xxx.djvu/n"
+ # class="prp-pagequality-0 quality0" or
+ # class="prp-index-pagelist-page prp-pagequality-0 quality0"
# title="Page:xxx.djvu/n">m
- # class="quality1 prp-pagequality-1"
# </a>
# Non-existing pages:
# <a href="/w/index.php?title=xxx&amp;action=edit&amp;redlink=1"
- # class="new"
+ # class="new prp-index-pagelist-page"
# title="Page:xxx.djvu/n (page does not exist)">m
# </a>

# Try to purge or raise ValueError.
found = self._soup.find_all('a', attrs=attrs)
- attrs = {'class': re.compile('prp-pagequality|new')}
+ attrs = {'class': re.compile('prp-pagequality-[0-4]|'
+ 'new prp-index-pagelist-page|'
+ 'prp-index-pagelist-page')
+ }
if not found:
self.purge()
self._soup = _bs4_soup(self.get_parsed_page(True)) # type: ignore
@@ -932,7 +952,6 @@
'Missing class="qualityN prp-pagequality-N" or '
'class="new" in: {}.'.format(self))

- # Search for attribute "prp-pagequality" or "new" in tags:
page_cnt = 0
for a_tag in self._soup.find_all('a', attrs=attrs):
label = a_tag.text.lstrip('0') # Label is not converted to int.
@@ -947,16 +966,12 @@
title = a_tag.get('title') # existing page

assert title is not None
- try:
- page = ProofreadPage(self.site, title)
- page.index = self # set index property for page
- page_cnt += 1
- except ValueError:
- # title is not in site.proofread_page_ns; do not consider it
- continue

- if page not in self._all_page_links:
- raise Error('Page {} not recognised.'.format(page))
+ try:
+ page = self._all_page_links[title]
+ page_cnt += 1
+ except KeyError:
+ continue

# In order to avoid to fetch other Page:title links outside
# the Pages section of the Index page; these should hopefully be
@@ -982,7 +997,8 @@
self._pages_from_label.setdefault(label, set()).add(page)

# Sanity check: all links to Page: ns must have been considered.
- assert set(self._labels_from_page) == set(self._all_page_links)
+ assert (set(self._labels_from_page)
+ == set(self._all_page_links.values()))

# Info cached.
self._cached = True
@@ -1036,8 +1052,8 @@
# Decorate and sort by page number because preloadpages does not
# guarantee order.
# TODO: remove if preloadpages will guarantee order.
- gen = ((p, self.get_number(p)) for p in gen)
- gen = (p[0] for p in sorted(gen, key=lambda x: x[1]))
+ gen = ((self.get_number(p), p) for p in gen)
+ gen = (p for n, p in sorted(gen))

return gen


To view, visit change 788789. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I1d36dbde0ff12078c45c3e80c69912bbe4436039
Gerrit-Change-Number: 788789
Gerrit-PatchSet: 5
Gerrit-Owner: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged