jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR]: Use proofreadpagesinindex query module

Use proofreadpagesinindex query module to get pages in IndexPage.

See:
- https://www.mediawiki.org/wiki/Extension:ProofreadPage/Index_pagination_API

Now self._all_page_links is a dict that contains all pages included in the
IndexPage. When pages are fetched via self.page_gen(), no new Page objects
are created; they are retrieved from self._all_page_links instead.

This also makes IndexPage more robust when getting links in the Page
namespace; see bug T307280.

Change-Id: I1d36dbde0ff12078c45c3e80c69912bbe4436039
---
M pywikibot/proofreadpage.py
1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 9b0650b..758ed41 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -47,7 +47,7 @@
Tuple,
)
from pywikibot.comms import http
-from pywikibot.data.api import Request
+from pywikibot.data.api import ListGenerator, Request
from pywikibot.exceptions import Error, OtherPageSaveError
from pywikibot.page import PageSourceType
from pywikibot.tools import cached
@@ -824,14 +824,30 @@
raise ValueError('Page {} must belong to {} namespace'
.format(self.title(), site.proofread_index_ns))

- self._all_page_links = set(
- self.site.pagelinks(self, namespaces=site.proofread_page_ns))
- # bug T307280
- self._all_page_links |= set(
- self.site.pagetemplates(self, namespaces=site.proofread_page_ns))
+ self._all_page_links = {}
+
+ for page in self._get_prp_index_pagelist():
+ self._all_page_links[page.title()] = page

self._cached = False

+ def _get_prp_index_pagelist(self):
+ """Get all pages in an IndexPage page list."""
+ site = self.site
+ ppi_args = {}
+ if hasattr(self, '_pageid'):
+ ppi_args['prppiipageid'] = str(self._pageid)
+ else:
+ ppi_args['prppiititle'] = self.title().encode(site.encoding())
+
+ ppi_gen = site._generator(ListGenerator, 'proofreadpagesinindex',
+ **ppi_args)
+ for item in ppi_gen:
+ page = ProofreadPage(site, item['title'])
+ page.page_offset = item['pageoffset']
+ page.index = self
+ yield page
+
@staticmethod
def _parse_redlink(href: str) -> Optional[str]:
"""Parse page title when link in Index is a redlink."""
@@ -839,7 +855,7 @@
r'/w/index\.php\?title=(.+?)&action=edit&redlink=1')
title = p_href.search(href)
if title:
- return title.group(1)
+ return title.group(1).replace('_', ' ')
return None

def save(self, *args: Any, **kwargs: Any) -> None: # See Page.save().
@@ -907,23 +923,27 @@
self._soup = _bs4_soup(self.get_parsed_page(True)) # type: ignore
# Do not search for "new" here, to avoid to skip purging if links
# to non-existing pages are present.
- attrs = {'class': re.compile('prp-pagequality')}
+ attrs = {'class': re.compile('prp-pagequality-[0-4]')}

# Search for attribute "prp-pagequality" in tags:
# Existing pages:
# <a href="/wiki/Page:xxx.djvu/n"
+ # class="prp-pagequality-0 quality0" or
+ # class="prp-index-pagelist-page prp-pagequality-0 quality0"
# title="Page:xxx.djvu/n">m
- # class="quality1 prp-pagequality-1"
# </a>
# Non-existing pages:
# <a href="/w/index.php?title=xxx&amp;action=edit&amp;redlink=1"
- # class="new"
+ # class="new prp-index-pagelist-page"
# title="Page:xxx.djvu/n (page does not exist)">m
# </a>

# Try to purge or raise ValueError.
found = self._soup.find_all('a', attrs=attrs)
- attrs = {'class': re.compile('prp-pagequality|new')}
+ attrs = {'class': re.compile('prp-pagequality-[0-4]|'
+ 'new prp-index-pagelist-page|'
+ 'prp-index-pagelist-page')
+ }
if not found:
self.purge()
self._soup = _bs4_soup(self.get_parsed_page(True)) # type: ignore
@@ -932,7 +952,6 @@
'Missing class="qualityN prp-pagequality-N" or '
'class="new" in: {}.'.format(self))

- # Search for attribute "prp-pagequality" or "new" in tags:
page_cnt = 0
for a_tag in self._soup.find_all('a', attrs=attrs):
label = a_tag.text.lstrip('0') # Label is not converted to int.
@@ -947,16 +966,12 @@
title = a_tag.get('title') # existing page

assert title is not None
- try:
- page = ProofreadPage(self.site, title)
- page.index = self # set index property for page
- page_cnt += 1
- except ValueError:
- # title is not in site.proofread_page_ns; do not consider it
- continue

- if page not in self._all_page_links:
- raise Error('Page {} not recognised.'.format(page))
+ try:
+ page = self._all_page_links[title]
+ page_cnt += 1
+ except KeyError:
+ continue

# In order to avoid to fetch other Page:title links outside
# the Pages section of the Index page; these should hopefully be
@@ -982,7 +997,8 @@
self._pages_from_label.setdefault(label, set()).add(page)

# Sanity check: all links to Page: ns must have been considered.
- assert set(self._labels_from_page) == set(self._all_page_links)
+ assert (set(self._labels_from_page)
+ == set(self._all_page_links.values()))

# Info cached.
self._cached = True
@@ -1036,8 +1052,8 @@
# Decorate and sort by page number because preloadpages does not
# guarantee order.
# TODO: remove if preloadpages will guarantee order.
- gen = ((p, self.get_number(p)) for p in gen)
- gen = (p[0] for p in sorted(gen, key=lambda x: x[1]))
+ gen = ((self.get_number(p), p) for p in gen)
+ gen = (p for n, p in sorted(gen))

return gen


To view, visit change 788789. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I1d36dbde0ff12078c45c3e80c69912bbe4436039
Gerrit-Change-Number: 788789
Gerrit-PatchSet: 5
Gerrit-Owner: Mpaa <mpaa.wiki@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged