jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/788789 )
Change subject: [IMPR]: Use proofreadpagesinindex query module ......................................................................
[IMPR]: Use proofreadpagesinindex query module
Use proofreadpagesinindex query module to get pages in IndexPage.
See: https://www.mediawiki.org/wiki/Extension:ProofreadPage/Index_pagination_API
Now self._all_page_links is a dict, keyed by page title, that contains all pages included in the IndexPage. When pages are fetched via self.page_gen(), no new Page objects are created; they are retrieved from self._all_page_links instead.
This also makes IndexPage more robust when getting links in the Page namespace; see bug T307280.
Change-Id: I1d36dbde0ff12078c45c3e80c69912bbe4436039 --- M pywikibot/proofreadpage.py 1 file changed, 40 insertions(+), 24 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py index 9b0650b..758ed41 100644 --- a/pywikibot/proofreadpage.py +++ b/pywikibot/proofreadpage.py @@ -47,7 +47,7 @@ Tuple, ) from pywikibot.comms import http -from pywikibot.data.api import Request +from pywikibot.data.api import ListGenerator, Request from pywikibot.exceptions import Error, OtherPageSaveError from pywikibot.page import PageSourceType from pywikibot.tools import cached @@ -824,14 +824,30 @@ raise ValueError('Page {} must belong to {} namespace' .format(self.title(), site.proofread_index_ns))
- self._all_page_links = set( - self.site.pagelinks(self, namespaces=site.proofread_page_ns)) - # bug T307280 - self._all_page_links |= set( - self.site.pagetemplates(self, namespaces=site.proofread_page_ns)) + self._all_page_links = {} + + for page in self._get_prp_index_pagelist(): + self._all_page_links[page.title()] = page
self._cached = False
+ def _get_prp_index_pagelist(self): + """Get all pages in an IndexPage page list.""" + site = self.site + ppi_args = {} + if hasattr(self, '_pageid'): + ppi_args['prppiipageid'] = str(self._pageid) + else: + ppi_args['prppiititle'] = self.title().encode(site.encoding()) + + ppi_gen = site._generator(ListGenerator, 'proofreadpagesinindex', + **ppi_args) + for item in ppi_gen: + page = ProofreadPage(site, item['title']) + page.page_offset = item['pageoffset'] + page.index = self + yield page + @staticmethod def _parse_redlink(href: str) -> Optional[str]: """Parse page title when link in Index is a redlink.""" @@ -839,7 +855,7 @@ r'/w/index.php?title=(.+?)&action=edit&redlink=1') title = p_href.search(href) if title: - return title.group(1) + return title.group(1).replace('_', ' ') return None
def save(self, *args: Any, **kwargs: Any) -> None: # See Page.save(). @@ -907,23 +923,27 @@ self._soup = _bs4_soup(self.get_parsed_page(True)) # type: ignore # Do not search for "new" here, to avoid to skip purging if links # to non-existing pages are present. - attrs = {'class': re.compile('prp-pagequality')} + attrs = {'class': re.compile('prp-pagequality-[0-4]')}
# Search for attribute "prp-pagequality" in tags: # Existing pages: # <a href="/wiki/Page:xxx.djvu/n" + # class="prp-pagequality-0 quality0" or + # class="prp-index-pagelist-page prp-pagequality-0 quality0" # title="Page:xxx.djvu/n">m - # class="quality1 prp-pagequality-1" # </a> # Non-existing pages: # <a href="/w/index.php?title=xxx&action=edit&redlink=1" - # class="new" + # class="new prp-index-pagelist-page" # title="Page:xxx.djvu/n (page does not exist)">m # </a>
# Try to purge or raise ValueError. found = self._soup.find_all('a', attrs=attrs) - attrs = {'class': re.compile('prp-pagequality|new')} + attrs = {'class': re.compile('prp-pagequality-[0-4]|' + 'new prp-index-pagelist-page|' + 'prp-index-pagelist-page') + } if not found: self.purge() self._soup = _bs4_soup(self.get_parsed_page(True)) # type: ignore @@ -932,7 +952,6 @@ 'Missing class="qualityN prp-pagequality-N" or ' 'class="new" in: {}.'.format(self))
- # Search for attribute "prp-pagequality" or "new" in tags: page_cnt = 0 for a_tag in self._soup.find_all('a', attrs=attrs): label = a_tag.text.lstrip('0') # Label is not converted to int. @@ -947,16 +966,12 @@ title = a_tag.get('title') # existing page
assert title is not None - try: - page = ProofreadPage(self.site, title) - page.index = self # set index property for page - page_cnt += 1 - except ValueError: - # title is not in site.proofread_page_ns; do not consider it - continue
- if page not in self._all_page_links: - raise Error('Page {} not recognised.'.format(page)) + try: + page = self._all_page_links[title] + page_cnt += 1 + except KeyError: + continue
# In order to avoid to fetch other Page:title links outside # the Pages section of the Index page; these should hopefully be @@ -982,7 +997,8 @@ self._pages_from_label.setdefault(label, set()).add(page)
# Sanity check: all links to Page: ns must have been considered. - assert set(self._labels_from_page) == set(self._all_page_links) + assert (set(self._labels_from_page) + == set(self._all_page_links.values()))
# Info cached. self._cached = True @@ -1036,8 +1052,8 @@ # Decorate and sort by page number because preloadpages does not # guarantee order. # TODO: remove if preloadpages will guarantee order. - gen = ((p, self.get_number(p)) for p in gen) - gen = (p[0] for p in sorted(gen, key=lambda x: x[1])) + gen = ((self.get_number(p), p) for p in gen) + gen = (p for n, p in sorted(gen))
return gen