jenkins-bot has submitted this change. (
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/969516 )
Change subject: proofreadpage.py: fetch URL of page scan via API
......................................................................
proofreadpage.py: fetch URL of page scan via API
Fetch URL of Page image using new API for MW >= 1.40:
- query+prop=imageforpage
No more HTML page scraping is needed, except for MW version < 1.40.
This should also fix bug T181913; tests are re-added.
Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Bug: T352524
Bug: T181913
Bug: T114318
---
M tests/proofreadpage_tests.py
M pywikibot/page/_basepage.py
M pywikibot/site/_extensions.py
M pywikibot/data/api/_generators.py
M pywikibot/proofreadpage.py
5 files changed, 82 insertions(+), 9 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/data/api/_generators.py b/pywikibot/data/api/_generators.py
index 902c561..07f0338 100644
--- a/pywikibot/data/api/_generators.py
+++ b/pywikibot/data/api/_generators.py
@@ -1038,3 +1038,8 @@
page._lintinfo.pop('pageid')
page._lintinfo.pop('title')
page._lintinfo.pop('ns')
+
+ if 'imageforpage' in props and 'imagesforpage' in pagedict:
+ # proofreadpage will work always on dicts
+ # it serves also as workaround for T352482
+ page._imageforpage = pagedict['imagesforpage'] or {}
diff --git a/pywikibot/page/_basepage.py b/pywikibot/page/_basepage.py
index 796eac5..caef2f3 100644
--- a/pywikibot/page/_basepage.py
+++ b/pywikibot/page/_basepage.py
@@ -73,7 +73,7 @@
'_contentmodel', '_langlinks', '_isredir',
'_coords',
'_preloadedtext', '_timestamp',
'_applicable_protections',
'_flowinfo', '_quality', '_pageprops', '_revid',
'_quality_text',
- '_pageimage', '_item', '_lintinfo',
+ '_pageimage', '_item', '_lintinfo',
'_imageforpage',
)
def __init__(self, source, title: str = '', ns=0) -> None:
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index a4ae60f..7157316 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -54,7 +54,7 @@
from pywikibot.data.api import ListGenerator, Request
from pywikibot.exceptions import Error, InvalidTitleError, OtherPageSaveError
from pywikibot.page import PageSourceType
-from pywikibot.tools import cached
+from pywikibot.tools import MediaWikiVersion, cached
try:
@@ -825,9 +825,7 @@
"""
return f'/* {self.status} */ '
- @property
- @cached
- def url_image(self) -> str:
+ def __url_image_lt_140(self) -> str:
"""Get the file url of the scan of ProofreadPage.
:return: file url of the scan ProofreadPage or None.
@@ -864,6 +862,36 @@
return url_image
+ def __url_image(self) -> str:
+ """Get the file url of the scan of ProofreadPage.
+
+ :return: file url of the scan of ProofreadPage or None.
+ :raises ValueError: in case of no image found for scan
+ """
+ self.site.loadpageurls(self)
+ url = self._imageforpage.get('fullsize')
+ if url is not None:
+ return f'{self.site.family.protocol(self.site.code)}:{url}'
+ else:
+ raise ValueError(f'imagesforpage is empty for {self}.')
+
+ @property
+ @cached
+ def url_image(self) -> str:
+ """Get the file url of the scan of ProofreadPage.
+
+ :return: file url of the scan of ProofreadPage or None.
+
+ For MW version < 1.40:
+ :raises Exception: in case of http errors
+ :raises ImportError: if bs4 is not installed, _bs4_soup() will raise
+ :raises ValueError: in case of no prp_page_image src found for scan
+ """
+ if self.site.version() < MediaWikiVersion('1.40'):
+ return self.__url_image_lt_140()
+ else:
+ return self.__url_image()
+
def _ocr_callback(self, cmd_uri: str,
parser_func: Optional[Callable[[str], str]] = None,
ocr_tool: Optional[str] = None
diff --git a/pywikibot/site/_extensions.py b/pywikibot/site/_extensions.py
index f6c5859..4e6977e 100644
--- a/pywikibot/site/_extensions.py
+++ b/pywikibot/site/_extensions.py
@@ -141,6 +141,29 @@
self._cache_proofreadinfo()
return self._proofread_levels
+ @need_extension('ProofreadPage')
+ def loadpageurls(
+ self,
+ page: 'pywikibot.page.BasePage'
+ ) -> None:
+ """Load URLs from api and store in page attributes.
+
+ Load URLs to images for a given page in the "Page:" namespace.
+ No effect for pages in other namespaces.
+
+ .. seealso:: :api:`imageforpage`
+ """
+ title = page.title(with_section=False)
+ # responsiveimages: server would try to render the other images as well
+ # let's not load the server unless needed.
+ prppifpprop = 'filename|size|fullsize'
+
+ query = self._generator(api.PropertyGenerator,
+ type_arg='imageforpage',
+ titles=title.encode(self.encoding()),
+ prppifpprop=prppifpprop)
+ self._update_page(page, query)
+
class GeoDataMixin:
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 40a1728..13534e1 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -26,7 +26,6 @@
BasePageLoadRevisionsCachingTestBase,
BasePageMethodsTestBase,
)
-from tests.utils import skipping
class TestPagesTagParser(TestCase):
@@ -250,7 +249,7 @@
'footer': '\n{{smallrefs}}',
'url_image': ('https://upload.wikimedia.org/wikipedia/commons/'
'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
- 'page12-1024px-Popular_Science_Monthly_Volume_1.djvu'
+ 'page12-2267px-Popular_Science_Monthly_Volume_1.djvu'
'.jpg'),
}
@@ -412,8 +411,7 @@
page.url_image
page = ProofreadPage(self.site, self.valid_redlink['title'])
- with skipping(ValueError, msg='T181913, T114318'):
- self.assertEqual(page.url_image, self.valid_redlink['url_image'])
+ self.assertEqual(page.url_image, self.valid_redlink['url_image'])
class TestPageQuality(TestCase):
--
To view, visit
https://gerrit.wikimedia.org/r/c/pywikibot/core/+/969516
To unsubscribe, or for help writing mail filters, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Gerrit-Change-Number: 969516
Gerrit-PatchSet: 13
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Sohom Datta <sohomdatta1(a)gmail.com>
Gerrit-MessageType: merged