jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/969516 )
Change subject: proofreadpage.py: fetch URL of page scan via API
......................................................................
proofreadpage.py: fetch URL of page scan via API
Fetch the URL of a Page image using the new API for MW >= 1.40:
- query+prop=imageforpage
HTML page scraping is no longer needed, except for MW versions < 1.40.
This should also fix bug T181913; the related tests are re-added.
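For reference, a minimal sketch of the raw query this change wraps (the
endpoint and page title here are illustrative; the patch itself goes
through pywikibot's api layer):

    import requests

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'imageforpage',
        # same props the patch requests; 'responsiveimages' is left out
        'prppifpprop': 'filename|size|fullsize',
        'titles': 'Page:Popular_Science_Monthly_Volume_1.djvu/12',
    }
    r = requests.get('https://en.wikisource.org/w/api.php', params=params)
    for page in r.json()['query']['pages'].values():
        # 'fullsize' is a protocol-relative URL; the patch prepends the
        # site protocol before returning it
        print(page.get('imagesforpage', {}).get('fullsize'))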
Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Bug: T352524
Bug: T181913
Bug: T114318
---
M tests/proofreadpage_tests.py
M pywikibot/page/_basepage.py
M pywikibot/site/_extensions.py
M pywikibot/data/api/_generators.py
M pywikibot/proofreadpage.py
5 files changed, 82 insertions(+), 9 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/data/api/_generators.py b/pywikibot/data/api/_generators.py
index 902c561..07f0338 100644
--- a/pywikibot/data/api/_generators.py
+++ b/pywikibot/data/api/_generators.py
@@ -1038,3 +1038,8 @@
page._lintinfo.pop('pageid')
page._lintinfo.pop('title')
page._lintinfo.pop('ns')
+
+ if 'imageforpage' in props and 'imagesforpage' in pagedict:
+ # proofreadpage will always work on dicts
+ # it also serves as a workaround for T352482
+ page._imageforpage = pagedict['imagesforpage'] or {}
diff --git a/pywikibot/page/_basepage.py b/pywikibot/page/_basepage.py
index 796eac5..caef2f3 100644
--- a/pywikibot/page/_basepage.py
+++ b/pywikibot/page/_basepage.py
@@ -73,7 +73,7 @@
'_contentmodel', '_langlinks', '_isredir', '_coords',
'_preloadedtext', '_timestamp', '_applicable_protections',
'_flowinfo', '_quality', '_pageprops', '_revid', '_quality_text',
- '_pageimage', '_item', '_lintinfo',
+ '_pageimage', '_item', '_lintinfo', '_imageforpage',
)
def __init__(self, source, title: str = '', ns=0) -> None:
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index a4ae60f..7157316 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -54,7 +54,7 @@
from pywikibot.data.api import ListGenerator, Request
from pywikibot.exceptions import Error, InvalidTitleError, OtherPageSaveError
from pywikibot.page import PageSourceType
-from pywikibot.tools import cached
+from pywikibot.tools import MediaWikiVersion, cached
try:
@@ -825,9 +825,7 @@
"""
return f'/* {self.status} */ '
- @property
- @cached
- def url_image(self) -> str:
+ def __url_image_lt_140(self) -> str:
"""Get the file url of the scan of ProofreadPage.
:return: file url of the scan of ProofreadPage or None.
@@ -864,6 +862,36 @@
return url_image
+ def __url_image(self) -> str:
+ """Get the file url of the scan of ProofreadPage.
+
+ :return: file url of the scan of ProofreadPage or None.
+ :raises ValueError: in case of no image found for scan
+ """
+ self.site.loadpageurls(self)
+ url = self._imageforpage.get('fullsize')
+ if url is not None:
+ return f'{self.site.family.protocol(self.site.code)}:{url}'
+ else:
+ raise ValueError(f'imagesforpage is empty for {self}.')
+
+ @property
+ @cached
+ def url_image(self) -> str:
+ """Get the file url of the scan of ProofreadPage.
+
+ :return: file url of the scan of ProofreadPage or None.
+
+ For MW version < 1.40:
+ :raises Exception: in case of http errors
+ :raises ImportError: if bs4 is not installed, _bs4_soup() will raise
+ :raises ValueError: in case of no prp_page_image src found for scan
+ """
+ if self.site.version() < MediaWikiVersion('1.40'):
+ return self.__url_image_lt_140()
+ else:
+ return self.__url_image()
+
def _ocr_callback(self, cmd_uri: str,
parser_func: Optional[Callable[[str], str]] = None,
ocr_tool: Optional[str] = None
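As a usage sketch (the site and page title are illustrative, taken from
the test data below), the property now dispatches on the MediaWiki
version internally:

    import pywikibot
    from pywikibot.proofreadpage import ProofreadPage

    site = pywikibot.Site('en', 'wikisource')
    page = ProofreadPage(site,
                         'Page:Popular Science Monthly Volume 1.djvu/12')
    # API-based lookup on MW >= 1.40, HTML scraping fallback otherwise
    print(page.url_image)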
diff --git a/pywikibot/site/_extensions.py b/pywikibot/site/_extensions.py
index f6c5859..4e6977e 100644
--- a/pywikibot/site/_extensions.py
+++ b/pywikibot/site/_extensions.py
@@ -141,6 +141,29 @@
self._cache_proofreadinfo()
return self._proofread_levels
+ @need_extension('ProofreadPage')
+ def loadpageurls(
+ self,
+ page: 'pywikibot.page.BasePage'
+ ) -> None:
+ """Load URLs from api and store in page attributes.
+
+ Load URLs to images for a given page in the "Page:" namespace.
+ No effect for pages in other namespaces.
+
+ .. seealso:: :api:`imageforpage`
+ """
+ title = page.title(with_section=False)
+ # responsiveimages is omitted: the server would try to render the
+ # other images as well; let's not load the server unless needed.
+ prppifpprop = 'filename|size|fullsize'
+
+ query = self._generator(api.PropertyGenerator,
+ type_arg='imageforpage',
+ titles=title.encode(self.encoding()),
+ prppifpprop=prppifpprop)
+ self._update_page(page, query)
+
class GeoDataMixin:
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 40a1728..13534e1 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -26,7 +26,6 @@
BasePageLoadRevisionsCachingTestBase,
BasePageMethodsTestBase,
)
-from tests.utils import skipping
class TestPagesTagParser(TestCase):
@@ -250,7 +249,7 @@
'footer': '\n{{smallrefs}}',
'url_image': ('https://upload.wikimedia.org/wikipedia/commons/'
'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
- 'page12-1024px-Popular_Science_Monthly_Volume_1.djvu'
+ 'page12-2267px-Popular_Science_Monthly_Volume_1.djvu'
'.jpg'),
}
@@ -412,8 +411,7 @@
page.url_image
page = ProofreadPage(self.site, self.valid_redlink['title'])
- with skipping(ValueError, msg='T181913, T114318'):
- self.assertEqual(page.url_image, self.valid_redlink['url_image'])
+ self.assertEqual(page.url_image, self.valid_redlink['url_image'])
class TestPageQuality(TestCase):
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/969516
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Gerrit-Change-Number: 969516
Gerrit-PatchSet: 13
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-CC: Sohom Datta <sohomdatta1(a)gmail.com>
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/978143 )
Change subject: proofreadpage: sort page names before loading pages
......................................................................
proofreadpage: sort page names before loading pages
Sort the list of pages before converting it to a preload generator.
Otherwise the sorted() operation would transform everything into a list,
loading all pages and defeating the purpose of having a generator.
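Equivalently (a condensed sketch using the same IndexPage helpers as the
patch; `index`, `site`, `start` and `end` are assumed to be in scope):

    # Page objects are created without content, so building and sorting
    # the list up front is cheap; only preloadpages fetches content.
    pages = [index.get_page(i) for i in range(start, end + 1)]
    decorated = sorted((index.get_number(p), p) for p in pages)
    pages = [p for _, p in decorated]
    gen = site.preloadpages(pages)  # lazy: content fetched in batches
    for page in gen:
        ...  # consume without materializing everything at once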
Change-Id: Ie8ca5d7dd37b7dd9fadca42ed5dcf339164fc427
---
M pywikibot/proofreadpage.py
1 file changed, 22 insertions(+), 6 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index e72b676..a4ae60f 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -1313,7 +1313,14 @@
filter_ql = list(self.site.proofread_levels)
filter_ql.remove(ProofreadPage.WITHOUT_TEXT)
- gen = (self.get_page(i) for i in range(start, end + 1))
+ gen = [self.get_page(i) for i in range(start, end + 1)]
+
+ # Decorate and sort by page number because preloadpages does not
+ # guarantee order.
+ # TODO: remove if preloadpages will guarantee order.
+ gen = [(self.get_number(p), p) for p in gen]
+ gen = [p for n, p in sorted(gen)]
+
if content:
gen = self.site.preloadpages(gen)
# Filter by QL.
@@ -1321,11 +1328,6 @@
# Yield only existing.
if only_existing:
gen = (p for p in gen if p.exists())
- # Decorate and sort by page number because preloadpages does not
- # guarantee order.
- # TODO: remove if preloadpages will guarantee order.
- gen = ((self.get_number(p), p) for p in gen)
- gen = (p for n, p in sorted(gen))
return gen
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/978143
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie8ca5d7dd37b7dd9fadca42ed5dcf339164fc427
Gerrit-Change-Number: 978143
Gerrit-PatchSet: 1
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
JJMC89 has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/979136 )
Change subject: [FIX] fix typo in explanation of options
......................................................................
[FIX] fix typo in explanation of options
Change-Id: Idc396cbf2ecab8043da1740ae86a4ea817e1f1c8
---
M scripts/weblinkchecker.py
1 file changed, 10 insertions(+), 1 deletion(-)
Approvals:
Mpaa: Looks good to me, approved
JJMC89: Verified; Looks good to me, approved
diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py
index a04d25d..02be28c 100755
--- a/scripts/weblinkchecker.py
+++ b/scripts/weblinkchecker.py
@@ -31,7 +31,7 @@
These command line parameters can be used to specify which pages to work on:
--repeat Work on all pages were dead links were found before. This is
+-repeat Work on all pages where dead links were found before. This is
useful to confirm that the links are dead after some time (at
least one week), which is required before the script will report
the problem.
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/979136
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Idc396cbf2ecab8043da1740ae86a4ea817e1f1c8
Gerrit-Change-Number: 979136
Gerrit-PatchSet: 2
Gerrit-Owner: Mevo1961 <mevo1961(a)gmail.com>
Gerrit-Reviewer: D3r1ck01 <dalangi-ctr(a)wikimedia.org>
Gerrit-Reviewer: JJMC89 <JJMC89.Wikimedia(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged