jenkins-bot merged this change.

View Change

Approvals: Dvorapa: Looks good to me, approved jenkins-bot: Verified
[bugfix] Re-enable script test for imageharvest.py

- imageharvest script test fails for Python 2.7 because the object
returned by urlopen has no __exit__ method and urllib.basejoin isn't
imported
- imageharvest script test fails for Python 3 because urllib has no
basejoin function
- urllib has been deprecated since Python 2.6.
Use comms.http.fetch instead of urlopen
- Use urljoin instead of basejoin. Python 2 already imported
urlparse.urljoin as urllib.basejoin
- Skip URLs if response.status is not 200.
An HTTP response status warning is already given.

Bug: T68102
Bug: T167726
Change-Id: I1deec4bc07065afb2cd7b425edb0673832aff0d6
---
M scripts/imageharvest.py
M tests/script_tests.py
2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/scripts/imageharvest.py b/scripts/imageharvest.py
index ff0a73b..60a6a3d 100644
--- a/scripts/imageharvest.py
+++ b/scripts/imageharvest.py
@@ -18,7 +18,7 @@
-justshown Choose _only_ images shown on the page, not those linked
"""
#
-# (C) Pywikibot team, 2004-2019
+# (C) Pywikibot team, 2004-2020
#
# Distributed under the terms of the MIT license.
#
@@ -34,14 +34,14 @@
import pywikibot

from pywikibot.bot import QuitKeyboardInterrupt
+from pywikibot.comms.http import fetch
from pywikibot.specialbots import UploadRobot
from pywikibot.tools import PY2

if not PY2:
- import urllib
- from urllib.request import urlopen
+ from urllib.parse import urljoin
else:
- from urllib import urlopen
+ from urlparse import urljoin

fileformats = ('jpg', 'jpeg', 'png', 'gif', 'svg', 'ogg')

@@ -52,9 +52,13 @@
if isinstance(BeautifulSoup, ImportError):
raise BeautifulSoup

- links = []
- with urlopen(url) as f:
- soup = BeautifulSoup(f.read(), 'html.parser')
+ response = fetch(url)
+ if response.status != 200:
+ pywikibot.output('Skipping url: {}'
+ .format(url))
+ return []
+
+ soup = BeautifulSoup(response.text, 'html.parser')

if not shown:
tagname = 'a'
@@ -63,12 +67,13 @@
else:
tagname = ['a', 'img']

+ links = []
for tag in soup.findAll(tagname):
link = tag.get('src', tag.get('href', None))
if link:
ext = os.path.splitext(link)[1].lower().strip('.')
if ext in fileformats:
- links.append(urllib.basejoin(url, link))
+ links.append(urljoin(url, link))
return links


diff --git a/tests/script_tests.py b/tests/script_tests.py
index dac48de..4e0102c 100644
--- a/tests/script_tests.py
+++ b/tests/script_tests.py
@@ -388,7 +388,6 @@

_allowed_failures = [
'disambredir',
- 'imageharvest', # T167726
'misspelling', # T94681
'watchlist', # T77965
'lonelypages', # T94680: uses exit code 1

To view, visit change 588145. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I1deec4bc07065afb2cd7b425edb0673832aff0d6
Gerrit-Change-Number: 588145
Gerrit-PatchSet: 4
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: jenkins-bot (75)