jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/588145 )
Change subject: [bugfix] Re-enable script test for imageharvest.py ......................................................................
[bugfix] Re-enable script test for imageharvest.py
- imageharvest script tests fail for Python 2.7 because urlopen has no __exit__ method and urllib.basejoin isn't imported - imageharvest script tests fail for Python 3 because there is no basejoin function - urllib is deprecated since Python 2.6. Use comms.http.fetch in favour of urlopen - Use urljoin instead of basejoin. Python 2 already imported urlparse.urljoin as urllib.basejoin - Skip urls if response.status is not 200. An HTTP response status warning is already given.
Bug: T68102 Bug: T167726 Change-Id: I1deec4bc07065afb2cd7b425edb0673832aff0d6 --- M scripts/imageharvest.py M tests/script_tests.py 2 files changed, 13 insertions(+), 9 deletions(-)
Approvals: Dvorapa: Looks good to me, approved jenkins-bot: Verified
diff --git a/scripts/imageharvest.py b/scripts/imageharvest.py index ff0a73b..60a6a3d 100644 --- a/scripts/imageharvest.py +++ b/scripts/imageharvest.py @@ -18,7 +18,7 @@ -justshown Choose _only_ images shown on the page, not those linked """ # -# (C) Pywikibot team, 2004-2019 +# (C) Pywikibot team, 2004-2020 # # Distributed under the terms of the MIT license. # @@ -34,14 +34,14 @@ import pywikibot
from pywikibot.bot import QuitKeyboardInterrupt +from pywikibot.comms.http import fetch from pywikibot.specialbots import UploadRobot from pywikibot.tools import PY2
if not PY2: - import urllib - from urllib.request import urlopen + from urllib.parse import urljoin else: - from urllib import urlopen + from urlparse import urljoin
fileformats = ('jpg', 'jpeg', 'png', 'gif', 'svg', 'ogg')
@@ -52,9 +52,13 @@ if isinstance(BeautifulSoup, ImportError): raise BeautifulSoup
- links = [] - with urlopen(url) as f: - soup = BeautifulSoup(f.read(), 'html.parser') + response = fetch(url) + if response.status != 200: + pywikibot.output('Skipping url: {}' + .format(url)) + return [] + + soup = BeautifulSoup(response.text, 'html.parser')
if not shown: tagname = 'a' @@ -63,12 +67,13 @@ else: tagname = ['a', 'img']
+ links = [] for tag in soup.findAll(tagname): link = tag.get('src', tag.get('href', None)) if link: ext = os.path.splitext(link)[1].lower().strip('.') if ext in fileformats: - links.append(urllib.basejoin(url, link)) + links.append(urljoin(url, link)) return links
diff --git a/tests/script_tests.py b/tests/script_tests.py index dac48de..4e0102c 100644 --- a/tests/script_tests.py +++ b/tests/script_tests.py @@ -388,7 +388,6 @@
_allowed_failures = [ 'disambredir', - 'imageharvest', # T167726 'misspelling', # T94681 'watchlist', # T77965 'lonelypages', # T94680: uses exit code 1