jenkins-bot has submitted this change and it was merged.
Change subject: Memento support for weblinkchecker ......................................................................
Memento support for weblinkchecker
Deprecate pywikibot.weblib in favour of memento_client. The weblib function for WebCite has been broken for a few months, and the Internet Archive function has been broken for a week.
Bug: T85001 Change-Id: Iead920493610d0f78faead5cb5aee0921fd0e7bc --- M pywikibot/weblib.py M requirements.txt M scripts/weblinkchecker.py M setup.py M tests/__init__.py M tests/script_tests.py M tests/weblib_tests.py A tests/weblinkchecker_tests.py 8 files changed, 180 insertions(+), 14 deletions(-)
Approvals: XZise: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/weblib.py b/pywikibot/weblib.py index 50ff23e..44f8ffb 100644 --- a/pywikibot/weblib.py +++ b/pywikibot/weblib.py @@ -10,14 +10,17 @@ __version__ = '$Id$'
import sys + if sys.version_info[0] > 2: from urllib.parse import urlencode else: from urllib import urlencode
from pywikibot.comms import http +from pywikibot.tools import deprecated
+@deprecated('memento_client package') def getInternetArchiveURL(url, timestamp=None): """Return archived URL by Internet Archive.
@@ -46,6 +49,7 @@ return None
+@deprecated('memento_client package') def getWebCitationURL(url, timestamp=None): """Return archived URL by Web Citation.
diff --git a/requirements.txt b/requirements.txt index e12c55c..da61fb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -86,3 +86,6 @@
# scripts/states_redirect.py pycountry + +# scripts/weblinkchecker.py +git+https://github.com/mementoweb/py-memento-client#egg=memento_client-0.5.0 diff --git a/scripts/weblinkchecker.py b/scripts/weblinkchecker.py index 6aae020..64a01a3 100755 --- a/scripts/weblinkchecker.py +++ b/scripts/weblinkchecker.py @@ -91,7 +91,7 @@ """ # # (C) Daniel Herding, 2005 -# (C) Pywikibot team, 2005-2014 +# (C) Pywikibot team, 2005-2015 # # Distributed under the terms of the MIT license. # @@ -101,11 +101,19 @@
import re import codecs +import datetime import pickle import socket import threading import time import sys + +from warnings import warn + +try: + import memento_client +except ImportError as e: + memento_client = e
import pywikibot from pywikibot import i18n, config, pagegenerators, textlib, xmlreader, weblib @@ -155,6 +163,34 @@ # bot rejected on the site: re.compile(r'.*[./@]quickfacts.census.gov(/.*)?'), ] + + +def _get_closest_memento_url(url, when=None, timegate_uri=None): + """Get most recent memento for url.""" + if isinstance(memento_client, ImportError): + raise memento_client + + if not when: + when = datetime.datetime.now() + + mc = memento_client.MementoClient() + if timegate_uri: + mc.timegate_uri = timegate_uri + + memento_info = mc.get_memento_info(url, when) + return memento_info.get('mementos').get('closest').get('uri')[0] + + +def get_archive_url(url): + """Get archive URL.""" + try: + return _get_closest_memento_url( + url, + timegate_uri='http://web.archive.org/web/') + except Exception: + return _get_closest_memento_url( + url, + timegate_uri='http://timetravel.mementoweb.org/webcite/timegate/')
def weblinksIn(text, withoutBracketed=False, onlyBracketed=False): @@ -633,7 +669,15 @@ # We'll list it in a file so that it can be removed manually. if timeSinceFirstFound > 60 * 60 * 24 * day: # search for archived page - archiveURL = weblib.getInternetArchiveURL(url) + try: + archiveURL = get_archive_url(url) + except Exception as e: + pywikibot.warning( + 'get_archive_url({0}) failed: {1}'.format( + url, e)) + archiveURL = None + if archiveURL is None: + archiveURL = weblib.getInternetArchiveURL(url) if archiveURL is None: archiveURL = weblib.getWebCitationURL(url) self.log(url, error, page, archiveURL) @@ -863,6 +907,9 @@ HTTPignore = [] day = 7
+ if isinstance(memento_client, ImportError): + warn('memento_client not imported: %s' % memento_client, ImportWarning) + # Process global args and prepare generator args parser local_args = pywikibot.handle_args(args) genFactory = pagegenerators.GeneratorFactory() diff --git a/setup.py b/setup.py index eab2bcf..f541fa3 100644 --- a/setup.py +++ b/setup.py @@ -90,6 +90,7 @@ 'hg+https://bitbucket.org/TJG/pywin32#egg=pywin32', 'git+https://github.com/vasily-v-ryabov/pywinauto-64#egg=pywinauto', 'git+https://github.com/nlhepler/pydot#egg=pydot-1.0.29', + 'git+https://github.com/mementoweb/py-memento-client#egg=memento_client-0.5.0', ]
if PYTHON_VERSION < (2, 7, 3): diff --git a/tests/__init__.py b/tests/__init__.py index c5a7d42..704920d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -94,6 +94,9 @@
disabled_test_modules = [ 'tests', # tests of the tests package + # weblib is deprecated, the tests fail for weblib, + # but the tests are run in weblinkchecker_tests. + 'weblib', ] if not i18n.messages_available(): disabled_test_modules.append('l10n') diff --git a/tests/script_tests.py b/tests/script_tests.py index bd33b71..3fd9596 100644 --- a/tests/script_tests.py +++ b/tests/script_tests.py @@ -33,6 +33,7 @@ 'match_images': ['PIL.ImageTk'], 'states_redirect': ['pycountry'], 'patrol': ['mwlib'], + 'weblinkchecker': ['memento_client'], }
if sys.version_info < (2, 7): diff --git a/tests/weblib_tests.py b/tests/weblib_tests.py index 7c4ae97..65cc6a1 100644 --- a/tests/weblib_tests.py +++ b/tests/weblib_tests.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """Weblib test module.""" # -# (C) Pywikibot team, 2014 +# (C) Pywikibot team, 2014-2015 # # Distributed under the terms of the MIT license. # @@ -17,11 +17,11 @@
import pywikibot.weblib as weblib
-from tests.aspects import unittest, TestCase +from tests.aspects import unittest, DeprecationTestCase from tests.utils import PatchedHttp
-class TestInternetArchive(TestCase): +class TestInternetArchive(DeprecationTestCase):
"""Test weblib methods to access Internet Archive."""
@@ -36,11 +36,16 @@ # original content if that does not match self.assertIn('closest', response.content)
- def testInternetArchiveNewest(self): - """Test Internet Archive for newest https://google.com.""" + def _get_archive_url(self, url, date_string=None): with PatchedHttp(weblib, False) as p: p.after_fetch = self._test_response - archivedversion = weblib.getInternetArchiveURL('https://google.com') + archivedversion = weblib.getInternetArchiveURL(url, date_string) + self.assertOneDeprecation() + return archivedversion + + def testInternetArchiveNewest(self): + """Test Internet Archive for newest https://google.com.""" + archivedversion = self._get_archive_url('https://google.com') parsed = urlparse(archivedversion) self.assertIn(parsed.scheme, [u'http', u'https']) self.assertEqual(parsed.netloc, u'web.archive.org') @@ -48,9 +53,7 @@
def testInternetArchiveOlder(self): """Test Internet Archive for https://google.com as of June 2006.""" - with PatchedHttp(weblib, False) as p: - p.after_fetch = self._test_response - archivedversion = weblib.getInternetArchiveURL('https://google.com', '200606') + archivedversion = self._get_archive_url('https://google.com', '20060601') parsed = urlparse(archivedversion) self.assertIn(parsed.scheme, [u'http', u'https']) self.assertEqual(parsed.netloc, u'web.archive.org') @@ -58,7 +61,7 @@ self.assertIn('200606', parsed.path)
-class TestWebCite(TestCase): +class TestWebCite(DeprecationTestCase):
"""Test weblib methods to access WebCite."""
@@ -68,10 +71,14 @@ } }
- @unittest.expectedFailure + def _get_archive_url(self, url, date_string=None): + archivedversion = weblib.getWebCitationURL(url, date_string) + self.assertOneDeprecation() + return archivedversion + def testWebCiteOlder(self): """Test WebCite for https://google.com as of January 2013.""" - archivedversion = weblib.getWebCitationURL('https://google.com', '20130101') + archivedversion = self._get_archive_url('https://google.com', '20130101') self.assertEqual(archivedversion, 'http://www.webcitation.org/6DHSeh2L0')
diff --git a/tests/weblinkchecker_tests.py b/tests/weblinkchecker_tests.py new file mode 100644 index 0000000..280a2e9 --- /dev/null +++ b/tests/weblinkchecker_tests.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +"""weblinkchecker test module.""" +# +# (C) Pywikibot team, 2015 +# +# Distributed under the terms of the MIT license. +# +from __future__ import unicode_literals + +__version__ = '$Id$' + +import datetime + +from pywikibot.tools import PY2 +if not PY2: + from urllib.parse import urlparse +else: + from urlparse import urlparse + +from scripts import weblinkchecker + +from tests.aspects import unittest, TestCase, TestCaseBase +from tests import weblib_tests + + +class MementoTestBase(TestCaseBase): + + """Test memento client.""" + + @classmethod + def setUpClass(cls): + """Set up test class.""" + if isinstance(weblinkchecker.memento_client, ImportError): + raise unittest.SkipTest('memento_client not imported') + super(MementoTestBase, cls).setUpClass() + + def _get_archive_url(self, url, date_string=None): + if date_string is None: + when = datetime.datetime.now() + else: + when = datetime.datetime.strptime(date_string, '%Y%m%d') + return weblinkchecker._get_closest_memento_url( + url, + when, + self.timegate_uri) + + +class WeblibTestMementoInternetArchive(MementoTestBase, weblib_tests.TestInternetArchive): + + """Test InternetArchive Memento using old weblib tests.""" + + timegate_uri = 'http://web.archive.org/web/' + hostname = timegate_uri + + +class WeblibTestMementoWebCite(MementoTestBase, weblib_tests.TestWebCite): + + """Test WebCite Memento using old weblib tests.""" + + timegate_uri = 'http://timetravel.mementoweb.org/webcite/timegate/' + hostname = timegate_uri + + +class TestMementoWebCite(MementoTestBase): + + """New WebCite Memento tests.""" + + timegate_uri = 'http://timetravel.mementoweb.org/webcite/timegate/' + hostname = timegate_uri + + def test_newest(self): + """Test WebCite for newest https://google.com.""" + archivedversion = 
self._get_archive_url('https://google.com') + parsed = urlparse(archivedversion) + self.assertIn(parsed.scheme, ['http', 'https']) + self.assertEqual(parsed.netloc, 'www.webcitation.org') + + +class TestMementoDefault(MementoTestBase, TestCase): + + """Test InternetArchive is default Memento timegate.""" + + timegate_uri = None + net = True + + def test_newest(self): + """Test getting memento for newest https://google.com.""" + archivedversion = self._get_archive_url('https://google.com') + self.assertIsNotNone(archivedversion) + + def test_invalid(self): + """Test getting memento for invalid URL.""" + self.assertRaises(Exception, self._get_archive_url, 'invalid') + + +if __name__ == '__main__': + try: + unittest.main() + except SystemExit: + pass