jenkins-bot merged this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[FEAT] pagegenerators: handle protocols in -weblink

Currently, http is hardcoded in -weblink. It's not possible
to use another protocol.

This patch allows you to specify a protocol in this CLI
parameter, in the form of a URL, such as
-weblink:https://wikipedia.org. Http is kept as default
if none is given.

Added tests for -weblink and LinksearchPageGenerator class
behind it.

Bug: T251310
Bug: T251308
Change-Id: I3804c46e3f037f1b03c3198404734a771a849f44
---
M pywikibot/pagegenerators.py
M pywikibot/site/__init__.py
M tests/pagegenerators_tests.py
3 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d45c181..1521617 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -993,10 +993,6 @@
if not value:
value = pywikibot.input(
'Pages with which weblink should be processed?')
- # If url is * we make it None in order to search for every page
- # with any URL.
- if value == '*':
- value = None
return LinksearchPageGenerator(value, site=self.site)

def _handle_transcludes(self, value):
@@ -2583,10 +2579,10 @@

@deprecated_args(link='url', euprotocol='protocol', step=None)
def LinksearchPageGenerator(url, namespaces=None, total=None,
- site=None, protocol='http'):
+ site=None, protocol=None):
"""Yield all pages that link to a certain URL, like Special:Linksearch.

- @param url: The URL to search for (without the protocol prefix);
+ @param url: The URL to search for (with or without the protocol prefix);
this may include a '*' as a wildcard, only at the start of the
hostname
@type url: str
@@ -2594,8 +2590,11 @@
@type namespaces: list of int
@param total: Maximum number of pages to retrieve in total
@type total: int
- @param site: Site for generator results.
+ @param site: Site for generator results
@type site: L{pywikibot.site.BaseSite}
+ @param protocol: Protocol to search for, likely http or https, http by
+ default. Full list shown on Special:LinkSearch wikipage
+ @type protocol: str
"""
if site is None:
site = pywikibot.Site()
diff --git a/pywikibot/site/__init__.py b/pywikibot/site/__init__.py
index 5b78630..d8896cb 100644
--- a/pywikibot/site/__init__.py
+++ b/pywikibot/site/__init__.py
@@ -4610,18 +4610,39 @@
return bkgen

@deprecated_args(step=None)
- def exturlusage(self, url=None, protocol='http', namespaces=None,
+ def exturlusage(self, url=None, protocol=None, namespaces=None,
total=None, content=False):
"""Iterate Pages that contain links to the given URL.

@see: U{https://www.mediawiki.org/wiki/API:Exturlusage}

- @param url: The URL to search for (without the protocol prefix);
- this may include a '*' as a wildcard, only at the start of the
- hostname
- @param protocol: The protocol prefix (default: "http")
-
+ @param url: The URL to search for (with or without the protocol
+ prefix); this may include a '*' as a wildcard, only at the start
+ of the hostname
+ @type url: str
+ @param namespaces: list of namespace numbers to fetch contribs from
+ @type namespaces: list of int
+ @param total: Maximum number of pages to retrieve in total
+ @type total: int
+ @param protocol: Protocol to search for, likely http or https, http by
+ default. Full list shown on Special:LinkSearch wikipage
+ @type protocol: str
"""
+ separator = '://'
+ if separator in url:
+ found_protocol = url[:url.index(separator)]
+ url = url[url.index(separator) + len(separator):]
+ if protocol and protocol != found_protocol:
+ raise ValueError('Protocol was specified, but a different one '
+ 'was found in searched url')
+ protocol = found_protocol
+ if not protocol:
+ protocol = 'http'
+
+ # If url is * we make it None in order to search for every page
+ # with any URL.
+ if url == '*':
+ url = None
return self._generator(api.PageGenerator, type_arg='exturlusage',
geuquery=url, geuprotocol=protocol,
namespaces=namespaces,
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index ace3262..c2d1b3f 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -1626,6 +1626,50 @@
assert False # this shouldn't be reached


+class TestLinksearchPageGenerator(TestCase):
+
+ """Tests for pagegenerators.LinksearchPageGenerator."""
+
+ family = 'wikipedia'
+ code = 'en'
+
+ def test_weblink(self):
+ """Test -weblink."""
+ cases = (('wikipedia.org', 'http://wikipedia.org'),
+ ('en.wikipedia.org', 'http://en.wikipedia.org'),
+ ('https://fr.wikipedia.org', 'https://fr.wikipedia.org'),
+ ('ftp://*', 'ftp://'))
+
+ for search, expected in cases:
+ gf = pagegenerators.GeneratorFactory(site=self.site)
+ gf.handleArg('-weblink:%s' % search)
+ gf.handleArg('-ns:2')
+ gf.handleArg('-limit:1')
+ gen = gf.getCombinedGenerator()
+ genlist = list(gen)
+ self.assertLength(genlist, 1)
+
+ page = genlist[0]
+ self.assertIsInstance(page, pywikibot.Page)
+ self.assertTrue(page.exists())
+ self.assertEqual(page.namespace(), 2)
+ self.assertIn(expected, page.text)
+
+ def test_double_opposite_protocols(self):
+ """Test LinksearchPageGenerator with two opposite protocols."""
+ self.assertRaises(ValueError, pagegenerators.LinksearchPageGenerator,
+ 'http://w.wiki', protocol='https', site=self.site)
+
+ def test_double_same_protocols(self):
+ """Test LinksearchPageGenerator with two same protocols."""
+ gen = pagegenerators.LinksearchPageGenerator('https://w.wiki',
+ protocol='https',
+ site=self.site,
+ total=1)
+ self.assertIsInstance(gen, pywikibot.data.api.PageGenerator)
+ self.assertEqual(len(list(gen)), 1)
+
+
if __name__ == '__main__': # pragma: no cover
try:
unittest.main()

To view, visit change 593055. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-MessageType: merged
Gerrit-Change-Id: I3804c46e3f037f1b03c3198404734a771a849f44
Gerrit-Change-Number: 593055
Gerrit-PatchSet: 4
Gerrit-Owner: Framawiki <framawiki@tools.wmflabs.org>
Gerrit-Reviewer: Dvorapa <dvorapa@seznam.cz>
Gerrit-Reviewer: Framawiki <framawiki@tools.wmflabs.org>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot (75)