jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/593055 )
Change subject: [FEAT] pagegenerators: handle protocols in -weblink ......................................................................
[FEAT] pagegenerators: handle protocols in -weblink
Currently, HTTP is hardcoded in -weblink. It's not possible to use another protocol.
This patch allows you to specify a protocol in this CLI parameter, in the form of a URL, such as -weblink:https://wikipedia.org. HTTP is kept as the default if none is given.
Added tests for -weblink and LinksearchPageGenerator class behind it.
Bug: T251310 Bug: T251308 Change-Id: I3804c46e3f037f1b03c3198404734a771a849f44 --- M pywikibot/pagegenerators.py M pywikibot/site/__init__.py M tests/pagegenerators_tests.py 3 files changed, 77 insertions(+), 13 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index d45c181..1521617 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -993,10 +993,6 @@ if not value: value = pywikibot.input( 'Pages with which weblink should be processed?') - # If url is * we make it None in order to search for every page - # with any URL. - if value == '*': - value = None return LinksearchPageGenerator(value, site=self.site)
def _handle_transcludes(self, value): @@ -2583,10 +2579,10 @@
@deprecated_args(link='url', euprotocol='protocol', step=None) def LinksearchPageGenerator(url, namespaces=None, total=None, - site=None, protocol='http'): + site=None, protocol=None): """Yield all pages that link to a certain URL, like Special:Linksearch.
- @param url: The URL to search for (without the protocol prefix); + @param url: The URL to search for (with ot without the protocol prefix); this may include a '*' as a wildcard, only at the start of the hostname @type url: str @@ -2594,8 +2590,11 @@ @type namespaces: list of int @param total: Maximum number of pages to retrieve in total @type total: int - @param site: Site for generator results. + @param site: Site for generator results @type site: L{pywikibot.site.BaseSite} + @param protocol: Protocol to search for, likely http or https, http by + default. Full list shown on Special:LinkSearch wikipage + @type protocol: str """ if site is None: site = pywikibot.Site() diff --git a/pywikibot/site/__init__.py b/pywikibot/site/__init__.py index 5b78630..d8896cb 100644 --- a/pywikibot/site/__init__.py +++ b/pywikibot/site/__init__.py @@ -4610,18 +4610,39 @@ return bkgen
@deprecated_args(step=None) - def exturlusage(self, url=None, protocol='http', namespaces=None, + def exturlusage(self, url=None, protocol=None, namespaces=None, total=None, content=False): """Iterate Pages that contain links to the given URL.
@see: U{https://www.mediawiki.org/wiki/API:Exturlusage%7D
- @param url: The URL to search for (without the protocol prefix); - this may include a '*' as a wildcard, only at the start of the - hostname - @param protocol: The protocol prefix (default: "http") - + @param url: The URL to search for (with ot without the protocol + prefix); this may include a '*' as a wildcard, only at the start + of the hostname + @type url: str + @param namespaces: list of namespace numbers to fetch contribs from + @type namespaces: list of int + @param total: Maximum number of pages to retrieve in total + @type total: int + @param protocol: Protocol to search for, likely http or https, http by + default. Full list shown on Special:LinkSearch wikipage + @type protocol: str """ + separator = '://' + if separator in url: + found_protocol = url[:url.index(separator)] + url = url[url.index(separator) + len(separator):] + if protocol and protocol != found_protocol: + raise ValueError('Protocol was specified, but a different one ' + 'was found in searched url') + protocol = found_protocol + if not protocol: + protocol = 'http' + + # If url is * we make it None in order to search for every page + # with any URL. + if url == '*': + url = None return self._generator(api.PageGenerator, type_arg='exturlusage', geuquery=url, geuprotocol=protocol, namespaces=namespaces, diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index ace3262..c2d1b3f 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -1626,6 +1626,50 @@ assert False # this shouldn't be reached
+class TestLinksearchPageGenerator(TestCase): + + """Tests for pagegenerators.LinksearchPageGenerator.""" + + family = 'wikipedia' + code = 'en' + + def test_weblink(self): + """Test -weblink.""" + cases = (('wikipedia.org', 'http://wikipedia.org'), + ('en.wikipedia.org', 'http://en.wikipedia.org'), + ('https://fr.wikipedia.org', 'https://fr.wikipedia.org'), + ('ftp://*', 'ftp://')) + + for search, expected in cases: + gf = pagegenerators.GeneratorFactory(site=self.site) + gf.handleArg('-weblink:%s' % search) + gf.handleArg('-ns:2') + gf.handleArg('-limit:1') + gen = gf.getCombinedGenerator() + genlist = list(gen) + self.assertLength(genlist, 1) + + page = genlist[0] + self.assertIsInstance(page, pywikibot.Page) + self.assertTrue(page.exists()) + self.assertEqual(page.namespace(), 2) + self.assertIn(expected, page.text) + + def test_double_opposite_protocols(self): + """Test LinksearchPageGenerator with two opposite protocols.""" + self.assertRaises(ValueError, pagegenerators.LinksearchPageGenerator, + 'http://w.wiki', protocol='https', site=self.site) + + def test_double_same_protocols(self): + """Test LinksearchPageGenerator with two same protocols.""" + gen = pagegenerators.LinksearchPageGenerator('https://w.wiki', + protocol='https', + site=self.site, + total=1) + self.assertIsInstance(gen, pywikibot.data.api.PageGenerator) + self.assertEqual(len(list(gen)), 1) + + if __name__ == '__main__': # pragma: no cover try: unittest.main()
pywikibot-commits@lists.wikimedia.org