jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/593055 )
Change subject: [FEAT] pagegenerators: handle protocols in -weblink ......................................................................
[FEAT] pagegenerators: handle protocols in -weblink
Currently, HTTP is hardcoded in -weblink. It's not possible to use another protocol.
This patch allows you to specify a protocol in this CLI parameter, in the form of a URL, such as -weblink:https://wikipedia.org. HTTP is kept as the default if none is given.
Added tests for -weblink and LinksearchPageGenerator class behind it.
Bug: T251310 Bug: T251308 Change-Id: I3804c46e3f037f1b03c3198404734a771a849f44 --- M pywikibot/pagegenerators.py M pywikibot/site/__init__.py M tests/pagegenerators_tests.py 3 files changed, 77 insertions(+), 13 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index d45c181..1521617 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -993,10 +993,6 @@ if not value: value = pywikibot.input( 'Pages with which weblink should be processed?') - # If url is * we make it None in order to search for every page - # with any URL. - if value == '*': - value = None return LinksearchPageGenerator(value, site=self.site)
def _handle_transcludes(self, value): @@ -2583,10 +2579,10 @@
@deprecated_args(link='url', euprotocol='protocol', step=None) def LinksearchPageGenerator(url, namespaces=None, total=None, - site=None, protocol='http'): + site=None, protocol=None): """Yield all pages that link to a certain URL, like Special:Linksearch.
- @param url: The URL to search for (without the protocol prefix); + @param url: The URL to search for (with ot without the protocol prefix); this may include a '*' as a wildcard, only at the start of the hostname @type url: str @@ -2594,8 +2590,11 @@ @type namespaces: list of int @param total: Maximum number of pages to retrieve in total @type total: int - @param site: Site for generator results. + @param site: Site for generator results @type site: L{pywikibot.site.BaseSite} + @param protocol: Protocol to search for, likely http or https, http by + default. Full list shown on Special:LinkSearch wikipage + @type protocol: str """ if site is None: site = pywikibot.Site() diff --git a/pywikibot/site/__init__.py b/pywikibot/site/__init__.py index 5b78630..d8896cb 100644 --- a/pywikibot/site/__init__.py +++ b/pywikibot/site/__init__.py @@ -4610,18 +4610,39 @@ return bkgen
@deprecated_args(step=None) - def exturlusage(self, url=None, protocol='http', namespaces=None, + def exturlusage(self, url=None, protocol=None, namespaces=None, total=None, content=False): """Iterate Pages that contain links to the given URL.
@see: U{https://www.mediawiki.org/wiki/API:Exturlusage%7D
- @param url: The URL to search for (without the protocol prefix); - this may include a '*' as a wildcard, only at the start of the - hostname - @param protocol: The protocol prefix (default: "http") - + @param url: The URL to search for (with ot without the protocol + prefix); this may include a '*' as a wildcard, only at the start + of the hostname + @type url: str + @param namespaces: list of namespace numbers to fetch contribs from + @type namespaces: list of int + @param total: Maximum number of pages to retrieve in total + @type total: int + @param protocol: Protocol to search for, likely http or https, http by + default. Full list shown on Special:LinkSearch wikipage + @type protocol: str """ + separator = '://' + if separator in url: + found_protocol = url[:url.index(separator)] + url = url[url.index(separator) + len(separator):] + if protocol and protocol != found_protocol: + raise ValueError('Protocol was specified, but a different one ' + 'was found in searched url') + protocol = found_protocol + if not protocol: + protocol = 'http' + + # If url is * we make it None in order to search for every page + # with any URL. + if url == '*': + url = None return self._generator(api.PageGenerator, type_arg='exturlusage', geuquery=url, geuprotocol=protocol, namespaces=namespaces, diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index ace3262..c2d1b3f 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -1626,6 +1626,50 @@ assert False # this shouldn't be reached
+class TestLinksearchPageGenerator(TestCase): + + """Tests for pagegenerators.LinksearchPageGenerator.""" + + family = 'wikipedia' + code = 'en' + + def test_weblink(self): + """Test -weblink.""" + cases = (('wikipedia.org', 'http://wikipedia.org'), + ('en.wikipedia.org', 'http://en.wikipedia.org'), + ('https://fr.wikipedia.org', 'https://fr.wikipedia.org'), + ('ftp://*', 'ftp://')) + + for search, expected in cases: + gf = pagegenerators.GeneratorFactory(site=self.site) + gf.handleArg('-weblink:%s' % search) + gf.handleArg('-ns:2') + gf.handleArg('-limit:1') + gen = gf.getCombinedGenerator() + genlist = list(gen) + self.assertLength(genlist, 1) + + page = genlist[0] + self.assertIsInstance(page, pywikibot.Page) + self.assertTrue(page.exists()) + self.assertEqual(page.namespace(), 2) + self.assertIn(expected, page.text) + + def test_double_opposite_protocols(self): + """Test LinksearchPageGenerator with two opposite protocols.""" + self.assertRaises(ValueError, pagegenerators.LinksearchPageGenerator, + 'http://w.wiki', protocol='https', site=self.site) + + def test_double_same_protocols(self): + """Test LinksearchPageGenerator with two same protocols.""" + gen = pagegenerators.LinksearchPageGenerator('https://w.wiki', + protocol='https', + site=self.site, + total=1) + self.assertIsInstance(gen, pywikibot.data.api.PageGenerator) + self.assertEqual(len(list(gen)), 1) + + if __name__ == '__main__': # pragma: no cover try: unittest.main()
pywikibot-commits@lists.wikimedia.org