jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
pagegenerators: Add -url option

Allow users to create page generators based on a URL pointing to a page containing page titles.
Works much like the -file argument, but instead of taking a local filename it takes a URL.

Bug: T239436
Change-Id: I08150994fb14f44afdc79bab086d13b2d2a74fc2
---
M pywikibot/pagegenerators.py
M scripts/interwiki.py
M scripts/movepages.py
M tests/pagegenerators_tests.py
4 files changed, 90 insertions(+), 37 deletions(-)

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d72d391..2ecd9f5 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -20,6 +20,7 @@
import calendar
import codecs
import datetime
+import io
import itertools
import json
import re
@@ -31,6 +32,7 @@
from http import HTTPStatus
from itertools import zip_longest
from typing import Optional, Union
+from urllib.parse import urlparse

from requests.exceptions import ReadTimeout

@@ -314,6 +316,11 @@

-querypage shows special pages available.

+-url Read a list of pages to treat from the provided URL.
+ The URL must return text in the same format as expected for
+ the -file argument, e.g. page titles separated by newlines
+ or enclosed in brackets.
+

FILTER OPTIONS
==============
@@ -812,6 +819,12 @@

return self.site.querypage(value)

+ def _handle_url(self, value):
+ """Handle `-url` argument."""
+ if not value:
+ value = pywikibot.input('Please enter the URL:')
+ return TextIOPageGenerator(value, site=self.site)
+
def _handle_unusedfiles(self, value):
"""Handle `-unusedfiles` argument."""
return self.site.unusedfiles(total=_int_none(value))
@@ -926,7 +939,7 @@
"""Handle `-file` argument."""
if not value:
value = pywikibot.input('Please enter the local file name:')
- return TextfilePageGenerator(value, site=self.site)
+ return TextIOPageGenerator(value, site=self.site)

def _handle_namespaces(self, value):
"""Handle `-namespaces` argument."""
@@ -1532,43 +1545,68 @@
content=content) # pragma: no cover


-def TextfilePageGenerator(filename: Optional[str] = None, site=None):
- """Iterate pages from a list in a text file.
+def _yield_titles(f: Union[codecs.StreamReaderWriter, io.StringIO],
+ site: pywikibot.Site):
+ """Yield page titles from a text stream.

- The file must contain page links between double-square-brackets or, in
- alternative, separated by newlines. The generator will yield each
+ :param f: text stream object
+ :type f: codecs.StreamReaderWriter, io.StringIO, or any other stream-like
+ object
+ :param site: Site for generator results.
+ :type site: :py:obj:`pywikibot.site.BaseSite`
+ :return: a generator that yields Page objects of pages with titles in text
+ stream
+ :rtype: generator
+ """
+ linkmatch = None
+ for linkmatch in pywikibot.link_regex.finditer(f.read()):
+ # If the link is in interwiki format, the Page object may reside
+ # on a different Site than the default.
+ # This makes it possible to work on different wikis using a single
+ # text file, but also could be dangerous because you might
+ # inadvertently change pages on another wiki!
+ yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'),
+ site))
+ if linkmatch is not None:
+ return
+
+ f.seek(0)
+ for title in f:
+ title = title.strip()
+ if '|' in title:
+ title = title[:title.index('|')]
+ if title:
+ yield pywikibot.Page(site, title)
+
+
+def TextIOPageGenerator(source: Optional[str] = None,
+ site: Optional[pywikibot.Site] = None):
+ """Iterate pages from a list in a text file or on a webpage.
+
+ The text source must contain page links between double-square-brackets or,
+ alternatively, separated by newlines. The generator will yield each
corresponding Page object.

- :param filename: the name of the file that should be read. If no name is
+ :param source: the file path or URL that should be read. If no name is
given, the generator prompts the user.
:param site: Site for generator results.
:type site: :py:obj:`pywikibot.site.BaseSite`

"""
- if filename is None:
- filename = pywikibot.input('Please enter the filename:')
+ if source is None:
+ source = pywikibot.input('Please enter the filename / URL:')
if site is None:
site = pywikibot.Site()
- with codecs.open(filename, 'r', config.textfile_encoding) as f:
- linkmatch = None
- for linkmatch in pywikibot.link_regex.finditer(f.read()):
- # If the link is in interwiki format, the Page object may reside
- # on a different Site than the default.
- # This makes it possible to work on different wikis using a single
- # text file, but also could be dangerous because you might
- # inadvertently change pages on another wiki!
- yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'),
- site))
- if linkmatch is not None:
- return
-
- f.seek(0)
- for title in f:
- title = title.strip()
- if '|' in title:
- title = title[:title.index('|')]
- if title:
- yield pywikibot.Page(site, title)
+ # If source cannot be parsed as an HTTP URL, treat as local file
+ if not urlparse(source).scheme:
+ with codecs.open(source, 'r', config.textfile_encoding) as f:
+ yield from _yield_titles(f, site)
+ # Else, fetch page (page should return text in same format as that expected
+ # in filename, i.e. pages separated by newlines or pages enclosed in double
+ # brackets).
+ else:
+ with io.StringIO(http.fetch(source).text) as f:
+ yield from _yield_titles(f, site)


def PagesFromTitlesGenerator(iterable, site=None):
@@ -2966,6 +3004,8 @@
PreloadingItemGenerator = redirect_func(PreloadingEntityGenerator,
old_name='PreloadingItemGenerator',
since='20170314')
+TextfilePageGenerator = redirect_func(
+ TextIOPageGenerator, old_name='TextfilePageGenerator', since='20210611')

if __name__ == '__main__': # pragma: no cover
pywikibot.output('Pagegenerators cannot be run as script - are you '
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index b32a085..3579259 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -512,7 +512,7 @@
if value.isdigit():
self.needlimit = int(value)
elif arg == 'skipfile':
- skip_page_gen = pagegenerators.TextfilePageGenerator(value)
+ skip_page_gen = pagegenerators.TextIOPageGenerator(value)
self.skip.update(skip_page_gen)
del skip_page_gen
elif arg == 'neverlink':
@@ -521,7 +521,7 @@
self.ignore += [pywikibot.Page(pywikibot.Site(), p)
for p in value.split(',')]
elif arg == 'ignorefile':
- ignore_page_gen = pagegenerators.TextfilePageGenerator(value)
+ ignore_page_gen = pagegenerators.TextIOPageGenerator(value)
self.ignore.update(ignore_page_gen)
del ignore_page_gen
elif arg == 'showpage':
@@ -2298,7 +2298,7 @@
continue

pywikibot.output('Retrieving pages from dump file ' + tail)
- for page in pagegenerators.TextfilePageGenerator(filename, site):
+ for page in pagegenerators.TextIOPageGenerator(filename, site):
if site == self.site:
self._next_page = page.title(with_ns=False) + '!'
self._next_namespace = page.namespace()
diff --git a/scripts/movepages.py b/scripts/movepages.py
index cefed76..eadd26a 100755
--- a/scripts/movepages.py
+++ b/scripts/movepages.py
@@ -197,7 +197,7 @@
else:
filename = arg[len('-pairsfile:'):]
oldName1 = None
- for page in pagegenerators.TextfilePageGenerator(filename):
+ for page in pagegenerators.TextIOPageGenerator(filename):
if oldName1:
fromToPairs.append([oldName1, page.title()])
oldName1 = None
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 120469c..f04f50d 100644
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -424,7 +424,7 @@
self.assertLength({item['revid'] for item in items}, self.length)


-class TestTextfilePageGenerator(DefaultSiteTestCase):
+class TestTextIOPageGenerator(DefaultSiteTestCase):

"""Test loading pages from a textfile."""

@@ -444,10 +444,10 @@
)

def test_brackets(self):
- """Test TextfilePageGenerator with brackets."""
+ """Test TextIOPageGenerator with brackets."""
filename = join_data_path('pagelist-brackets.txt')
site = self.get_site()
- titles = list(pagegenerators.TextfilePageGenerator(filename, site))
+ titles = list(pagegenerators.TextIOPageGenerator(filename, site))
self.assertLength(titles, self.expected_titles)
expected_titles = [
expected_title[self.title_columns[site.namespaces[page.namespace()]
@@ -456,10 +456,10 @@
self.assertPageTitlesEqual(titles, expected_titles)

def test_lines(self):
- """Test TextfilePageGenerator with newlines."""
+ """Test TextIOPageGenerator with newlines."""
filename = join_data_path('pagelist-lines.txt')
site = self.get_site()
- titles = list(pagegenerators.TextfilePageGenerator(filename, site))
+ titles = list(pagegenerators.TextIOPageGenerator(filename, site))
self.assertLength(titles, self.expected_titles)
expected_titles = [
expected_title[self.title_columns[site.namespaces[page.namespace()]
@@ -467,6 +467,19 @@
for expected_title, page in zip(self.expected_titles, titles)]
self.assertPageTitlesEqual(titles, expected_titles)

+ @unittest.mock.patch('pywikibot.comms.http.fetch', autospec=True)
+ def test_url(self, mock_fetch):
+ """Test TextIOPageGenerator with URL."""
+ # Mock return value of fetch()
+ fetch_return = unittest.mock.Mock()
+ fetch_return.text = '\n'.join(
+ [title[0] for title in self.expected_titles])
+ mock_fetch.return_value = fetch_return
+ site = self.get_site()
+ titles = list(
+ pagegenerators.TextIOPageGenerator('http://www.someurl.org', site))
+ self.assertLength(titles, self.expected_titles)
+

class TestYearPageGenerator(DefaultSiteTestCase):


To view, visit change 698655. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I08150994fb14f44afdc79bab086d13b2d2a74fc2
Gerrit-Change-Number: 698655
Gerrit-PatchSet: 5
Gerrit-Owner: Chris Maynor <cmchrismaynor@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged