jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
pagegenerators: Add -url option

Allow users to create page generators based on a URL pointing to a page containing page titles.
Works much like the -file argument, but instead of taking a local filename it takes a URL.

Bug: T239436
Change-Id: I08150994fb14f44afdc79bab086d13b2d2a74fc2
---
M pywikibot/pagegenerators.py
M scripts/interwiki.py
M scripts/movepages.py
M tests/pagegenerators_tests.py
4 files changed, 90 insertions(+), 37 deletions(-)

diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index d72d391..2ecd9f5 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -20,6 +20,7 @@
import calendar
import codecs
import datetime
+import io
import itertools
import json
import re
@@ -31,6 +32,7 @@
from http import HTTPStatus
from itertools import zip_longest
from typing import Optional, Union
+from urllib.parse import urlparse

from requests.exceptions import ReadTimeout

@@ -314,6 +316,11 @@

-querypage shows special pages available.

+-url Read a list of pages to treat from the provided URL.
+ The URL must return text in the same format as expected for
+ the -file argument, e.g. page titles separated by newlines
+ or enclosed in brackets.
+

FILTER OPTIONS
==============
@@ -812,6 +819,12 @@

return self.site.querypage(value)

+ def _handle_url(self, value):
+ """Handle `-url` argument."""
+ if not value:
+ value = pywikibot.input('Please enter the URL:')
+ return TextIOPageGenerator(value, site=self.site)
+
def _handle_unusedfiles(self, value):
"""Handle `-unusedfiles` argument."""
return self.site.unusedfiles(total=_int_none(value))
@@ -926,7 +939,7 @@
"""Handle `-file` argument."""
if not value:
value = pywikibot.input('Please enter the local file name:')
- return TextfilePageGenerator(value, site=self.site)
+ return TextIOPageGenerator(value, site=self.site)

def _handle_namespaces(self, value):
"""Handle `-namespaces` argument."""
@@ -1532,43 +1545,68 @@
content=content) # pragma: no cover


-def TextfilePageGenerator(filename: Optional[str] = None, site=None):
- """Iterate pages from a list in a text file.
+def _yield_titles(f: Union[codecs.StreamReaderWriter, io.StringIO],
+ site: pywikibot.Site):
+ """Yield page titles from a text stream.

- The file must contain page links between double-square-brackets or, in
- alternative, separated by newlines. The generator will yield each
+ :param f: text stream object
+ :type f: codecs.StreamReaderWriter, io.StringIO, or any other stream-like
+ object
+ :param site: Site for generator results.
+ :type site: :py:obj:`pywikibot.site.BaseSite`
+ :return: a generator that yields Page objects of pages with titles in text
+ stream
+ :rtype: generator
+ """
+ linkmatch = None
+ for linkmatch in pywikibot.link_regex.finditer(f.read()):
+ # If the link is in interwiki format, the Page object may reside
+ # on a different Site than the default.
+ # This makes it possible to work on different wikis using a single
+ # text file, but also could be dangerous because you might
+ # inadvertently change pages on another wiki!
+ yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'),
+ site))
+ if linkmatch is not None:
+ return
+
+ f.seek(0)
+ for title in f:
+ title = title.strip()
+ if '|' in title:
+ title = title[:title.index('|')]
+ if title:
+ yield pywikibot.Page(site, title)
+
+
+def TextIOPageGenerator(source: Optional[str] = None,
+ site: Optional[pywikibot.Site] = None):
+ """Iterate pages from a list in a text file or on a webpage.
+
+ The text source must contain page links between double-square-brackets or,
+ alternatively, separated by newlines. The generator will yield each
corresponding Page object.

- :param filename: the name of the file that should be read. If no name is
+ :param source: the file path or URL that should be read. If no name is
given, the generator prompts the user.
:param site: Site for generator results.
:type site: :py:obj:`pywikibot.site.BaseSite`

"""
- if filename is None:
- filename = pywikibot.input('Please enter the filename:')
+ if source is None:
+ source = pywikibot.input('Please enter the filename / URL:')
if site is None:
site = pywikibot.Site()
- with codecs.open(filename, 'r', config.textfile_encoding) as f:
- linkmatch = None
- for linkmatch in pywikibot.link_regex.finditer(f.read()):
- # If the link is in interwiki format, the Page object may reside
- # on a different Site than the default.
- # This makes it possible to work on different wikis using a single
- # text file, but also could be dangerous because you might
- # inadvertently change pages on another wiki!
- yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'),
- site))
- if linkmatch is not None:
- return
-
- f.seek(0)
- for title in f:
- title = title.strip()
- if '|' in title:
- title = title[:title.index('|')]
- if title:
- yield pywikibot.Page(site, title)
+ # If source cannot be parsed as an HTTP URL, treat as local file
+ if not urlparse(source).scheme:
+ with codecs.open(source, 'r', config.textfile_encoding) as f:
+ yield from _yield_titles(f, site)
+ # Else, fetch page (page should return text in same format as that expected
+ # in filename, i.e. pages separated by newlines or pages enclosed in double
+ # brackets).
+ else:
+ with io.StringIO(http.fetch(source).text) as f:
+ yield from _yield_titles(f, site)


def PagesFromTitlesGenerator(iterable, site=None):
@@ -2966,6 +3004,8 @@
PreloadingItemGenerator = redirect_func(PreloadingEntityGenerator,
old_name='PreloadingItemGenerator',
since='20170314')
+TextfilePageGenerator = redirect_func(
+ TextIOPageGenerator, old_name='TextfilePageGenerator', since='20210611')

if __name__ == '__main__': # pragma: no cover
pywikibot.output('Pagegenerators cannot be run as script - are you '
diff --git a/scripts/interwiki.py b/scripts/interwiki.py
index b32a085..3579259 100755
--- a/scripts/interwiki.py
+++ b/scripts/interwiki.py
@@ -512,7 +512,7 @@
if value.isdigit():
self.needlimit = int(value)
elif arg == 'skipfile':
- skip_page_gen = pagegenerators.TextfilePageGenerator(value)
+ skip_page_gen = pagegenerators.TextIOPageGenerator(value)
self.skip.update(skip_page_gen)
del skip_page_gen
elif arg == 'neverlink':
@@ -521,7 +521,7 @@
self.ignore += [pywikibot.Page(pywikibot.Site(), p)
for p in value.split(',')]
elif arg == 'ignorefile':
- ignore_page_gen = pagegenerators.TextfilePageGenerator(value)
+ ignore_page_gen = pagegenerators.TextIOPageGenerator(value)
self.ignore.update(ignore_page_gen)
del ignore_page_gen
elif arg == 'showpage':
@@ -2298,7 +2298,7 @@
continue

pywikibot.output('Retrieving pages from dump file ' + tail)
- for page in pagegenerators.TextfilePageGenerator(filename, site):
+ for page in pagegenerators.TextIOPageGenerator(filename, site):
if site == self.site:
self._next_page = page.title(with_ns=False) + '!'
self._next_namespace = page.namespace()
diff --git a/scripts/movepages.py b/scripts/movepages.py
index cefed76..eadd26a 100755
--- a/scripts/movepages.py
+++ b/scripts/movepages.py
@@ -197,7 +197,7 @@
else:
filename = arg[len('-pairsfile:'):]
oldName1 = None
- for page in pagegenerators.TextfilePageGenerator(filename):
+ for page in pagegenerators.TextIOPageGenerator(filename):
if oldName1:
fromToPairs.append([oldName1, page.title()])
oldName1 = None
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 120469c..f04f50d 100644
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -424,7 +424,7 @@
self.assertLength({item['revid'] for item in items}, self.length)


-class TestTextfilePageGenerator(DefaultSiteTestCase):
+class TestTextIOPageGenerator(DefaultSiteTestCase):

"""Test loading pages from a textfile."""

@@ -444,10 +444,10 @@
)

def test_brackets(self):
- """Test TextfilePageGenerator with brackets."""
+ """Test TextIOPageGenerator with brackets."""
filename = join_data_path('pagelist-brackets.txt')
site = self.get_site()
- titles = list(pagegenerators.TextfilePageGenerator(filename, site))
+ titles = list(pagegenerators.TextIOPageGenerator(filename, site))
self.assertLength(titles, self.expected_titles)
expected_titles = [
expected_title[self.title_columns[site.namespaces[page.namespace()]
@@ -456,10 +456,10 @@
self.assertPageTitlesEqual(titles, expected_titles)

def test_lines(self):
- """Test TextfilePageGenerator with newlines."""
+ """Test TextIOPageGenerator with newlines."""
filename = join_data_path('pagelist-lines.txt')
site = self.get_site()
- titles = list(pagegenerators.TextfilePageGenerator(filename, site))
+ titles = list(pagegenerators.TextIOPageGenerator(filename, site))
self.assertLength(titles, self.expected_titles)
expected_titles = [
expected_title[self.title_columns[site.namespaces[page.namespace()]
@@ -467,6 +467,19 @@
for expected_title, page in zip(self.expected_titles, titles)]
self.assertPageTitlesEqual(titles, expected_titles)

+ @unittest.mock.patch('pywikibot.comms.http.fetch', autospec=True)
+ def test_url(self, mock_fetch):
+ """Test TextIOPageGenerator with URL."""
+ # Mock return value of fetch()
+ fetch_return = unittest.mock.Mock()
+ fetch_return.text = '\n'.join(
+ [title[0] for title in self.expected_titles])
+ mock_fetch.return_value = fetch_return
+ site = self.get_site()
+ titles = list(
+ pagegenerators.TextIOPageGenerator('http://www.someurl.org', site))
+ self.assertLength(titles, self.expected_titles)
+

class TestYearPageGenerator(DefaultSiteTestCase):


To view, visit change 698655. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I08150994fb14f44afdc79bab086d13b2d2a74fc2
Gerrit-Change-Number: 698655
Gerrit-PatchSet: 5
Gerrit-Owner: Chris Maynor <cmchrismaynor@gmail.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged