jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/698655 )
Change subject: pagegenerators: Add -url option ......................................................................
pagegenerators: Add -url option
Allow users to create page generators based on a URL pointing to a page containing page titles. Works much like the -file argument, but takes a URL instead of a local filename.
Bug: T239436 Change-Id: I08150994fb14f44afdc79bab086d13b2d2a74fc2 --- M pywikibot/pagegenerators.py M scripts/interwiki.py M scripts/movepages.py M tests/pagegenerators_tests.py 4 files changed, 90 insertions(+), 37 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index d72d391..2ecd9f5 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -20,6 +20,7 @@ import calendar import codecs import datetime +import io import itertools import json import re @@ -31,6 +32,7 @@ from http import HTTPStatus from itertools import zip_longest from typing import Optional, Union +from urllib.parse import urlparse
from requests.exceptions import ReadTimeout
@@ -314,6 +316,11 @@
-querypage shows special pages available.
+-url Read a list of pages to treat from the provided URL. + The URL must return text in the same format as expected for + the -file argument, e.g. page titles separated by newlines + or enclosed in brackets. +
FILTER OPTIONS ============== @@ -812,6 +819,12 @@
return self.site.querypage(value)
+ def _handle_url(self, value): + """Handle `-url` argument.""" + if not value: + value = pywikibot.input('Please enter the URL:') + return TextIOPageGenerator(value, site=self.site) + def _handle_unusedfiles(self, value): """Handle `-unusedfiles` argument.""" return self.site.unusedfiles(total=_int_none(value)) @@ -926,7 +939,7 @@ """Handle `-file` argument.""" if not value: value = pywikibot.input('Please enter the local file name:') - return TextfilePageGenerator(value, site=self.site) + return TextIOPageGenerator(value, site=self.site)
def _handle_namespaces(self, value): """Handle `-namespaces` argument.""" @@ -1532,43 +1545,68 @@ content=content) # pragma: no cover
-def TextfilePageGenerator(filename: Optional[str] = None, site=None): - """Iterate pages from a list in a text file. +def _yield_titles(f: Union[codecs.StreamReaderWriter, io.StringIO], + site: pywikibot.Site): + """Yield page titles from a text stream.
- The file must contain page links between double-square-brackets or, in - alternative, separated by newlines. The generator will yield each + :param f: text stream object + :type f: codecs.StreamReaderWriter, io.StringIO, or any other stream-like + object + :param site: Site for generator results. + :type site: :py:obj:`pywikibot.site.BaseSite` + :return: a generator that yields Page objects of pages with titles in text + stream + :rtype: generator + """ + linkmatch = None + for linkmatch in pywikibot.link_regex.finditer(f.read()): + # If the link is in interwiki format, the Page object may reside + # on a different Site than the default. + # This makes it possible to work on different wikis using a single + # text file, but also could be dangerous because you might + # inadvertently change pages on another wiki! + yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'), + site)) + if linkmatch is not None: + return + + f.seek(0) + for title in f: + title = title.strip() + if '|' in title: + title = title[:title.index('|')] + if title: + yield pywikibot.Page(site, title) + + +def TextIOPageGenerator(source: Optional[str] = None, + site: Optional[pywikibot.Site] = None): + """Iterate pages from a list in a text file or on a webpage. + + The text source must contain page links between double-square-brackets or, + alternatively, separated by newlines. The generator will yield each corresponding Page object.
- :param filename: the name of the file that should be read. If no name is + :param source: the file path or URL that should be read. If no name is given, the generator prompts the user. :param site: Site for generator results. :type site: :py:obj:`pywikibot.site.BaseSite`
""" - if filename is None: - filename = pywikibot.input('Please enter the filename:') + if source is None: + source = pywikibot.input('Please enter the filename / URL:') if site is None: site = pywikibot.Site() - with codecs.open(filename, 'r', config.textfile_encoding) as f: - linkmatch = None - for linkmatch in pywikibot.link_regex.finditer(f.read()): - # If the link is in interwiki format, the Page object may reside - # on a different Site than the default. - # This makes it possible to work on different wikis using a single - # text file, but also could be dangerous because you might - # inadvertently change pages on another wiki! - yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'), - site)) - if linkmatch is not None: - return - - f.seek(0) - for title in f: - title = title.strip() - if '|' in title: - title = title[:title.index('|')] - if title: - yield pywikibot.Page(site, title) + # If source cannot be parsed as an HTTP URL, treat as local file + if not urlparse(source).scheme: + with codecs.open(source, 'r', config.textfile_encoding) as f: + yield from _yield_titles(f, site) + # Else, fetch page (page should return text in same format as that expected + # in filename, i.e. pages separated by newlines or pages enclosed in double + # brackets + else: + with io.StringIO(http.fetch(source).text) as f: + yield from _yield_titles(f, site)
def PagesFromTitlesGenerator(iterable, site=None): @@ -2966,6 +3004,8 @@ PreloadingItemGenerator = redirect_func(PreloadingEntityGenerator, old_name='PreloadingItemGenerator', since='20170314') +TextfilePageGenerator = redirect_func( + TextIOPageGenerator, old_name='TextfilePageGenerator', since='20210611')
if __name__ == '__main__': # pragma: no cover pywikibot.output('Pagegenerators cannot be run as script - are you ' diff --git a/scripts/interwiki.py b/scripts/interwiki.py index b32a085..3579259 100755 --- a/scripts/interwiki.py +++ b/scripts/interwiki.py @@ -512,7 +512,7 @@ if value.isdigit(): self.needlimit = int(value) elif arg == 'skipfile': - skip_page_gen = pagegenerators.TextfilePageGenerator(value) + skip_page_gen = pagegenerators.TextIOPageGenerator(value) self.skip.update(skip_page_gen) del skip_page_gen elif arg == 'neverlink': @@ -521,7 +521,7 @@ self.ignore += [pywikibot.Page(pywikibot.Site(), p) for p in value.split(',')] elif arg == 'ignorefile': - ignore_page_gen = pagegenerators.TextfilePageGenerator(value) + ignore_page_gen = pagegenerators.TextIOPageGenerator(value) self.ignore.update(ignore_page_gen) del ignore_page_gen elif arg == 'showpage': @@ -2298,7 +2298,7 @@ continue
pywikibot.output('Retrieving pages from dump file ' + tail) - for page in pagegenerators.TextfilePageGenerator(filename, site): + for page in pagegenerators.TextIOPageGenerator(filename, site): if site == self.site: self._next_page = page.title(with_ns=False) + '!' self._next_namespace = page.namespace() diff --git a/scripts/movepages.py b/scripts/movepages.py index cefed76..eadd26a 100755 --- a/scripts/movepages.py +++ b/scripts/movepages.py @@ -197,7 +197,7 @@ else: filename = arg[len('-pairsfile:'):] oldName1 = None - for page in pagegenerators.TextfilePageGenerator(filename): + for page in pagegenerators.TextIOPageGenerator(filename): if oldName1: fromToPairs.append([oldName1, page.title()]) oldName1 = None diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index 120469c..f04f50d 100644 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -424,7 +424,7 @@ self.assertLength({item['revid'] for item in items}, self.length)
-class TestTextfilePageGenerator(DefaultSiteTestCase): +class TestTextIOPageGenerator(DefaultSiteTestCase):
"""Test loading pages from a textfile."""
@@ -444,10 +444,10 @@ )
def test_brackets(self): - """Test TextfilePageGenerator with brackets.""" + """Test TextIOPageGenerator with brackets.""" filename = join_data_path('pagelist-brackets.txt') site = self.get_site() - titles = list(pagegenerators.TextfilePageGenerator(filename, site)) + titles = list(pagegenerators.TextIOPageGenerator(filename, site)) self.assertLength(titles, self.expected_titles) expected_titles = [ expected_title[self.title_columns[site.namespaces[page.namespace()] @@ -456,10 +456,10 @@ self.assertPageTitlesEqual(titles, expected_titles)
def test_lines(self): - """Test TextfilePageGenerator with newlines.""" + """Test TextIOPageGenerator with newlines.""" filename = join_data_path('pagelist-lines.txt') site = self.get_site() - titles = list(pagegenerators.TextfilePageGenerator(filename, site)) + titles = list(pagegenerators.TextIOPageGenerator(filename, site)) self.assertLength(titles, self.expected_titles) expected_titles = [ expected_title[self.title_columns[site.namespaces[page.namespace()] @@ -467,6 +467,19 @@ for expected_title, page in zip(self.expected_titles, titles)] self.assertPageTitlesEqual(titles, expected_titles)
+ @unittest.mock.patch('pywikibot.comms.http.fetch', autospec=True) + def test_url(self, mock_fetch): + """Test TextIOPageGenerator with URL.""" + # Mock return value of fetch() + fetch_return = unittest.mock.Mock() + fetch_return.text = '\n'.join( + [title[0] for title in self.expected_titles]) + mock_fetch.return_value = fetch_return + site = self.get_site() + titles = list( + pagegenerators.TextIOPageGenerator('http://www.someurl.org', site)) + self.assertLength(titles, self.expected_titles) +
class TestYearPageGenerator(DefaultSiteTestCase):