jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/327753 )
Change subject: PetScan page generator
PetScan page generator
Bug: T60814 Change-Id: I23044630a34c7cc4645524b0291448a338e2f176 --- M pywikibot/pagegenerators.py M tests/pagegenerators_tests.py 2 files changed, 120 insertions(+), 0 deletions(-)
Approvals: John Vandenberg: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py index eafd14b..3db16a7 100644 --- a/pywikibot/pagegenerators.py +++ b/pywikibot/pagegenerators.py @@ -27,6 +27,7 @@ import codecs import datetime import itertools +import json import re import sys import time @@ -46,12 +47,21 @@ intersect_generators, IteratorNextMixin, filter_unique, + PY2, )
from pywikibot import date, config, i18n, xmlreader +from pywikibot.comms import http from pywikibot.exceptions import ArgumentDeprecationWarning, UnknownExtension from pywikibot.logentries import LogEntryFactory from pywikibot.proofreadpage import ProofreadPage + +if PY2: + from urllib import urlencode + import urlparse +else: + import urllib.parse as urlparse + from urllib.parse import urlencode
if sys.version_info[0] > 2: basestring = (str, ) @@ -2764,6 +2774,92 @@ yield pywikibot.ItemPage(repo, item['id'])
+class PetScanPageGenerator(object): + """Queries PetScan (https://petscan.wmflabs.org/) to generate pages.""" + + def __init__(self, categories, subset_combination=True, namespaces=None, + site=None, extra_options=None): + """ + Constructor. + + :param categories: List of categories to retrieve pages from + (as strings) + :param subset_combination: Combination mode. + If True, returns the intersection of the results of the categories, + else returns the union of the results of the categories + :param namespaces: List of namespaces to search in + (default is None, meaning all namespaces) + :param site: Site to operate on + (default is the default site from the user config) + :param extra_options: Dictionary of extra options to use (optional) + """ + if site is None: + site = pywikibot.Site() + + self.site = site + self.opts = self.buildQuery(categories, subset_combination, + namespaces, extra_options) + + def buildQuery(self, categories, subset_combination, namespaces, + extra_options): + """ + Get the querystring options to query PetScan. + + :param categories: List of categories (as strings) + :param subset_combination: Combination mode. 
+ If True, returns the intersection of the results of the categories, + else returns the union of the results of the categories + :param namespaces: List of namespaces to search in + :param extra_options: Dictionary of extra options to use + :return: Dictionary of querystring parameters to use in the query + """ + extra_options = extra_options or {} + + query = { + 'language': self.site.lang, + 'project': self.site.family, + 'combination': 'subset' if subset_combination else 'union', + 'categories': '\r\n'.join(categories), + 'format': 'json', + 'doit': '' + } + + # test wikipedia + if self.site.code == 'test' and self.site.family == 'test': + query['language'] = 'test' + query['project'] = 'wikipedia' + + if namespaces: + for namespace in namespaces: + query['ns[{0}]'.format(int(namespace))] = 1 + + query_final = query.copy() + query_final.update(extra_options) + + return query_final + + def query(self): + """Query PetScan.""" + url = urlparse.urlunparse(('https', # scheme + 'petscan.wmflabs.org', # netloc + '', # path + '', # params + urlencode(self.opts), # query + '')) # fragment + + req = http.fetch(url) + j = json.loads(req.content) + raw_pages = j['*'][0]['a']['*'] + for raw_page in raw_pages: + yield raw_page + + def __iter__(self): + for raw_page in self.query(): + page = pywikibot.Page(self.site, raw_page['title'], + int(raw_page['namespace'])) + yield page + + # Deprecated old names available for compatibility with compat. ImageGenerator = redirect_func(FileGenerator, old_name='ImageGenerator') UnCategorizedTemplatesGenerator = redirect_func( diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py index f7b7b31..7abf648 100755 --- a/tests/pagegenerators_tests.py +++ b/tests/pagegenerators_tests.py @@ -355,6 +355,30 @@ self.assertPagelistTitles(gen, titles=expect_3, site=site)
+class PetScanPageGeneratorTestCase(TestCase): + + """Test PetScanPageGenerator.""" + + family = 'test' + code = 'test' + + def test_petscan(self): + """Test PetScanPageGenerator.""" + site = self.get_site() + gen = pagegenerators.PetScanPageGenerator(['Pywikibot Protect Test'], True, None, site) + self.assertPagelistTitles(gen, titles=('User:Sn1per/ProtectTest1', + 'User:Sn1per/ProtectTest2'), site=site) + + gen = pagegenerators.PetScanPageGenerator(['Pywikibot Protect Test'], False, None, site) + self.assertPagelistTitles(gen, titles=('User:Sn1per/ProtectTest1', + 'User:Sn1per/ProtectTest2'), site=site) + + gen = pagegenerators.PetScanPageGenerator(['Pywikibot PetScan Test', + 'Pywikibot Category That Needs&ToBe!Encoded', + 'Test'], True, None, site) + self.assertPagelistTitles(gen, titles=('User:Sn1per/PetScanTest1',), site=site) + + class TestRepeatingGenerator(RecentChangesTestCase):
"""Test RepeatingGenerator."""