jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/327753 )
Change subject: PetScan page generator
......................................................................
PetScan page generator
Bug: T60814
Change-Id: I23044630a34c7cc4645524b0291448a338e2f176
---
M pywikibot/pagegenerators.py
M tests/pagegenerators_tests.py
2 files changed, 120 insertions(+), 0 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index eafd14b..3db16a7 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -27,6 +27,7 @@
import codecs
import datetime
import itertools
+import json
import re
import sys
import time
@@ -46,12 +47,21 @@
intersect_generators,
IteratorNextMixin,
filter_unique,
+ PY2,
)
from pywikibot import date, config, i18n, xmlreader
+from pywikibot.comms import http
from pywikibot.exceptions import ArgumentDeprecationWarning, UnknownExtension
from pywikibot.logentries import LogEntryFactory
from pywikibot.proofreadpage import ProofreadPage
+
+if PY2:
+ from urllib import urlencode
+ import urlparse
+else:
+ import urllib.parse as urlparse
+ from urllib.parse import urlencode
if sys.version_info[0] > 2:
basestring = (str, )
@@ -2764,6 +2774,92 @@
yield pywikibot.ItemPage(repo, item['id'])
+class PetScanPageGenerator(object):
+ """Queries PetScan (
https://petscan.wmflabs.org/) to generate
pages."""
+
+ def __init__(self, categories, subset_combination=True, namespaces=None,
+ site=None, extra_options=None):
+ """
+ Constructor.
+
+ :param categories: List of categories to retrieve pages from
+ (as strings)
+ :param subset_combination: Combination mode.
+ If True, returns the intersection of the results of the categories,
+ else returns the union of the results of the categories
+ :param namespaces: List of namespaces to search in
+ (default is None, meaning all namespaces)
+ :param site: Site to operate on
+ (default is the default site from the user config)
+ :param extra_options: Dictionary of extra options to use (optional)
+ """
+ if site is None:
+ site = pywikibot.Site()
+
+ self.site = site
+ self.opts = self.buildQuery(categories, subset_combination,
+ namespaces, extra_options)
+
+ def buildQuery(self, categories, subset_combination, namespaces,
+ extra_options):
+ """
+ Get the querystring options to query PetScan.
+
+ :param categories: List of categories (as strings)
+ :param subset_combination: Combination mode.
+ If True, returns the intersection of the results of the categories,
+ else returns the union of the results of the categories
+ :param namespaces: List of namespaces to search in
+ :param extra_options: Dictionary of extra options to use
+ :return: Dictionary of querystring parameters to use in the query
+ """
+ extra_options = extra_options or {}
+
+ query = {
+ 'language': self.site.lang,
+ 'project': self.site.family,
+ 'combination': 'subset' if subset_combination else
'union',
+ 'categories': '\r\n'.join(categories),
+ 'format': 'json',
+ 'doit': ''
+ }
+
+ # test wikipedia
+ if self.site.code == 'test' and self.site.family == 'test':
+ query['language'] = 'test'
+ query['project'] = 'wikipedia'
+
+ if namespaces:
+ for namespace in namespaces:
+ query['ns[{0}]'.format(int(namespace))] = 1
+
+ query_final = query.copy()
+ query_final.update(extra_options)
+
+ return query_final
+
+ def query(self):
+ """Query PetScan."""
+ url = urlparse.urlunparse(('https', # scheme
+ 'petscan.wmflabs.org', # netloc
+ '', # path
+ '', # params
+ urlencode(self.opts), # query
+ '')) # fragment
+
+ req = http.fetch(url)
+ j = json.loads(req.content)
+ raw_pages = j['*'][0]['a']['*']
+ for raw_page in raw_pages:
+ yield raw_page
+
+ def __iter__(self):
+ for raw_page in self.query():
+ page = pywikibot.Page(self.site, raw_page['title'],
+ int(raw_page['namespace']))
+ yield page
+
+
# Deprecated old names available for compatibility with compat.
ImageGenerator = redirect_func(FileGenerator, old_name='ImageGenerator')
UnCategorizedTemplatesGenerator = redirect_func(
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index f7b7b31..7abf648 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -355,6 +355,30 @@
self.assertPagelistTitles(gen, titles=expect_3, site=site)
+class PetScanPageGeneratorTestCase(TestCase):
+
+ """Test PetScanPageGenerator."""
+
+ family = 'test'
+ code = 'test'
+
+ def test_petscan(self):
+ """Test PetScanPageGenerator."""
+ site = self.get_site()
+ gen = pagegenerators.PetScanPageGenerator(['Pywikibot Protect Test'],
True, None, site)
+ self.assertPagelistTitles(gen, titles=('User:Sn1per/ProtectTest1',
+ 'User:Sn1per/ProtectTest2'),
site=site)
+
+ gen = pagegenerators.PetScanPageGenerator(['Pywikibot Protect Test'],
False, None, site)
+ self.assertPagelistTitles(gen, titles=('User:Sn1per/ProtectTest1',
+ 'User:Sn1per/ProtectTest2'),
site=site)
+
+ gen = pagegenerators.PetScanPageGenerator(['Pywikibot PetScan Test',
+ 'Pywikibot Category That
Needs&ToBe!Encoded',
+ 'Test'], True, None, site)
+ self.assertPagelistTitles(gen, titles=('User:Sn1per/PetScanTest1',),
site=site)
+
+
class TestRepeatingGenerator(RecentChangesTestCase):
"""Test RepeatingGenerator."""
--
To view, visit https://gerrit.wikimedia.org/r/327753
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I23044630a34c7cc4645524b0291448a338e2f176
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Sn1per <geofbot(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Matěj Suchánek <matejsuchanek97(a)gmail.com>
Gerrit-Reviewer: Sn1per <geofbot(a)gmail.com>
Gerrit-Reviewer: jenkins-bot <>