jenkins-bot has submitted this change and it was merged.
Change subject: WikidataPageFromItemGenerator and performance
......................................................................
WikidataPageFromItemGenerator and performance
A new WikidataPageFromItemGenerator provides a convenient way to translate a
batch of item pages (e.g. data items from the repository) to site pages.
WikidataQueryPageGenerator and WikidataSPARQLPageGenerator
request the sitelink of each page separately, which requires many web
requests and is very inefficient. Instead we use a batch query requesting
only the data we need (sitelinks).
Bug: T129556
Change-Id: I515c7135b7c2f8b9851c82a189abdd102dd562ee
---
M pywikibot/pagegenerators.py
1 file changed, 39 insertions(+), 22 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index 6a59a1d..58d67b4 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -40,6 +40,7 @@
deprecated_args,
redirect_func,
issue_deprecation_warning,
+ itergroup,
DequeGenerator,
intersect_generators,
IteratorNextMixin,
@@ -2593,6 +2594,31 @@
yield pywikibot.Page(pywikibot.Link(fd(month, day), site))
+def WikidataPageFromItemGenerator(gen, site):
+ """Generate pages from site based on sitelinks of item pages.
+
+ @param gen: generator of L{pywikibot.ItemPage}
+ @param site: Site for generator results.
+ @type site: L{pywikibot.site.BaseSite}
+
+ """
+ repo = site.data_repository()
+ for sublist in itergroup(gen, 50):
+ req = {'ids': [item.id for item in sublist],
+ 'sitefilter': site.dbName(),
+ 'action': 'wbgetentities',
+ 'props': 'sitelinks'}
+
+ wbrequest = repo._simple_request(**req)
+ wbdata = wbrequest.submit()
+ entities = (item for item in wbdata['entities'].values() if
+ 'sitelinks' in item and site.dbName() in
item['sitelinks'])
+ sitelinks = (item['sitelinks'][site.dbName()]['title']
+ for item in entities)
+ for sitelink in sitelinks:
+ yield pywikibot.Page(site, sitelink)
+
+
def WikidataQueryPageGenerator(query, site=None):
"""Generate pages that result from the given WikidataQuery.
@@ -2606,24 +2632,22 @@
if site is None:
site = pywikibot.Site()
repo = site.data_repository()
+ is_repo = isinstance(site, pywikibot.site.DataSite)
+ if not is_repo:
+ # limit the results to those with sitelinks to target site
+ query += ' link[%s]' % site.dbName()
wd_queryset = wdquery.QuerySet(query)
wd_query = wdquery.WikidataQuery(cacheMaxAge=0)
data = wd_query.query(wd_queryset)
-
pywikibot.output(u'retrieved %d items' %
data[u'status'][u'items'])
- for item in data[u'items']:
- page = pywikibot.ItemPage(repo, u'Q{0}'.format(item))
- if isinstance(site, pywikibot.site.DataSite):
- yield page
- continue
+ items_pages = (pywikibot.ItemPage(repo, 'Q{0}'.format(item))
+ for item in data[u'items'])
+ if is_repo:
+ return items_pages
- try:
- link = page.getSitelink(site)
- except pywikibot.NoPage:
- continue
- yield pywikibot.Page(pywikibot.Link(link, site))
+ return WikidataPageFromItemGenerator(items_pages, site)
def WikidataSPARQLPageGenerator(query, site=None, item_name='item',
endpoint=None):
@@ -2644,18 +2668,11 @@
query_object = sparql.SparqlQuery(endpoint=endpoint)
data = query_object.get_items(query, item_name=item_name)
+ items_pages = (pywikibot.ItemPage(repo, item) for item in data)
+ if isinstance(site, pywikibot.site.DataSite):
+ return items_pages
- for item in data:
- page = pywikibot.ItemPage(repo, item)
- if isinstance(site, pywikibot.site.DataSite):
- yield page
- continue
-
- try:
- link = page.getSitelink(site)
- except pywikibot.NoPage:
- continue
- yield pywikibot.Page(pywikibot.Link(link, site))
+ return WikidataPageFromItemGenerator(items_pages, site)
def WikibaseSearchItemPageGenerator(text, language=None, total=None, site=None):
--
To view, visit
https://gerrit.wikimedia.org/r/276562
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I515c7135b7c2f8b9851c82a189abdd102dd562ee
Gerrit-PatchSet: 5
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Eranroz <eranroz89(a)gmail.com>
Gerrit-Reviewer: Eranroz <eranroz89(a)gmail.com>
Gerrit-Reviewer: FelixReimann <felix(a)fex-it.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>