jenkins-bot has submitted this change and it was merged.
Change subject: synchronize with compat
......................................................................
synchronize with compat
Change-Id: I3acc2d20025bbbf1c9959f8e02fb7f816807eecd
---
M pywikibot/pagegenerators.py
1 file changed, 75 insertions(+), 59 deletions(-)
Approvals:
Merlijn van Deen: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index bd24b0b..d749dbc 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
-"""This module offers a wide variety of page generators. A page generator is an
+"""
+This module offers a wide variety of page generators. A page generator is an
object that is iterable (see http://www.python.org/dev/peps/pep-0255/ ) and
that yields page objects on which other scripts can then work.
@@ -12,7 +13,7 @@
&params;
"""
#
-# (C) Pywikipedia bot team, 2008-2012
+# (C) Pywikipedia bot team, 2008-2013
#
# Distributed under the terms of the MIT license.
#
@@ -94,14 +95,6 @@
-newpages Work on the most recent new pages. If given as -newpages:x,
will work on the x newest pages.
--random Work on random pages returned by [[Special:Random]]
- Can also be given as "-random:n" where n is the number
- of pages to be returned, otherwise the default is 10 pages.
-
--randomredirect Work on random redirect pages returned by [[Special:RandomRedirect]].
- Can also be given as "-randomredirect:n" where n is the number
- of pages to be returned, else 10 pages are returned.
-
-recentchanges Work on the pages with the most recent changes. If
given as -recentchanges:x, will work on the x most recently
changed pages.
@@ -140,12 +133,22 @@
-usercontribs Work on all articles that were edited by a certain user :
Example : -usercontribs:DumZiBoT
+
-weblink Work on all articles that contain an external link to
a given URL; may be given as "-weblink:url"
-withoutinterwiki Work on all pages that don't have interlanguage links.
Argument can be given as "-withoutinterwiki:n" where
n is some number (??).
+
+-random Work on random pages returned by [[Special:Random]].
+ Can also be given as "-random:n" where n is the number
+ of pages to be returned, otherwise the default is 10 pages.
+
+-randomredirect Work on random redirect pages returned by
+ [[Special:RandomRedirect]]. Can also be given as
+ "-randomredirect:n" where n is the number of pages to be
+ returned, else 10 pages are returned.
-google Work on all pages that are found in a Google search.
You need a Google Web API license key. Note that Google
@@ -999,10 +1002,10 @@
class YahooSearchPageGenerator:
- '''
- To use this generator, install pYsearch
- '''
- def __init__(self, query=None, count=100, site=None): # values larger than 100 fail
+ """ To use this generator, install pYsearch """
+
+ # values larger than 100 fail
+ def __init__(self, query=None, count=100, site=None):
self.query = query or pywikibot.input(u'Please enter the search query:')
self.count = count
if site is None:
@@ -1012,7 +1015,6 @@
def queryYahoo(self, query):
from yahoo.search.web import WebSearch
srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
-
dom = srch.get_results()
results = srch.parse_results(dom)
for res in results:
@@ -1032,12 +1034,14 @@
class GoogleSearchPageGenerator:
- '''
+ """
To use this generator, you must install the pyGoogle module from
http://pygoogle.sf.net/ and get a Google Web API license key from
http://www.google.com/apis/index.html . The google_key must be set to your
license key in your configuration.
- '''
+
+ """
+
def __init__(self, query=None, site=None):
self.query = query or pywikibot.input(u'Please enter the search query:')
if site is None:
@@ -1045,8 +1049,9 @@
self.site = site
#########
- # partially commented out because it is probably not in compliance with Google's "Terms of
- # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
+ # partially commented out because it is probably not in compliance with
+ # Google's "Terms of service"
+ # (see 5.3, http://www.google.com/accounts/TOS?loc=US)
def queryGoogle(self, query):
#if config.google_key:
if True:
@@ -1065,22 +1070,24 @@
google.LICENSE_KEY = config.google_key
offset = 0
estimatedTotalResultsCount = None
- while not estimatedTotalResultsCount \
- or offset < estimatedTotalResultsCount:
- while (True):
+ while not estimatedTotalResultsCount or \
+ offset < estimatedTotalResultsCount:
+ while True:
# Google often yields 502 errors.
try:
pywikibot.output(u'Querying Google, offset %i' % offset)
- data = google.doGoogleSearch(query, start=offset, filter=False)
+ data = google.doGoogleSearch(query, start=offset,
+ filter=False)
break
except KeyboardInterrupt:
raise
except:
- # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
- # can happen here, depending on the module used. It's not easy
- # to catch this properly because pygoogle decides which one of
- # the soap modules to use.
- pywikibot.output(u"An error occured. Retrying in 10 seconds...")
+ # SOAPpy.Errors.HTTPError or SOAP.HTTPError
+ # (502 Bad Gateway) can happen here, depending on the module
+ # used. It's not easy to catch this properly because
+ # pygoogle decides which one of the soap modules to use.
+ pywikibot.output(u"An error occured. "
+ u"Retrying in 10 seconds...")
time.sleep(10)
continue
@@ -1089,40 +1096,48 @@
yield result.URL
# give an estimate of pages to work on, but only once.
if not estimatedTotalResultsCount:
- pywikibot.output(u'Estimated total result count: %i pages.' % data.meta.estimatedTotalResultsCount)
+ pywikibot.output(u'Estimated total result count: %i pages.'
+ % data.meta.estimatedTotalResultsCount)
estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
#print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
offset += 10
- #########
- # commented out because it is probably not in compliance with Google's "Terms of
- # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
-
- #def queryViaWeb(self, query):
- #"""
- #Google has stopped giving out API license keys, and sooner or later
- #they will probably shut down the service.
- #This is a quick and ugly solution: we just grab the search results from
- #the normal web interface.
- #"""
- #linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE)
- #offset = 0
-
- #while True:
- #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
- #address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" % (urllib.quote_plus(query), offset)
- ## we fake being Firefox because Google blocks unknown browsers
- #request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1 Firefox/1.5'})
- #urlfile = urllib2.urlopen(request)
- #page = urlfile.read()
- #urlfile.close()
- #for url in linkR.findall(page):
- #yield url
- #if "<div id=nn>" in page: # Is there a "Next" link for next page of results?
- #offset += 100 # Yes, go to next page of results.
- #else:
- #return
- #########
+#############
+## commented out because it is probably not in compliance with Google's
+## "Terms of service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
+##
+## def queryViaWeb(self, query):
+## """
+## Google has stopped giving out API license keys, and sooner or later
+## they will probably shut down the service.
+## This is a quick and ugly solution: we just grab the search results from
+## the normal web interface.
+## """
+## linkR = re.compile(r'<a href="([^>"]+?)" class=l>', re.IGNORECASE)
+## offset = 0
+##
+## while True:
+## pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
+## address = "http://www.google.com/search?q=%s&num=100&hl=en&start=%d" \
+## % (urllib.quote_plus(query), offset)
+## # we fake being Firefox because Google blocks unknown browsers
+## request = urllib2.Request(
+## address, None,
+## {'User-Agent':
+## 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 '
+## 'SUSE/1.5-0.1 Firefox/1.5'})
+## urlfile = urllib2.urlopen(request)
+## page = urlfile.read()
+## urlfile.close()
+## for url in linkR.findall(page):
+## yield url
+##
+## # Is there a "Next" link for next page of results?
+## if "<div id=nn>" in page:
+## offset += 100 # Yes, go to next page of results.
+## else:
+## return
+#############
def __iter__(self):
# restrict query to local site
@@ -1133,7 +1148,8 @@
if url[:len(base)] == base:
title = url[len(base):]
page = pywikibot.Page(pywikibot.Link(title, self.site))
- # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar
+ # Google contains links in the format
+ # http://de.wikipedia.org/wiki/en:Foobar
if page.site == self.site:
yield page
--
To view, visit https://gerrit.wikimedia.org/r/89506
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I3acc2d20025bbbf1c9959f8e02fb7f816807eecd
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: Merlijn van Deen <valhallasw(a)arctus.nl>
Gerrit-Reviewer: jenkins-bot