jenkins-bot has submitted this change and it was merged.
Change subject: pep8 changes, synchronize with core
......................................................................
pep8 changes, synchronize with core
Change-Id: I274201bd4247ec56f6573cf3bc40d29ef72ac6e7
---
M pagegenerators.py
1 file changed, 232 insertions(+), 145 deletions(-)
Approvals:
Legoktm: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pagegenerators.py b/pagegenerators.py
index 10330b2..8decfb0 100644
--- a/pagegenerators.py
+++ b/pagegenerators.py
@@ -18,19 +18,24 @@
#
# Distributed under the terms of the MIT license.
#
-__version__='$Id$'
+__version__ = '$Id$'
import re
import sys
import codecs
+import date
import datetime
-import urllib, urllib2, time
+import time
+import urllib
+import urllib2
import traceback
import wikipedia as pywikibot
import config
from pywikibot import i18n
from pywikibot.support import deprecate_arg
-import date, catlib, userlib, query
+import catlib
+import userlib
+import query
parameterHelp = u"""\
-cat Work on all pages which are in a specific category.
@@ -162,10 +167,10 @@
-random Work on random pages returned by [[Special:Random]].
Can also be given as "-random:n" where n is the number
- of pages to be returned, else 10 pages are returned.
+ of pages to be returned, otherwise the default is 10 pages.
--randomredirect Work on random redirect target pages returned by
- [[Special:Randomredirect]]. Can also be given as
+-randomredirect Work on random redirect pages returned by
+ [[Special:RandomRedirect]]. Can also be given as
"-randomredirect:n" where n is the number of pages to be
returned, else 10 pages are returned.
@@ -204,9 +209,10 @@
# For python 2.4 compatibility
# see http://www.mail-archive.com/python-dev@python.org/msg12668.html
try:
- GeneratorExit
+ GeneratorExit
except NameError:
- class GeneratorExit(Exception): pass
+ class GeneratorExit(Exception):
+ pass
class GeneratorFactory(object):
@@ -223,7 +229,7 @@
return map(int, self.namespaces)
def getCombinedGenerator(self, gen=None):
- """Returns the combination of all accumulated generators,
+ """Return the combination of all accumulated generators,
that have been created in the process of handling arguments.
Only call this after all arguments have been parsed.
@@ -239,7 +245,8 @@
gensList = CombinedPageGenerator(self.gens)
genToReturn = DuplicateFilterPageGenerator(gensList, total=self.limit)
if (self.getNamespaces()):
- genToReturn = NamespaceFilterPageGenerator(genToReturn,
self.getNamespaces())
+ genToReturn = NamespaceFilterPageGenerator(genToReturn,
+ self.getNamespaces())
return genToReturn
def getCategoryGen(self, arg, length, recurse=False):
@@ -275,8 +282,7 @@
cat = catlib.Category(site,
"%s:%s" % (site.namespace(14), categoryname))
- return SubCategoriesPageGenerator(cat,
- start=startfrom, recurse=recurse)
+ return SubCategoriesPageGenerator(cat, start=startfrom, recurse=recurse)
def handleArg(self, arg):
"""Parse one argument at a time.
@@ -308,12 +314,12 @@
if len(arg) == 12:
gen = UnusedFilesGenerator()
else:
- gen = UnusedFilesGenerator(number = int(arg[13:]))
+ gen = UnusedFilesGenerator(number=int(arg[13:]))
elif arg.startswith('-unwatched'):
if len(arg) == 10:
gen = UnwatchedPagesPageGenerator()
else:
- gen = UnwatchedPagesPageGenerator(number = int(arg[11:]))
+ gen = UnwatchedPagesPageGenerator(number=int(arg[11:]))
elif arg.startswith('-usercontribs'):
args = arg[14:].split(';')
number = None
@@ -321,33 +327,34 @@
number = int(args[1])
except:
number = 250
- gen = UserContributionsGenerator(args[0], number,
namespaces=self.getNamespaces)
+ gen = UserContributionsGenerator(args[0], number,
+ namespaces=self.getNamespaces)
elif arg.startswith('-withoutinterwiki'):
if len(arg) == 17:
gen = WithoutInterwikiPageGenerator()
else:
- gen = WithoutInterwikiPageGenerator(number = int(arg[18:]))
+ gen = WithoutInterwikiPageGenerator(number=int(arg[18:]))
elif arg.startswith('-interwiki'):
title = arg[11:]
if not title:
title = i18n.input('pywikibot-enter-page-processing')
page = pywikibot.Page(site, title)
gen = InterwikiPageGenerator(page)
- elif arg.startswith('-randomredirect'):
- if len(arg) == 15:
- gen = RandomRedirectPageGenerator()
- else:
- gen = RandomRedirectPageGenerator(number=int(arg[16:]))
elif arg.startswith('-random'):
if len(arg) == 7:
gen = RandomPageGenerator()
else:
gen = RandomPageGenerator(number=int(arg[8:]))
- elif arg.startswith('-recentchanges'):
- if len(arg) == 14:
- gen = RecentchangesPageGenerator()
+ elif arg.startswith('-randomredirect'):
+ if len(arg) == 15:
+ gen = RandomRedirectPageGenerator()
else:
+ gen = RandomRedirectPageGenerator(number=int(arg[16:]))
+ elif arg.startswith('-recentchanges'):
+ if len(arg) >= 15:
gen = RecentchangesPageGenerator(number=int(arg[15:]))
+ else:
+ gen = RecentchangesPageGenerator()
gen = DuplicateFilterPageGenerator(gen)
elif arg.startswith('-file'):
textfilename = arg[6:]
@@ -376,13 +383,13 @@
self.limit = int(arg[len('-limit:'):])
return True
elif arg.startswith('-catr'):
- gen = self.getCategoryGen(arg, len('-catr'), recurse = True)
+ gen = self.getCategoryGen(arg, len('-catr'), recurse=True)
elif arg.startswith('-category'):
gen = self.getCategoryGen(arg, len('-category'))
elif arg.startswith('-cat'):
gen = self.getCategoryGen(arg, len('-cat'))
elif arg.startswith('-subcatsr'):
- gen = self.setSubCategoriesGen(arg, 9, recurse = True)
+ gen = self.setSubCategoriesGen(arg, 9, recurse=True)
elif arg.startswith('-subcats'):
gen = self.setSubCategoriesGen(arg, 8)
elif arg.startswith('-page'):
@@ -426,16 +433,16 @@
transclusionPageTitle = pywikibot.input(
u'Pages that transclude which page should be processed?')
transclusionPage = pywikibot.Page(site,
- "%s:%s" % (site.namespace(10),
- transclusionPageTitle))
+ "%s:%s" % (site.namespace(10),
+ transclusionPageTitle))
gen = ReferringPageGenerator(transclusionPage,
onlyTemplateInclusion=True)
elif arg.startswith('-gorandom'):
- for firstPage in RandomPageGenerator(number = 1):
+ for firstPage in RandomPageGenerator(number=1):
firstPageTitle = firstPage.title()
namespace = pywikibot.Page(site, firstPageTitle).namespace()
- firstPageTitle = pywikibot.Page(site,
- firstPageTitle).title(withNamespace=False)
+ firstPageTitle = pywikibot.Page(site, firstPageTitle
+ ).title(withNamespace=False)
gen = AllpagesPageGenerator(firstPageTitle, namespace,
includeredirects=False)
elif arg.startswith('-start'):
@@ -448,8 +455,8 @@
else:
namespace = pywikibot.Page(site, firstPageTitle).namespace()
- firstPageTitle = pywikibot.Page(site,
- firstPageTitle).title(withNamespace=False)
+ firstPageTitle = pywikibot.Page(site, firstPageTitle
+ ).title(withNamespace=False)
gen = AllpagesPageGenerator(firstPageTitle, namespace,
includeredirects=False)
elif arg.startswith('-redirectonly'):
@@ -458,8 +465,8 @@
firstPageTitle = pywikibot.input(
u'At which page do you want to start?')
namespace = pywikibot.Page(site, firstPageTitle).namespace()
- firstPageTitle = pywikibot.Page(site,
- firstPageTitle).title(withNamespace=False)
+ firstPageTitle = pywikibot.Page(site, firstPageTitle
+ ).title(withNamespace=False)
gen = AllpagesPageGenerator(firstPageTitle, namespace,
includeredirects='only')
elif arg.startswith('-prefixindex'):
@@ -468,16 +475,16 @@
if not prefix:
prefix = pywikibot.input(
u'What page names are you looking for?')
- gen = PrefixingPageGenerator(prefix = prefix)
+ gen = PrefixingPageGenerator(prefix=prefix)
elif arg.startswith('-newimages'):
limit = arg[11:] or pywikibot.input(
u'How many images do you want to load?')
- gen = NewimagesPageGenerator(number = int(limit))
+ gen = NewimagesPageGenerator(number=int(limit))
elif arg == ('-new') or arg.startswith('-new:'):
- if len(arg) >=5:
- gen = NewpagesPageGenerator(number = int(arg[5:]))
+ if len(arg) >= 5:
+ gen = NewpagesPageGenerator(number=int(arg[5:]))
else:
- gen = NewpagesPageGenerator(number = 60)
+ gen = NewpagesPageGenerator(number=60)
elif arg.startswith('-imagelinks'):
imagelinkstitle = arg[len('-imagelinks:'):]
if not imagelinkstitle:
@@ -490,7 +497,8 @@
if not mediawikiQuery:
mediawikiQuery = pywikibot.input(
u'What do you want to search for?')
- gen = SearchPageGenerator(mediawikiQuery, number=None,
namespaces=self.getNamespaces)
+ gen = SearchPageGenerator(mediawikiQuery, number=None,
+ namespaces=self.getNamespaces)
elif arg.startswith('-google'):
gen = GoogleSearchPageGenerator(arg[8:])
elif arg.startswith('-titleregex'):
@@ -503,7 +511,8 @@
gen = YahooSearchPageGenerator(arg[7:])
elif arg.startswith('-'):
mode, log, user = arg.partition('log')
- if log == 'log' and mode not in ['-', '-no']:
#exclude -log, -nolog
+ # exclude -log, -nolog
+ if log == 'log' and mode not in ['-', '-no']:
number = 500
if not user:
user = None
@@ -528,7 +537,7 @@
return False
-def AllpagesPageGenerator(start ='!', namespace=None, includeredirects=True,
+def AllpagesPageGenerator(start='!', namespace=None, includeredirects=True,
site=None):
"""
Iterate Page objects for all titles in a single namespace.
@@ -542,6 +551,7 @@
includeredirects=includeredirects):
yield page
+
def PrefixingPageGenerator(prefix, namespace=None, includeredirects=True,
site=None):
if site is None:
@@ -550,8 +560,10 @@
if namespace is None:
namespace = prefixpage.namespace()
title = prefixpage.title(withNamespace=False)
- for page in site.prefixindex(prefix=title, namespace=namespace,
includeredirects=includeredirects):
+ for page in site.prefixindex(prefix=title, namespace=namespace,
+ includeredirects=includeredirects):
yield page
+
def LogpagesPageGenerator(number=500, mode='', user=None, repeat=False,
site=None, namespace=[]):
@@ -561,7 +573,8 @@
repeat=repeat, namespace=namespace):
yield page[0]
-@deprecate_arg("get_redirect", None) #20120822
+
+@deprecate_arg("get_redirect", None) # 20120822
def NewpagesPageGenerator(number=100, repeat=False, site=None, namespace=0):
"""
Iterate Page objects for all new titles in a single namespace.
@@ -572,6 +585,7 @@
for item in site.newpages(number=number, repeat=repeat, namespace=namespace,
rcshow=['!redirect']):
yield item[0]
+
def RecentchangesPageGenerator(number=100, site=None):
"""Generate pages that are in the recent changes list.
@@ -584,25 +598,32 @@
for item in site.recentchanges(number=number):
yield item[0]
+
def FileLinksGenerator(referredImagePage):
for page in referredImagePage.usingPages():
yield page
+
def ImagesPageGenerator(pageWithImages):
- for imagePage in pageWithImages.imagelinks(followRedirects=False, loose=True):
+ for imagePage in pageWithImages.imagelinks(followRedirects=False,
+ loose=True):
yield imagePage
-def UnusedFilesGenerator(number = 100, repeat = False, site = None, extension = None):
+
+def UnusedFilesGenerator(number=100, repeat=False, site=None, extension=None):
if site is None:
site = pywikibot.getSite()
- for page in site.unusedfiles(number=number, repeat=repeat, extension=extension):
+ for page in site.unusedfiles(number=number, repeat=repeat,
+ extension=extension):
yield pywikibot.ImagePage(page.site(), page.title())
+
def InterwikiPageGenerator(page):
"""Iterator over all interwiki (non-language) links on a
page."""
yield page
for link in page.interwiki():
yield link
+
def ReferringPageGenerator(referredPage, followRedirects=False,
withTemplateInclusion=True,
@@ -612,6 +633,7 @@
withTemplateInclusion,
onlyTemplateInclusion):
yield page
+
def CategorizedPageGenerator(category, recurse=False, start=None):
"""Yield all pages in a specific category.
@@ -630,6 +652,7 @@
if start is None or a.title() >= start:
yield a
+
def SubCategoriesPageGenerator(category, recurse=False, start=None):
"""Yield all subcategories in a specific category.
@@ -646,16 +669,19 @@
for s in category.subcategories(recurse=recurse, startFrom=start):
yield s
+
def LinkedPageGenerator(linkingPage):
"""Yield all pages linked from a specific page."""
for page in linkingPage.linkedPages():
yield page
-def NewimagesPageGenerator(number = 100, repeat = False, site = None):
+
+def NewimagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.newimages(number, repeat=repeat):
yield page[0]
+
def TextfilePageGenerator(filename=None, site=None):
"""Iterate pages from a list in a text file.
@@ -674,7 +700,9 @@
if site is None:
site = pywikibot.getSite()
f = codecs.open(filename, 'r', config.textfile_encoding)
- R = re.compile(ur'\[\[(.+?)(?:\]\]|\|)') # title ends either before | or
before ]]
+
+ # title ends either before | or before ]]
+ R = re.compile(ur'\[\[(.+?)(?:\]\]|\|)')
pageTitle = None
for pageTitle in R.findall(f.read()):
# If the link is in interwiki format, the Page object may reside
@@ -693,11 +721,13 @@
yield pywikibot.Page(site, title)
f.close()
-def WithoutInterwikiPageGenerator(number = 100, repeat = False, site = None):
+
+def WithoutInterwikiPageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.withoutinterwiki(number=number, repeat=repeat):
yield page
+
def UnCategorizedCategoryGenerator(number=100, repeat=False, site=None):
if site is None:
@@ -705,11 +735,13 @@
for page in site.uncategorizedcategories(number=number, repeat=repeat):
yield page
+
def UnCategorizedImageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.uncategorizedimages(number=number, repeat=repeat):
yield page
+
def UnCategorizedPageGenerator(number=100, repeat=False, site=None):
if site is None:
@@ -717,11 +749,13 @@
for page in site.uncategorizedpages(number=number, repeat=repeat):
yield page
+
def UnCategorizedTemplatesGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.uncategorizedtemplates(number=number, repeat=repeat):
yield page
+
def LonelyPagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
@@ -729,11 +763,13 @@
for page in site.lonelypages(number=number, repeat=repeat):
yield page
+
def UnwatchedPagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.unwatchedpages(number=number, repeat=repeat):
yield page
+
def AncientPagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
@@ -741,36 +777,41 @@
for page in site.ancientpages(number=number, repeat=repeat):
yield page[0]
-def DeadendPagesPageGenerator(number = 100, repeat = False, site = None):
+
+def DeadendPagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.deadendpages(number=number, repeat=repeat):
yield page
-def LongPagesPageGenerator(number = 100, repeat = False, site = None):
+
+def LongPagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.longpages(number=number, repeat=repeat):
yield page[0]
-def ShortPagesPageGenerator(number = 100, repeat = False, site = None):
+
+def ShortPagesPageGenerator(number=100, repeat=False, site=None):
if site is None:
site = pywikibot.getSite()
for page in site.shortpages(number=number, repeat=repeat):
yield page[0]
-def RandomPageGenerator(number = 10, site = None):
+def RandomPageGenerator(number=10, site=None):
if site is None:
site = pywikibot.getSite()
for i in xrange(number):
yield site.randompage()
-def RandomRedirectPageGenerator(number = 10, site = None):
+
+def RandomRedirectPageGenerator(number=10, site=None):
if site is None:
site = pywikibot.getSite()
for i in xrange(number):
yield site.randomredirectpage()
+
def PagesFromTitlesGenerator(iterable, site=None):
"""Generate pages from the titles (unicode strings) yielded by
iterable."""
@@ -781,6 +822,7 @@
break
yield pywikibot.Page(site, title)
+
def LinksearchPageGenerator(link, step=500, site=None):
"""Yields all pages that include a specified link, according to
[[Special:Linksearch]].
@@ -790,7 +832,8 @@
for page in site.linksearch(link, limit=step):
yield page
-def UserContributionsGenerator(username, number = 250, namespaces = [], site = None ):
+
+def UserContributionsGenerator(username, number=250, namespaces=[], site=None):
"""
Yields number unique pages edited by user:username
namespaces : List of namespace numbers to fetch contribs from. Also accepted
@@ -805,7 +848,8 @@
for page in user.contributions(number, namespaces):
yield page[0]
-def SearchPageGenerator(query, number = 100, namespaces = None, site = None):
+
+def SearchPageGenerator(query, number=100, namespaces=None, site=None):
"""
Provides a list of results using the internal MediaWiki search engine.
@@ -817,14 +861,15 @@
site = pywikibot.getSite()
if callable(namespaces):
namespaces = namespaces()
- for page in site.search(query, number=number, namespaces = namespaces):
+ for page in site.search(query, number=number, namespaces=namespaces):
yield page[0]
+
class YahooSearchPageGenerator:
- '''
- To use this generator, install pYsearch
- '''
- def __init__(self, query = None, count = 100, site = None): # values larger than 100
fail
+ """ To use this generator, install pYsearch """
+
+ # values larger than 100 fail
+ def __init__(self, query=None, count=100, site=None):
self.query = query or pywikibot.input(u'Please enter the search query:')
self.count = count
if site is None:
@@ -832,41 +877,45 @@
self.site = site
def queryYahoo(self, query):
- from yahoo.search.web import WebSearch
- srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
-
- dom = srch.get_results()
- results = srch.parse_results(dom)
- for res in results:
- url = res.Url
- yield url
+ from yahoo.search.web import WebSearch
+ srch = WebSearch(config.yahoo_appid, query=query, results=self.count)
+ dom = srch.get_results()
+ results = srch.parse_results(dom)
+ for res in results:
+ url = res.Url
+ yield url
def __iter__(self):
# restrict query to local site
localQuery = '%s site:%s' % (self.query, self.site.hostname())
- base = 'http://%s%s' % (self.site.hostname(),
self.site.nice_get_address(''))
+ base = 'http://%s%s' % (self.site.hostname(),
+ self.site.nice_get_address(''))
for url in self.queryYahoo(localQuery):
if url[:len(base)] == base:
title = url[len(base):]
page = pywikibot.Page(self.site, title)
yield page
+
class GoogleSearchPageGenerator:
- '''
+ """
To use this generator, you must install the pyGoogle module from
http://pygoogle.sf.net/ and get a Google Web API license key from
http://www.google.com/apis/index.html . The google_key must be set to your
license key in your configuration.
- '''
- def __init__(self, query = None, site = None):
+
+ """
+
+ def __init__(self, query=None, site=None):
self.query = query or pywikibot.input(u'Please enter the search query:')
if site is None:
site = pywikibot.getSite()
self.site = site
#########
- # partially commented out because it is probably not in compliance with Google's "Terms of
- # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
+ # partially commented out because it is probably not in compliance with
+ # Google's "Terms of service"
+ # (see 5.3, http://www.google.com/accounts/TOS?loc=US)
def queryGoogle(self, query):
#if config.google_key:
if True:
@@ -887,15 +936,17 @@
url =
u'http://ajax.googleapis.com/ajax/services/search/web?'
params = {
'key': config.google_key,
- 'v':'1.0',
+ 'v': '1.0',
'q': query,
}
url += urllib.urlencode(params)
while True:
try:
- pywikibot.output(u'Querying Google AJAX Search API...') #, offset
%i' % offset)
- result = json.loads(self.site.getUrl(url, refer =
config.google_api_refer, no_hostname=True))
+ pywikibot.output(u'Querying Google AJAX Search API...')
+ result = json.loads(
+ self.site.getUrl(url, refer=config.google_api_refer,
+ no_hostname=True))
for res in result['responseData']['results']:
yield res['url']
except:
@@ -908,22 +959,24 @@
google.LICENSE_KEY = config.google_key
offset = 0
estimatedTotalResultsCount = None
- while not estimatedTotalResultsCount \
- or offset < estimatedTotalResultsCount:
- while (True):
+ while not estimatedTotalResultsCount or \
+ offset < estimatedTotalResultsCount:
+ while True:
# Google often yields 502 errors.
try:
pywikibot.output(u'Querying Google, offset %i' % offset)
- data = google.doGoogleSearch(query, start = offset, filter = False)
+ data = google.doGoogleSearch(query, start=offset,
+ filter=False)
break
except KeyboardInterrupt:
raise
except:
- # SOAPpy.Errors.HTTPError or SOAP.HTTPError (502 Bad Gateway)
- # can happen here, depending on the module used. It's not easy
- # to catch this properly because pygoogle decides which one of
- # the soap modules to use.
- pywikibot.output(u"An error occured. Retrying in 10
seconds...")
+ # SOAPpy.Errors.HTTPError or SOAP.HTTPError
+ # (502 Bad Gateway) can happen here, depending on the module
+ # used. It's not easy to catch this properly because
+ # pygoogle decides which one of the soap modules to use.
+ pywikibot.output(u"An error occured. "
+ u"Retrying in 10 seconds...")
time.sleep(10)
continue
@@ -932,40 +985,48 @@
yield result.URL
# give an estimate of pages to work on, but only once.
if not estimatedTotalResultsCount:
- pywikibot.output(u'Estimated total result count: %i pages.' %
data.meta.estimatedTotalResultsCount)
+ pywikibot.output(u'Estimated total result count: %i pages.'
+ % data.meta.estimatedTotalResultsCount)
estimatedTotalResultsCount = data.meta.estimatedTotalResultsCount
#print 'estimatedTotalResultsCount: ', estimatedTotalResultsCount
offset += 10
- #########
- # commented out because it is probably not in compliance with Google's "Terms of
- # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
-
- #def queryViaWeb(self, query):
- #"""
- #Google has stopped giving out API license keys, and sooner or later
- #they will probably shut down the service.
- #This is a quick and ugly solution: we just grab the search results from
- #the normal web interface.
- #"""
- #linkR = re.compile(r'<a href="([^>"]+?)"
class=l>', re.IGNORECASE)
- #offset = 0
-
- #while True:
- #pywikibot.output("Google: Querying page %d" % (offset / 100 + 1))
- #address =
"http://www.google.com/search?q=%s&num=100&hl=en&start=%d" %
(urllib.quote_plus(query), offset)
- ## we fake being Firefox because Google blocks unknown browsers
- #request = urllib2.Request(address, None, {'User-Agent':
'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128 SUSE/1.5-0.1
Firefox/1.5'})
- #urlfile = urllib2.urlopen(request)
- #page = urlfile.read()
- #urlfile.close()
- #for url in linkR.findall(page):
- #yield url
- #if "<div id=nn>" in page: # Is there a "Next" link
for next page of results?
- #offset += 100 # Yes, go to next page of results.
- #else:
- #return
- #########
+#############
+## commented out because it is probably not in compliance with Google's
+## "Terms of service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
+##
+## def queryViaWeb(self, query):
+## """
+## Google has stopped giving out API license keys, and sooner or later
+## they will probably shut down the service.
+## This is a quick and ugly solution: we just grab the search results from
+## the normal web interface.
+## """
+## linkR = re.compile(r'<a href="([^>"]+?)"
class=l>', re.IGNORECASE)
+## offset = 0
+##
+## while True:
+## pywikibot.output("Google: Querying page %d" % (offset / 100 +
1))
+## address =
"http://www.google.com/search?q=%s&num=100&hl=en&start=%d" \
+## % (urllib.quote_plus(query), offset)
+## # we fake being Firefox because Google blocks unknown browsers
+## request = urllib2.Request(
+## address, None,
+## {'User-Agent':
+## 'Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8) Gecko/20051128
'
+## 'SUSE/1.5-0.1 Firefox/1.5'})
+## urlfile = urllib2.urlopen(request)
+## page = urlfile.read()
+## urlfile.close()
+## for url in linkR.findall(page):
+## yield url
+##
+## # Is there a "Next" link for next page of results?
+## if "<div id=nn>" in page:
+## offset += 100 # Yes, go to next page of results.
+## else:
+## return
+#############
def __iter__(self):
# restrict query to local site
@@ -976,17 +1037,19 @@
if url[:len(base)] == base:
title = url[len(base):]
page = pywikibot.Page(self.site, title)
- # Google contains links in the format http://de.wikipedia.org/wiki/en:Foobar
- if page.site() == self.site:
+ # Google contains links in the format
+ # http://de.wikipedia.org/wiki/en:Foobar
+ if page.site == self.site:
yield page
-def MySQLPageGenerator(query, site = None):
+
+def MySQLPageGenerator(query, site=None):
import MySQLdb as mysqldb
if site is None:
site = pywikibot.getSite()
- conn = mysqldb.connect(config.db_hostname, db = site.dbName(),
- user = config.db_username,
- passwd = config.db_password)
+ conn = mysqldb.connect(config.db_hostname, db=site.dbName(),
+ user=config.db_username,
+ passwd=config.db_password)
cursor = conn.cursor()
pywikibot.output(u'Executing query:\n%s' % query)
query = query.encode(site.encoding())
@@ -1009,7 +1072,8 @@
page = pywikibot.Page(site, pageTitle)
yield page
-def YearPageGenerator(start = 1, end = 2050, site = None):
+
+def YearPageGenerator(start=1, end=2050, site=None):
if site is None:
site = pywikibot.getSite()
pywikibot.output(u"Starting with year %i" % start)
@@ -1018,10 +1082,11 @@
pywikibot.output(u'Preparing %i...' % i)
# There is no year 0
if i != 0:
- current_year = date.formatYear(site.lang, i )
+ current_year = date.formatYear(site.lang, i)
yield pywikibot.Page(site, current_year)
-def DayPageGenerator(startMonth = 1, endMonth = 12, site = None):
+
+def DayPageGenerator(startMonth=1, endMonth=12, site=None):
if site is None:
site = pywikibot.getSite()
fd = date.FormatDate(site)
@@ -1031,7 +1096,8 @@
for day in xrange(1, date.getNumberOfDaysInMonth(month)+1):
yield pywikibot.Page(site, fd(month, day))
-def NamespaceFilterPageGenerator(generator, namespaces, site = None):
+
+def NamespaceFilterPageGenerator(generator, namespaces, site=None):
"""
Wraps around another generator. Yields only those pages that are in one
of the given namespaces.
@@ -1056,6 +1122,7 @@
if page.namespace() in namespaces:
yield page
+
def PageTitleFilterPageGenerator(generator, ignoreList):
"""
Wraps around another generator. Yields only those pages are not
@@ -1067,7 +1134,8 @@
"""
def isIgnored(page):
- if not (page.site().family.name in ignoreList and page.site().lang in
ignoreList[page.site().family.name]):
+ if not (page.site().family.name in ignoreList and
+ page.site().lang in ignoreList[page.site().family.name]):
return False
for ig in ignoreList[page.site().family.name][page.site().lang]:
@@ -1082,13 +1150,17 @@
else:
yield page
+
def RedirectFilterPageGenerator(generator):
"""
- Wraps around another generator. Yields only those pages that are not redirects.
+ Wraps around another generator. Yields only those pages that are not
+ redirects.
+
"""
for page in generator:
if not page.isRedirectPage():
yield page
+
def DuplicateFilterPageGenerator(generator, total=None):
"""
@@ -1098,7 +1170,8 @@
seenPages = dict()
count = 0
for page in generator:
- _page = u"%s:%s:%s" % (page._site.family.name, page._site.lang,
page._title)
+ _page = u"%s:%s:%s" % (page._site.family.name, page._site.lang,
+ page._title)
if _page not in seenPages:
seenPages[_page] = True
if total:
@@ -1107,7 +1180,9 @@
break
yield page
-def RegexFilterPageGenerator(generator, regex, inverse=False, ignore_namespace=True):
+
+def RegexFilterPageGenerator(generator, regex, inverse=False,
+ ignore_namespace=True):
"""
Wraps around another generator. Yields only those pages, the titles of
which are positively matched to any regex in list. If invert is False,
@@ -1121,13 +1196,13 @@
regex = [regex]
# test if regex is already compiled
if isinstance(regex[0], basestring):
- reg = [ re.compile(r, re.I) for r in regex ]
+ reg = [re.compile(r, re.I) for r in regex]
else:
reg = regex
for page in generator:
# get the page title
- title = page.title(withNamespace = not ignore_namespace)
+ title = page.title(withNamespace=not ignore_namespace)
if inverse:
# yield page if NOT matched by all regex
@@ -1145,21 +1220,29 @@
yield page
break
-def EdittimeFilterPageGenerator(generator, begintime=datetime.datetime.min,
endtime=datetime.datetime.max):
+
+def EdittimeFilterPageGenerator(generator, begintime=datetime.datetime.min,
+ endtime=datetime.datetime.max):
"""
Wraps around another generator. Yields only those pages which were changed
between begintime and endtime.
@param generator: A generator object
- @param begintime: A datetime object. Only pages after this time will be returned.
- @param endtime: A datetime object Only pages before this time will be returned.
+ @param begintime: A datetime object. Only pages after this time will be
+ returned.
+ @param endtime: A datetime object Only pages before this time will be
+ returned.
+
"""
for page in generator:
- if page.editTime(datetime=True)==None:
+ if page.editTime(datetime=True) is None:
# FIXME: The page object should probably handle this
page.get()
- if page.editTime(datetime=True) and begintime < page.editTime(datetime=True)
and page.editTime(datetime=True) < endtime:
+ if page.editTime(datetime=True) and \
+ begintime < page.editTime(datetime=True) and \
+ page.editTime(datetime=True) < endtime:
yield page
+
def CombinedPageGenerator(generators):
"""
@@ -1171,6 +1254,7 @@
for page in generator:
yield page
+
def CategoryGenerator(generator):
"""
Wraps around another generator. Yields the same pages, but as Category
@@ -1180,6 +1264,7 @@
for page in generator:
yield catlib.Category(page.site(), page.title())
+
def ImageGenerator(generator):
"""
Wraps around another generator. Yields the same pages, but as Image
@@ -1188,6 +1273,7 @@
"""
for page in generator:
yield pywikibot.ImagePage(page.site(), page.title())
+
def PageWithTalkPageGenerator(generator):
"""
@@ -1209,6 +1295,7 @@
pages, etc. Thus, it is not necessary to load each page separately.
Operates asynchronously, so the next batch of pages is loaded in the
background before the first batch is fully consumed.
+
"""
@deprecate_arg("lookahead", None)
def __init__(self, generator, pageNumber=60):
@@ -1247,9 +1334,9 @@
# Query the sites one by one.
site = page_list[0].site()
pagesThisSite = [page for page in page_list
- if page.site() == site]
+ if page.site() == site]
page_list = [page for page in page_list
- if page.site() != site]
+ if page.site() != site]
pywikibot.getall(site, pagesThisSite)
for page in pagesThisSite:
yield page
@@ -1262,7 +1349,6 @@
self.preload(page_list, retry=True)
# Ignore this error, and get the pages the traditional way later.
pass
-
def main(*args):
@@ -1278,12 +1364,13 @@
i = 0
for page in gen:
i += 1
- pywikibot.output("%4d: %s" % (i, page.title()), toStdout =
True)
+ pywikibot.output("%4d: %s" % (i, page.title()),
+ toStdout=True)
else:
pywikibot.showHelp()
finally:
pywikibot.stopme()
-if __name__=="__main__":
+if __name__ == "__main__":
main()
--
To view, visit
https://gerrit.wikimedia.org/r/89505
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I274201bd4247ec56f6573cf3bc40d29ef72ac6e7
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/compat
Gerrit-Branch: master
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: Legoktm <legoktm.wikipedia(a)gmail.com>
Gerrit-Reviewer: jenkins-bot