Revision: 3964
Author:   siebrand
Date:     2007-08-05 14:14:48 +0000 (Sun, 05 Aug 2007)

Log Message:
-----------
* Fixed LinksearchPageGenerator (submitted by Filnik)
* Added -linksearch in replace.py (submitted by Filnik)
* Removed EOL whitespace in pagegenerators.py

Modified Paths:
--------------
    trunk/pywikipedia/pagegenerators.py
    trunk/pywikipedia/replace.py
Modified: trunk/pywikipedia/pagegenerators.py
===================================================================
--- trunk/pywikipedia/pagegenerators.py	2007-08-04 11:42:25 UTC (rev 3963)
+++ trunk/pywikipedia/pagegenerators.py	2007-08-05 14:14:48 UTC (rev 3964)
@@ -30,7 +30,7 @@
     namespace = wikipedia.Page(wikipedia.getSite(), start).namespace()
     for page in wikipedia.getSite().allpages(start=start, namespace=namespace, includeredirects = includeredirects):
         yield page
-
+
 def PrefixingPageGenerator(prefix, namespace=None):
     for page in AllpagesPageGenerator(prefix, namespace):
         if page.titleWithoutNamespace().startswith(prefix):
@@ -43,7 +43,7 @@
     site = wikipedia.getSite()
     for page in site.newpages(number=number, get_redirect=get_redirect, repeat=repeat):
         yield page[0]
-
+
 def FileLinksGenerator(referredPage):
     for page in referredPage.getFileLinks():
         yield page
@@ -56,7 +56,7 @@
     if site is None:
         site = wikipedia.getSite()
     for page in site.unusedfiles(number=number, repeat=repeat):
-        yield wikipedia.ImagePage(page.site(), page.title())
+        yield wikipedia.ImagePage(page.site(), page.title())
 
 def WithoutInterwikiPageGenerator(number = 100, repeat = False, site = None):
     if site is None:
@@ -186,6 +186,7 @@
         site = wikipedia.getSite()
     elRX = re.compile('<a .* class="external ?" .*</a>.*<a .*>(.*)</a>') #TODO: de-uglify?
     offset = 0
+    pageyeldlist = list()
     found = step
     while found == step:
         found = 0
@@ -194,7 +195,12 @@
             data = site.getUrl(url)
             for elM in elRX.finditer(data):
                 found += 1
-                yield wikipedia.Page(site,elM.group(1))
+                pagenameofthelink = elM.group(1)
+                if pagenameofthelink in pageyeldlist:
+                    continue
+                else:
+                    pageyeldlist.append(pagenameofthelink)
+                    yield wikipedia.Page(site, pagenameofthelink)
         offset += step
 
 class GoogleSearchPageGenerator:
@@ -206,9 +212,9 @@
     '''
     def __init__(self, query = None):
         self.query = query or wikipedia.input(u'Please enter the search query:')
-
+
     #########
-    # partially commented out because it is probably not in compliance with Google's "Terms of
+    # partially commented out because it is probably not in compliance with Google's "Terms of
     # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
     def queryGoogle(self, query):
         #if config.google_key:
@@ -230,7 +236,7 @@
         estimatedTotalResultsCount = None
         while not estimatedTotalResultsCount or offset < estimatedTotalResultsCount:
             while (True):
-                # Google often yields 502 errors.
+                # Google often yields 502 errors.
                 try:
                     wikipedia.output(u'Querying Google, offset %i' % offset)
                     data = google.doGoogleSearch(query, start = offset, filter = False)
@@ -255,7 +261,7 @@
             offset += 10
 
     #########
-    # commented out because it is probably not in compliance with Google's "Terms of
+    # commented out because it is probably not in compliance with Google's "Terms of
    # service" (see 5.3, http://www.google.com/accounts/TOS?loc=US)
 
     #def queryViaWeb(self, query):
@@ -460,7 +466,7 @@
     if pageNumber < 2:
         raise ValueError("PreloadingGenerator needs to load more than 1 page.")
     pagequeue = Queue.Queue(min(pageNumber//2, 10))
-    preloader = _Preloader(pagequeue, generator, pageNumber)
+    preloader = _Preloader(pagequeue, generator, pageNumber)
     preloader.start()
     while True:
         # Queue.get() blocks the main thread. This means that the
@@ -476,7 +482,7 @@
             if p is None:
                 return
             yield p
-
+
 class GeneratorFactory:
     """
     This factory is responsible for processing command line arguments
@@ -593,4 +599,3 @@
             wikipedia.output(page.title(), toStdout = True)
     finally:
         wikipedia.stopme()
-
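The duplicate filter added above keeps every yielded title in a Python list, so each new hit costs a linear scan over all titles seen so far. Purely as a sketch of the same idea, the generator could track seen titles in a set, which gives constant-time membership tests. Everything below mirrors the hunk above, but the function signature, the step default, and the site.linksearch_address() call and its parameters are assumptions for illustration, not the committed code:

import re
import wikipedia

def LinksearchPageGenerator(link, step=500, site=None):
    # Sketch only: set-based variant of the duplicate filter above.
    if site is None:
        site = wikipedia.getSite()
    elRX = re.compile('<a .* class="external ?" .*</a>.*<a .*>(.*)</a>')
    seen = set()        # titles already yielded; set lookup is O(1)
    offset = 0
    found = step
    while found == step:    # keep paging while a full page of results comes back
        found = 0
        url = site.linksearch_address(link, limit=step, offset=offset)
        data = site.getUrl(url)
        for elM in elRX.finditer(data):
            found += 1
            title = elM.group(1)
            if title not in seen:
                seen.add(title)
                yield wikipedia.Page(site, title)
        offset += step

It would be used exactly like the list-based version, e.g. for page in LinksearchPageGenerator('http://www.example.org'): ...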
Modified: trunk/pywikipedia/replace.py
===================================================================
--- trunk/pywikipedia/replace.py	2007-08-04 11:42:25 UTC (rev 3963)
+++ trunk/pywikipedia/replace.py	2007-08-05 14:14:48 UTC (rev 3964)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 """
 This bot will make direct text replacements. It will retrieve information on
 which pages might need changes either from an XML dump or a text file, or only
@@ -19,6 +19,8 @@
                     parameter multiple times to edit multiple pages.
 -ref              - Work on all pages that link to a certain page.
                     Argument can also be given as "-ref:referredpagetitle".
+-linksearch       - Retrieve all the results using Special:Linksearch.
+                    Argument can also be given as "-linksearch:url".
 -filelinks        - Works on all pages that link to a certain image.
                     Argument can also be given as "-filelinks:ImageName".
 -links            - Work on all pages that are linked to from a certain page.
@@ -350,6 +352,9 @@
             summary_commandline = True
         elif arg.startswith('-allowoverlap'):
             allowoverlap = True
+        elif arg.startswith('-linksearch:'):
+            linkselected = (arg[12:])
+            gen = pagegenerators.LinksearchPageGenerator(linkselected)
         else:
            generator = genFactory.handleArg(arg)
             if generator:
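A side note on the new elif branch: arg[12:] works because the '-linksearch:' prefix is exactly 12 characters long. A tiny self-contained illustration, with a hypothetical argument value:

arg = '-linksearch:http://www.example.org'
prefix = '-linksearch:'
url = arg[len(prefix):]    # same result as arg[12:], since len(prefix) == 12
# url is now 'http://www.example.org'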
-----Original Message-----
From: pywikipedia-l-bounces@lists.wikimedia.org [mailto:pywikipedia-l-bounces@lists.wikimedia.org] On Behalf Of Daniel Herding
Sent: Monday, 6 August 2007 12:07
To: pywikipedia-l@lists.wikimedia.org
Subject: Re: [Pywikipedia-l] SVN: [3964] trunk/pywikipedia

On Sunday 05 August 2007 16:14:48 siebrand@svn.wikimedia.org wrote:
*Added -linksearch in replace.py (submitted by Filnik)
Siebrand: replace.py already had this functionality, with the parameter -weblink. Thus, I have reverted your change.
Andre: neither Misza nor I have worked on this. I just threw out one of the two linksearch_address() methods from family.py, but both my pagegenerators.LinksearchGenerator and your wikipedia.Site.linksearch() are still there. Could you try and fix the code duplication?
The following is the mailing list correspondence regarding this issue:
----
Subject: Re: [pyWikipediaBot-users] pywikipedia spamremove.py, NONE, 1.1 family.py, 1.256, 1.257 wikipedia.py, 1.958, 1.959
Date: Thursday, 19 July 2007
From: "Andre Engels" <andreengels@gm...>
To: "Daniel Herding" <DHerding@gm...>
2007/7/19, Daniel Herding <DHerding@gm...>:
Hi Andre,
I think you missed that we already had code for external link search (Special:Linksearch). It is in pagegenerators.py and was added by Misza13 in March. If you want to test it, run this:
python pagegenerators.py -weblink:http://www.ebay.com
(currently it won't work because of a method name conflict, see below.)
We now have some code duplication, for example we have two methods called linksearch_address() in family.py. I'm not sure which code is the better one.
Misza's method can browse through several pages if there is more than one page full of results, but your method has some wildcard handling.
Maybe you and Misza can clean this up together, you know best what the requirements are. Afterwards, I'd like to refactor spamremove.py a bit if you don't mind, it sounds quite useful.
Well, I would be willing to combine the two, but I'm going to be on holiday for 10 days from tomorrow, so I don't currently have the time. If you or Misza or anyone else wants to do it in the meantime, I'd be happy about that too. If not, send me a reminder when I'm back (basically, the first week of August).
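One possible shape for the cleanup discussed here, sketched under assumptions: keep the scraping and result paging in a single place, namely wikipedia.Site.linksearch(), and reduce the pagegenerators entry to a thin wrapper so only one copy of the logic survives. Site.linksearch() is mentioned in this thread as existing, but the sketch assumes it takes a URL and yields Page objects, which may not match the real interface:

import wikipedia

def LinksearchPageGenerator(link, site=None):
    # Rough sketch: delegate to the Site method so the paging and
    # screen-scraping live in exactly one place.
    # ASSUMPTION: site.linksearch(link) yields Page objects.
    if site is None:
        site = wikipedia.getSite()
    for page in site.linksearch(link):
        yield page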
Hi Daniel.
It must be my blue eyes and blond hair, but how exactly is it now possible to run replace.py on all pages containing a particular url as reported by Special:Linksearch? I only see a reference to spamremove.py in your reply, and there are currently no references to `linksearch` or `LinksearchGenerator` in replace.py other than at the top of the file (documentation)...
Cheers!
Siebrand
-----Original Message-----
From: pywikipedia-l-bounces@lists.wikimedia.org [mailto:pywikipedia-l-bounces@lists.wikimedia.org] On Behalf Of Daniel Herding
Sent: Monday, 6 August 2007 18:50
To: pywikipedia-l@lists.wikimedia.org
Subject: Re: [Pywikipedia-l] SVN: [3964] trunk/pywikipedia
Hi Siebrand,
You can run this:
python replace.py a b -weblink:http://www.example.org
The magic is in these lines:
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
genFactory = pagegenerators.GeneratorFactory()
[...]
generator = genFactory.handleArg(arg)
if generator:
    gen = generator
The newest version of replace.py has updated documentation.
Daniel
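To make the pattern Daniel quotes concrete, here is a stripped-down sketch of a bot script that delegates all page selection to GeneratorFactory, modeled on the replace.py structure above. The option handling and output are simplified for illustration, and the PreloadingGenerator call relies on its default batch size:

import wikipedia, pagegenerators

def main():
    gen = None
    genFactory = pagegenerators.GeneratorFactory()
    for arg in wikipedia.handleArgs():
        # The factory recognizes the shared page-selection options
        # (such as -weblink: and -ref:) and returns a generator, or None.
        generator = genFactory.handleArg(arg)
        if generator:
            gen = generator
        # (a real script would handle its own options here)
    if not gen:
        wikipedia.output(u'No page generator given; nothing to do.')
        return
    for page in pagegenerators.PreloadingGenerator(gen):
        wikipedia.output(page.title())

if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()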
Nifty! I guess I should up my code reading and understanding skills...
Thanks for the explanation.
Cheers!
Siebrand