If anyone's interested, here's a little spider program in Python for exercising the Wikipedia test site. Tested in Python 2.2.1 for Win32, although it should run on other OSes just fine. A few instances of this running at once should give the test site a good stress test. Notice that it will go places where proper spiders dare not tread -- intentionally.

---------------------------------------------------------------------------------------------------------------------------------

# Stressbot: a stress tester for the Wikipedia test site
# * Makes GET requests aggressively at random
# * Ignores embedded robots metadata entirely
# * Won't do POSTs, so shouldn't corrupt the database
# * Won't stray outside the site
import urllib, re, string, random
def proc_href(str):
    str = re.sub(r'<[Aa][^>]*?[hH][rR][eE][fF] *= *"([^"]*)"[^>]*?>', r'\1', str)
    # handle entities buried in the HREF
    return string.replace(str, "&amp;", "&")
def get_hrefs(text):
    hrefs = re.findall(r'<[Aa][^>]*?[hH][rR][eE][fF] *= *"[^"]*"[^>]*?>', text)
    return map(proc_href, hrefs)
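# Example (not in the original post): given page text containing a tag like
#   <A class="internal" HREF="/wiki/Foo?action=edit&amp;section=1">
# get_hrefs() would return ['/wiki/Foo?action=edit&section=1'];
# proc_href() strips the surrounding tag and decodes the &amp; entity.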
home_url = "http://130.94.122.197/"
history = []
url = home_url
while 1:
    print 'opening', url
    text = urllib.urlopen(url).read()
    # Make a note in a limited-length history
    # This is to stop endless revisiting of the standard special pages
    history.append(url)
    history = history[-50:]
    # Parse out all the A HREFs
    url_list = get_hrefs(text)
    # Limit to the local site
    url_list = filter(lambda u: u[:len(home_url)] == home_url, url_list)
    # Don't revisit a page we have been to recently
    url_list = filter(lambda u: u not in history, url_list)
    print len(text), 'bytes', len(url_list), 'non-recent local A HREFs'
    # The home page is the last resort, and we should also force it occasionally
    # to get us out of pathological dead-end subspaces
    if not url_list or random.choice(range(10)) == 0:
        url_list.append(home_url)
    url = random.choice(url_list)
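One thing to watch (my addition, not part of the original script): urllib.urlopen() raises IOError if the connection is refused or drops, so an instance can die mid-run once the test server is under real load. A minimal sketch of a more forgiving fetch, assuming the same Python 2 setup as above:

import urllib

def fetch(url):
    # Return the page body, or an empty string if the connection fails,
    # so the main loop can fall back to the home page instead of dying.
    try:
        return urllib.urlopen(url).read()
    except IOError:
        return ""

With that in place, the loop could use text = fetch(url) and jump straight back to home_url whenever text comes back empty.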