If anyone's interested, here's a little spider program in Python for exercising the Wikipedia test site. Tested in Python 2.2.1 for Win32, although it should run on other OSes just fine. A few instances of this running at once should give the test site a good stress test. Notice that it will go places where proper spiders dare not tread -- intentionally.

---------------------------------------------------------------------------------------------------------------------------------

# Stressbot: a stress tester for the Wikipedia test site
# * Makes GET requests aggressively at random
# * Ignores embedded robots metadata entirely
# * Won't do POSTs, so shouldn't corrupt the database
# * Won't stray outside the site
import urllib, re, string, random
def proc_href(str):
    str = re.sub(r'<[Aa][^>]*?[hH][rR][eE][fF] *= *"([^"]*)"[^>]*?>', r'\1', str)
    # handle entities buried in the HREF
    return string.replace(str, "&amp;", "&")
def get_hrefs(text):
    hrefs = re.findall(r'<[Aa][^>]*?[hH][rR][eE][fF] *= *"[^"]*"[^>]*?>', text)
    return map(proc_href, hrefs)
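# Example (not in the original post): given page text containing a tag like
#   <A class="internal" HREF="/wiki/Foo?action=edit&amp;section=1">
# get_hrefs() would return ['/wiki/Foo?action=edit&section=1'];
# proc_href() strips the surrounding tag and decodes the &amp; entity.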
home_url = "http://130.94.122.197/"
history = []
url = home_url
while 1:
    print 'opening', url
    text = urllib.urlopen(url).read()
    # Make a note in a limited-length history
    # This is to stop endless revisiting of the standard special pages
    history.append(url)
    history = history[-50:]
    # Parse out all the A HREFs
    url_list = get_hrefs(text)
    # Limit to the local site
    url_list = filter(lambda u: u[:len(home_url)] == home_url, url_list)
    # Don't revisit a page we have been to recently
    url_list = filter(lambda u: u not in history, url_list)
    print len(text), 'bytes', len(url_list), 'non-recent local A HREFs'
    # The home page is the last resort, and we should also force it occasionally
    # to get us out of pathological dead-end subspaces
    if not url_list or random.choice(range(10)) == 0:
        url_list.append(home_url)
    url = random.choice(url_list)
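One thing to watch (my addition, not part of the original script): urllib.urlopen() raises IOError if the connection is refused or drops, so an instance can die mid-run once the test server is under real load. A minimal sketch of a more forgiving fetch, assuming the same Python 2 setup as above:

import urllib

def fetch(url):
    # Return the page body, or an empty string if the connection fails,
    # so the main loop can fall back to the home page instead of dying.
    try:
        return urllib.urlopen(url).read()
    except IOError:
        return ""

With that in place, the loop could use text = fetch(url) and jump straight back to home_url whenever text comes back empty.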