That's right, my Web Spider Link Verification Utility for Stuff [WSLVUFS] is ready. In other words, I needed a simple project to get familiar with Python, so I wrote a tool that recursively traverses the links in a given web site and reports any dead links it finds.
Right now it just prints the dead links without any clue as to which page they were found on. If I start feeling ambitious, I might fix that.
Run with "-v" for verbose output.
Usage: python spider.py [-v]
from sys import argv
import urllib
# Root URL the crawl is restricted to (set from the command line below).
baseUrl = ""
# Links already seen, so the same URL is not crawled twice.
completed = []
# URLs that could not be fetched; reported at the end of the run.
deadLinks = []
# Set to 1 by the "-v" flag to enable progress output.
verbose = 0
def fetchPage(url):
    '''Fetch url and return a (source, ok) tuple.

    ok is 1 on success, 0 on failure.  A failed URL is appended to the
    module-level deadLinks list as a side effect.
    '''
    try:
        sock = urllib.urlopen(url)
        try:
            source = sock.read()
        finally:
            # Close the connection even if read() raises mid-transfer;
            # the original leaked the socket in that case.
            sock.close()
        return (source, 1)
    except IOError:
        deadLinks.append(url)
        return ("", 0)
def extractUrls(source):
    '''Return the list of href="..." targets found in an HTML string.

    Naive scrape: splits on the literal href=" marker and takes
    everything up to the next double quote.
    '''
    urls = []
    # Everything after the first split element follows an href=" marker.
    for chunk in source.split('href="')[1:]:
        end = chunk.find('"')
        # Skip fragments with no closing quote instead of silently
        # emitting the rest of the document minus its last character
        # (the old s[0:s.find('"')] did exactly that when find == -1).
        if end != -1:
            urls.append(chunk[:end])
    return urls
def runSpider(url, recursive=1):
''' Recursively process links found on pages '''
if verbose:
print "Connecting to", url
if url.endswith("/"):
url = url[0:len(url) - 1]
fetch = fetchPage(url)
source = fetch[0]
if verbose and fetch[1] == 0:
print "FAILED to Connect to", url
return
if not recursive:
return
urls = extractUrls(source)
next = []
for u in urls:
if not u in completed:
completed.append(u);
if not "://" in u:
if u.startswith("/"):
u = url + u
else:
u = url + "/" + u
# Only recurse spider on pages from the base url
if baseUrl in u:
runSpider(u)
else:
runSpider(u, 0)
argc = len(argv)
if argc < 2:
print "Usage: spider.py [-v] "
else:
# Start the spider at the base url
if argc == 2:
baseUrl = argv[1]
elif argc == 3:
baseUrl = argv[2]
if argv[1] == "-v":
verbose = 1
runSpider(baseUrl)
if len(deadLinks) != 0:
print "\nThe following links were not accessible:"
for dead in deadLinks:
print "--", dead
else:
print "\nAll links were accessible"