craig552uk · August 29, 2015 14:03
diff --git a/kis_link_checker.py b/kis_link_checker.py
 # -*- coding: utf-8 -*-
 #
 # Author: Craig Russell <[email protected]>
 #
 # Find all KIS widgets in a site and check if they work
 # Requires Python 2.7

 import urllib2
 from urlparse import urljoin
 from urlparse import urlparse
 import re

 def _fetch_page(page_url):
    """Fetch ``page_url`` and return ``(status, content)``.

    Returns ``None`` when the request or the read fails for any reason,
    so the caller can simply skip the page.
    """
    # Best-effort crawl: any failure (HTTP error, malformed URL, network
    # problem) skips the page. Unlike a bare ``except:`` this still lets
    # KeyboardInterrupt / SystemExit stop the crawl.
    try:
        response = urllib2.urlopen(page_url)
    except Exception:
        return None

    try:
        return response.code, response.read()
    except Exception:
        return None
    finally:
        # Always release the connection, even if the read failed
        response.close()


 def _extract_hrefs(content):
    """Return the ``href="..."`` values that follow each ``<a `` in ``content``.

    Query strings and fragments are stripped from every value. Only
    double-quoted href attributes are recognised.
    """
    marker = 'href="'
    hrefs = []
    position = 0

    while True:
        link_index = content.find('<a ', position)
        if link_index < 0:
            break

        href_start = content.find(marker, link_index)
        if href_start < 0:
            # No href after this anchor. Stop here: carrying on with a
            # bogus offset would move the scan backwards and never finish.
            break
        href_start += len(marker)

        href_end = content.find('"', href_start)
        if href_end < 0:
            # Unterminated attribute value: nothing usable is left
            break

        # Strip query string, then fragment
        href = content[href_start:href_end]
        href = href.split('?', 1)[0].split('#', 1)[0]
        hrefs.append(href)

        # Set start point for next iteration
        position = href_end

    return hrefs


 def _find_kis_url(content):
    """Return the first KIS widget URL in ``content``, or ``None`` if absent.

    The URL runs from ``http://widget.unistats.ac.uk`` up to the next
    double quote (or to the end of ``content`` if there is none).
    """
    kis_start = content.find('http://widget.unistats.ac.uk')
    if kis_start < 0:
        return None

    kis_end = content.find('"', kis_start)
    if kis_end < 0:
        kis_end = len(content)
    return content[kis_start:kis_end]


 def _kis_status(kis_url):
    """Return the HTTP status code for ``kis_url``, or ``"ERROR"`` on failure."""
    try:
        response = urllib2.urlopen(kis_url)
    except Exception:
        return "ERROR"

    try:
        return response.code
    finally:
        response.close()


 def check_links(root_page):
    """Crawl the pages under ``root_page`` and report on their KIS widgets.

    Starting at ``root_page``, every page is fetched at most once. Links
    are followed only when their href is an absolute URL beginning with
    ``root_page``. For each page that contains a KIS widget URL (one per
    page is assumed) a line ``page_url,kis_url,status`` is printed, where
    ``status`` is the widget's HTTP status code or ``ERROR``.
    """
    # Stack of pages still to be crawled
    pages_to_crawl = [root_page]

    # Pages which have already been attempted (set for O(1) lookups)
    pages_crawled = set()

    while pages_to_crawl:
        page_url = pages_to_crawl.pop()
        if page_url in pages_crawled:
            continue

        # Record the attempt before fetching so a failing page is not
        # requested again every time another page links to it.
        pages_crawled.add(page_url)

        fetched = _fetch_page(page_url)
        if fetched is None:
            continue

        status, content = fetched
        if status != 200:
            continue

        # Add sub-directory URLs to crawl list
        for href in _extract_hrefs(content):
            if href.startswith(root_page) and href not in pages_crawled:
                pages_to_crawl.append(href)

        # Look for KIS widget (assume one per page)
        kis_url = _find_kis_url(content)
        if kis_url is not None:
            # Parenthesised single argument prints identically on Python 2
            print("%s,%s,%s" % (page_url, kis_url, _kis_status(kis_url)))


 if __name__ == "__main__":
    # Crawl the site given as the first command-line argument; fall back to
    # the original example site when no argument is supplied.
    import sys

    check_links(sys.argv[1] if len(sys.argv) > 1 else 'http://www.example.ac.uk/')
	# -*- coding: utf-8 -*-
	#
	# Author: Craig Russell <[email protected]>
	#
	# Find all KIS widgets in a site and check if they work
	# Requires Python 2.7

	import urllib2
	from urlparse import urljoin
	from urlparse import urlparse
	import re

	def _fetch_page(page_url):
	    """Fetch ``page_url`` and return ``(status, content)``.

	    Returns ``None`` when the request or the read fails for any reason,
	    so the caller can simply skip the page.
	    """
	    # Best-effort crawl: any failure (HTTP error, malformed URL, network
	    # problem) skips the page. Unlike a bare ``except:`` this still lets
	    # KeyboardInterrupt / SystemExit stop the crawl.
	    try:
	        response = urllib2.urlopen(page_url)
	    except Exception:
	        return None

	    try:
	        return response.code, response.read()
	    except Exception:
	        return None
	    finally:
	        # Always release the connection, even if the read failed
	        response.close()


	def _extract_hrefs(content):
	    """Return the ``href="..."`` values that follow each ``<a `` in ``content``.

	    Query strings and fragments are stripped from every value. Only
	    double-quoted href attributes are recognised.
	    """
	    marker = 'href="'
	    hrefs = []
	    position = 0

	    while True:
	        link_index = content.find('<a ', position)
	        if link_index < 0:
	            break

	        href_start = content.find(marker, link_index)
	        if href_start < 0:
	            # No href after this anchor. Stop here: carrying on with a
	            # bogus offset would move the scan backwards and never finish.
	            break
	        href_start += len(marker)

	        href_end = content.find('"', href_start)
	        if href_end < 0:
	            # Unterminated attribute value: nothing usable is left
	            break

	        # Strip query string, then fragment
	        href = content[href_start:href_end]
	        href = href.split('?', 1)[0].split('#', 1)[0]
	        hrefs.append(href)

	        # Set start point for next iteration
	        position = href_end

	    return hrefs


	def _find_kis_url(content):
	    """Return the first KIS widget URL in ``content``, or ``None`` if absent.

	    The URL runs from ``http://widget.unistats.ac.uk`` up to the next
	    double quote (or to the end of ``content`` if there is none).
	    """
	    kis_start = content.find('http://widget.unistats.ac.uk')
	    if kis_start < 0:
	        return None

	    kis_end = content.find('"', kis_start)
	    if kis_end < 0:
	        kis_end = len(content)
	    return content[kis_start:kis_end]


	def _kis_status(kis_url):
	    """Return the HTTP status code for ``kis_url``, or ``"ERROR"`` on failure."""
	    try:
	        response = urllib2.urlopen(kis_url)
	    except Exception:
	        return "ERROR"

	    try:
	        return response.code
	    finally:
	        response.close()


	def check_links(root_page):
	    """Crawl the pages under ``root_page`` and report on their KIS widgets.

	    Starting at ``root_page``, every page is fetched at most once. Links
	    are followed only when their href is an absolute URL beginning with
	    ``root_page``. For each page that contains a KIS widget URL (one per
	    page is assumed) a line ``page_url,kis_url,status`` is printed, where
	    ``status`` is the widget's HTTP status code or ``ERROR``.
	    """
	    # Stack of pages still to be crawled
	    pages_to_crawl = [root_page]

	    # Pages which have already been attempted (set for O(1) lookups)
	    pages_crawled = set()

	    while pages_to_crawl:
	        page_url = pages_to_crawl.pop()
	        if page_url in pages_crawled:
	            continue

	        # Record the attempt before fetching so a failing page is not
	        # requested again every time another page links to it.
	        pages_crawled.add(page_url)

	        fetched = _fetch_page(page_url)
	        if fetched is None:
	            continue

	        status, content = fetched
	        if status != 200:
	            continue

	        # Add sub-directory URLs to crawl list
	        for href in _extract_hrefs(content):
	            if href.startswith(root_page) and href not in pages_crawled:
	                pages_to_crawl.append(href)

	        # Look for KIS widget (assume one per page)
	        kis_url = _find_kis_url(content)
	        if kis_url is not None:
	            # Parenthesised single argument prints identically on Python 2
	            print("%s,%s,%s" % (page_url, kis_url, _kis_status(kis_url)))


	if __name__ == "__main__":
	    # Crawl the site given as the first command-line argument; fall back to
	    # the original example site when no argument is supplied.
	    import sys

	    check_links(sys.argv[1] if len(sys.argv) > 1 else 'http://www.example.ac.uk/')