Last active
August 29, 2015 14:03
-
-
Save craig552uk/b354bd17a68501524f8f to your computer and use it in GitHub Desktop.
KIS Widget Checker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
#
# Author: Craig Russell <[email protected]>
#
# Find all KIS widgets in a site and check whether they work.
# Requires Python 2.7
import urllib2 | |
from urlparse import urljoin | |
from urlparse import urlparse | |
import re | |
# Prefix that identifies an embedded KIS widget URL in page markup.
_KIS_PREFIX = 'http://widget.unistats.ac.uk'

# Captures the double-quoted href value of an <a ...> tag.  The match cannot
# run past the tag's closing '>', so an anchor without an href is skipped
# instead of borrowing text from elsewhere in the document.
_HREF_RE = re.compile(r'<a\s(?:[^>]*?\s)?href="([^"]*)"')


def _fetch_url(url):
    """Fetch ``url`` with urllib2 and return ``(status_code, body)``."""
    response = urllib2.urlopen(url)
    try:
        return response.code, response.read()
    finally:
        # Always release the connection, even if reading the body fails.
        response.close()


def _extract_links(content, base_url):
    """Return the absolute, query- and fragment-free targets of all links."""
    links = []
    for href in _HREF_RE.findall(content):
        # Drop the query string and fragment so the same page reached with
        # different parameters is only crawled once.
        href = href.split('?', 1)[0].split('#', 1)[0]
        # Resolve relative links against the page they were found on.
        links.append(urljoin(base_url, href))
    return links


def _find_kis_url(content):
    """Return the first KIS widget URL in ``content``, or None if absent."""
    start = content.find(_KIS_PREFIX)
    if start < 0:
        return None
    end = content.find('"', start)
    if end < 0:
        # No closing quote: take the rest of the document rather than
        # silently chopping the final character with a -1 slice bound.
        end = len(content)
    return content[start:end]


def check_links(root_page, fetch=None):
    """Crawl every page under ``root_page`` and report KIS widget status.

    Starting at ``root_page``, follows each ``<a href="...">`` link whose
    resolved URL starts with ``root_page``.  For every page containing a KIS
    widget URL (one per page is assumed) a CSV line
    ``page_url,kis_url,status`` is printed, where ``status`` is the status
    code returned for the widget URL or the string ``"ERROR"`` if fetching
    it raised.

    Args:
        root_page: URL at which to start; only URLs beginning with this
            string are crawled.
        fetch: optional callable taking a URL and returning a
            ``(status_code, body)`` pair, raising on failure.  Defaults to
            an urllib2-based fetcher.
    """
    if fetch is None:
        fetch = _fetch_url

    pending = [root_page]
    seen = set()

    while pending:
        page_url = pending.pop()
        if page_url in seen:
            continue
        # Mark the page before fetching so that a page which fails to load
        # is not fetched again each time another page links to it.
        seen.add(page_url)

        try:
            status, content = fetch(page_url)
        except Exception:
            # Best effort: unreachable pages are skipped.  Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            continue
        if status != 200:
            continue

        # Queue links that stay inside the site being checked.
        for link in _extract_links(content, page_url):
            if link.startswith(root_page) and link not in seen:
                pending.append(link)

        # Look for a KIS widget (assume one per page).
        kis_url = _find_kis_url(content)
        if kis_url is None:
            continue

        try:
            status = fetch(kis_url)[0]
        except Exception:
            status = "ERROR"

        # Single parenthesised argument: prints identically on Python 2.7.
        print("%s,%s,%s" % (page_url, kis_url, status))
if __name__ == "__main__":
    import sys

    # Crawl the site given as the first command-line argument; fall back to
    # the placeholder address when none is supplied.
    check_links(sys.argv[1] if len(sys.argv) > 1 else 'http://www.example.ac.uk/')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment