Crawl a site to find 404, 301, 302, 500, etc. responses
# Crawl a site's public URLs to produce a CSV list of URLs and response codes.
# This could be reduced to a single command, but I find it helpful to keep a list of all URLs.
# Overview: crawl the site and write one URL per line to a text file.
# NOTE: this must run to completion before the next step.
# wget mirrors the site (including static files).
# grep keeps only the request log lines (they start with "--" and contain the URL).
# awk grabs the 3rd field (space-separated), which is the URL, and writes it to urls.txt.
wget --mirror -p https://domain.com/ 2>&1 | grep '^--' | awk '{ print $3 }' > urls.txt
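# Optional follow-up (a small sketch, assuming the urls.txt produced above):
# wget can log the same URL more than once, so deduplicating before testing
# avoids redundant requests in the next step.
sort -u urls.txt -o urls.txt
wc -l urls.txt # number of unique URLs that will be tested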
# Overview: given a file with one URL per line, output a CSV of URLs and response codes.
# cat reads the file.
# xargs runs curl for each URL.
# -n 1 passes one URL per curl invocation so the requests can actually run in parallel.
# -P 10 runs up to 10 parallel processes.
# --user-agent matches Google's bot.
# --head requests only the response headers (HTTP HEAD).
# --write-out replaces the default output with our own format. Check out the available variables: https://ec.haxx.se/usingcurl-verbose.html#available---write-out-variables
# tee writes the piped content to a file while still printing it to the terminal.
cat urls.txt | xargs -n 1 -P 10 curl --user-agent "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" -o /dev/null --silent --head --write-out '%{url_effective};%{http_code};\n' | tee tested-urls.csv
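# A short follow-up sketch (assumes the tested-urls.csv produced above, with
# ';' as the field separator): keep only the URLs that did not return 200,
# i.e. the 404/301/302/500 responses this gist is meant to find.
awk -F';' '$2 != "200"' tested-urls.csv > problem-urls.csv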