Created October 21, 2016 07:52
Gist: pete-otaqui/969cc810af662e7e8b5a40482817ac91
download a website for offline browsing with wget
#!/bin/bash

wget -E -k -r -p -e robots=off https://some-site.com/docs/

#### Note the following arguments:
# -E : convert downloaded HTML filenames to have a ".html" suffix
# -k : convert internal links within downloaded files to point to other downloaded files
# -r : recursively download by scanning for internal links in pages
# -p : download "page requisites", i.e. images, styles, scripts
# -e robots=off : ignore robots.txt (some sites use it to block mirroring tools)

#### Other useful arguments
# --no-parent : don't ascend in the path hierarchy (useful for grabbing just a "/docs/" section)
# -A "/index.html,*.svg,*/docs/*" : comma-separated "accept list"; patterns are allowed
# -R "*.eot,*.woff,/archive" : comma-separated "reject list"; patterns are allowed
# -H : span host names (careful you don't try to download the entire web)
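Putting a few of the optional arguments together, here is a hedged sketch of a fuller invocation: mirror only a "/docs/" section and reject web-font files. The URL `https://example.com/docs/` and the helper function `build_mirror_cmd` are illustrative placeholders, not part of the original gist; the function echoes the command instead of executing it, so you can inspect it before running anything against a real site.

```shell
#!/bin/bash
# Sketch: assemble a wget command that mirrors one section of a site.
# --no-parent keeps the crawl inside /docs/; -R skips .eot/.woff fonts.
build_mirror_cmd() {
  local url="$1"
  # Print the command rather than running it (no network access needed).
  printf '%s ' wget -E -k -r -p -e robots=off --no-parent \
    -R '*.eot,*.woff' "$url"
}

build_mirror_cmd "https://example.com/docs/"
echo
```

To actually run the mirror, replace `printf '%s '` with a direct invocation of `wget` (or pipe the echoed command through `sh` once you are happy with it).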