@thunderpoot
Created November 6, 2024 13:37
Shell script using curl and jq to retrieve all subdomains for a given domain from a given Common Crawl index
#!/bin/bash
#
# Shell script using curl and jq to retrieve all subdomains for a given domain
# from Common Crawl's most recent index or a specified crawl ID. This script
# dynamically retrieves the latest crawl ID if none is provided, fetches data
# (across multiple pages if necessary), retries failed requests, and extracts
# unique subdomains.
#
# Usage:
#   bash fetch_subdomains.sh <domain> [crawl_id]
#
# Examples:
#   # Fetch all subdomains for commoncrawl.org from the latest index:
#   ./fetch_subdomains.sh commoncrawl.org
#
#   # Fetch all subdomains for commoncrawl.org from the specified crawl ID:
#   ./fetch_subdomains.sh commoncrawl.org CC-MAIN-2024-42
#
# Notes:
#   If no crawl ID is specified, the script retrieves the most recent crawl
#   ID from https://index.commoncrawl.org/collinfo.json. The script determines
#   the total number of pages for the specified domain and crawl ID, and iterates
#   through them to collect data. Each page fetch has a retry mechanism, with up
#   to three attempts per page and a 5-second delay between retries (please don't
#   hammer the index server). Extracted subdomains are saved in subdomains.txt,
#   with each subdomain listed on a new line.
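#
#   For reference, the index returns one JSON object per line (NDJSON) for each
#   matching capture; the fields typically include "urlkey", "timestamp", "url",
#   "status", "mime", "digest", "length", "offset" and "filename" (the exact set
#   can vary between crawls), but this script only uses the "url" field.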

# Check if domain is provided
if [ -z "$1" ]; then
  echo "Usage: $0 <domain> [crawl_id]"
  exit 1
fi

# Set domain and optional crawl ID
domain="$1"
crawl_id="$2"

# If no crawl ID provided, fetch the latest crawl ID
if [ -z "$crawl_id" ]; then
  echo "Fetching the latest crawl ID..."
  crawl_id=$(curl -s "https://index.commoncrawl.org/collinfo.json" | jq -r '.[0].id')
  echo "Using latest crawl ID: $crawl_id"
else
  echo "Using specified crawl ID: $crawl_id"
fi
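# Note: collinfo.json is expected to be a JSON array of crawl objects listed
# newest first, each with at least an "id" field such as "CC-MAIN-2024-42",
# which is why '.[0].id' above picks the most recent crawl.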
# Determine the number of pages
pages=$(curl -s "https://index.commoncrawl.org/${crawl_id}-index?url=*.$domain&showNumPages=true" | jq -r '.pages')
echo "Total pages to fetch: $pages"
# Set the output file
output_file="output.json"
> "$output_file"

# Loop through each page and append results
for ((i=0; i<pages; i++)); do
  echo "Fetching page $((i+1)) of $pages..."
  success=false
  retries=3

  # Retry loop: up to three attempts per page, with a 5-second pause between tries
  for ((attempt=1; attempt<=retries; attempt++)); do
    curl -s "https://index.commoncrawl.org/${crawl_id}-index?url=*.$domain&output=json&page=$i" -o temp_output.json

    # Validate the response is JSON before appending (the index may return an
    # error page or a truncated body instead)
    if jq empty temp_output.json >/dev/null 2>&1; then
      cat temp_output.json >> "$output_file"
      success=true
      break
    else
      echo "Attempt $attempt failed due to invalid JSON. Retrying in 5 seconds..."
      sleep 5
    fi
  done

  # Check if all retries failed
  if [ "$success" = false ]; then
    echo "Failed to fetch valid JSON from page $((i+1)) after $retries attempts. Skipping..."
  fi
done

# Check if output.json has content before proceeding
if [ ! -s "$output_file" ]; then
  echo "No valid data retrieved. Exiting."
  exit 1
fi
# Process the output to get unique subdomains, one per line
echo "Processing results to extract unique subdomains..."
jq -r 'select(type == "object") | .url' "$output_file" | awk -F/ '{print $3}' | sort -u > subdomains.txt
rm -f temp_output.json   # clean up the temporary per-page download
echo "Output saved to subdomains.txt"