Shell script using curl and jq to retrieve all subdomains for a given domain from a given Common Crawl index
#!/bin/bash
#
# Shell script using curl and jq to retrieve all subdomains for a given domain
# from Common Crawl's most recent index or a specified crawl ID. This script
# dynamically retrieves the latest crawl ID if none is provided, fetches data
# (across multiple pages if necessary), retries failed requests, and extracts
# unique subdomains.
#
# Usage:
#   bash fetch_subdomains.sh <domain> [crawl_id]
#
# Examples:
#   # Fetch all subdomains for commoncrawl.org from the latest index:
#   ./fetch_subdomains.sh commoncrawl.org
#
#   # Fetch all subdomains for commoncrawl.org from the specified crawl ID:
#   ./fetch_subdomains.sh commoncrawl.org CC-MAIN-2024-42
#
# Notes:
#   If no crawl ID is specified, the script retrieves the most recent crawl
#   ID from https://index.commoncrawl.org/collinfo.json. The script determines
#   the total number of pages for the specified domain and crawl ID, and iterates
#   through them to collect data. Each page fetch has a retry mechanism, with up
#   to three attempts per page and a 5-second delay between retries (please don't
#   hammer the index server). Extracted subdomains are saved in subdomains.txt,
#   with each subdomain listed on a new line.
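#
# For a quick manual check, the two index API calls described above can also be
# run by hand; an illustrative example (commoncrawl.org and CC-MAIN-2024-42 are
# placeholder values, not requirements):
#   curl -s "https://index.commoncrawl.org/collinfo.json" | jq -r '.[0].id'
#   curl -s "https://index.commoncrawl.org/CC-MAIN-2024-42-index?url=*.commoncrawl.org&showNumPages=true" | jq -r '.pages'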

# Check if domain is provided
if [ -z "$1" ]; then
    echo "Usage: $0 <domain> [crawl_id]"
    exit 1
fi

# Set domain and optional crawl ID
domain="$1"
crawl_id="$2"

# If no crawl ID provided, fetch the latest crawl ID
if [ -z "$crawl_id" ]; then
    echo "Fetching the latest crawl ID..."
    crawl_id=$(curl -s "https://index.commoncrawl.org/collinfo.json" | jq -r '.[0].id')
    echo "Using latest crawl ID: $crawl_id"
else
    echo "Using specified crawl ID: $crawl_id"
fi

# Determine the number of pages
pages=$(curl -s "https://index.commoncrawl.org/${crawl_id}-index?url=*.$domain&showNumPages=true" | jq -r '.pages')
echo "Total pages to fetch: $pages"

# Set the output file
output_file="output.json"
> "$output_file"

# Loop through each page and append results
for ((i=0; i<pages; i++)); do
    echo "Fetching page $((i+1)) of $pages..."
    success=false
    retries=3

    # Retry loop
    for ((attempt=1; attempt<=retries; attempt++)); do
        curl -s "https://index.commoncrawl.org/${crawl_id}-index?url=*.$domain&output=json&page=$i" -o temp_output.json

        # Validate JSON (the index returns one JSON object per line)
        if jq empty temp_output.json >/dev/null 2>&1; then
            cat temp_output.json >> "$output_file"
            success=true
            break
        else
            echo "Attempt $attempt failed due to invalid JSON. Retrying in 5 seconds..."
            sleep 5
        fi
    done

    # Check if all retries failed
    if [ "$success" = false ]; then
        echo "Failed to fetch valid JSON from page $((i+1)) after $retries attempts. Skipping..."
    fi
done

# Check if output.json has content before proceeding
if [ ! -s "$output_file" ]; then
    echo "No valid data retrieved. Exiting."
    exit 1
fi

# Process the output to get unique subdomains, one per line
echo "Processing results to extract unique subdomains..."
jq -r 'select(type == "object") | .url' "$output_file" | awk -F/ '{print $3}' | sort -u > subdomains.txt
echo "Output saved to subdomains.txt"