@thunderpoot · Created October 23, 2024 20:11
A shell script to retrieve a single HTML page from a Common Crawl archive
#!/bin/bash
# This script retrieves WARC (Web ARChive) data from Common Crawl based on a specified URL.
# It fetches the metadata for the URL, downloads the relevant segment of the WARC file, and extracts the HTML content.
# The script can also fetch the latest crawl data from Common Crawl's collection info.
# It uses Python's warcio library to extract HTML content and can open the result in the user's default browser.
# Usage: ./script.sh [URL] [optional: crawl name]
# If no crawl name is provided, the latest crawl is automatically selected.
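# Example invocation (the crawl name CC-MAIN-2024-42 is illustrative; any valid crawl id works):
#   ./script.sh "example.com" CC-MAIN-2024-42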
# Fetch the latest crawl if no crawl is provided
if [ -z "$2" ]; then
echo "Fetching the latest crawl info..."
latest_crawl=$(curl -s https://index.commoncrawl.org/collinfo.json | jq -r '.[0].id')
crawl=$latest_crawl
echo "Using latest crawl: $crawl"
else
crawl=$2
fi
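# Note: collinfo.json lists all crawls newest-first; each entry looks roughly
# like {"id": "CC-MAIN-2024-42", "name": "October 2024 Index", ...}, so
# '.[0].id' selects the id of the most recent crawl.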
# Check if the URL is provided
if [ -z "$1" ]; then
echo "Error: No URL provided. Please provide a URL as the first argument."
echo "Usage: ./script.sh [URL] [optional: crawl name]"
exit 1
fi
url=$1
output_filename='output.html'
# Step 1: Fetch the crawl index metadata for the given URL
# The index can return several matching records (one JSON object per line),
# so keep only the first; -G/--data-urlencode safely encodes the query string
metadata=$(curl -sG "https://index.commoncrawl.org/${crawl}-index" \
    --data-urlencode "url=${url}" --data-urlencode "output=json" | head -n 1)

# Check if metadata was retrieved
if [ -z "$metadata" ]; then
    echo "Failed to retrieve metadata. Please check the URL and crawl name."
    exit 1
fi
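# For reference, each index record is a one-line JSON object, roughly:
#   {"url": "...", "mime": "text/html", "status": "200",
#    "length": "12345", "offset": "678901",
#    "filename": "crawl-data/CC-MAIN-.../warc/....warc.gz"}
# offset/length locate this capture inside the (multi-GB) WARC file.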
# Extract necessary information from the metadata
offset=$(echo "$metadata" | jq -r '.offset')
length=$(echo "$metadata" | jq -r '.length')
filename=$(echo "$metadata" | jq -r '.filename')
# Ensure required data is present (jq prints the string "null" for missing fields)
if [ -z "$offset" ] || [ "$offset" == "null" ] || \
   [ -z "$length" ] || [ "$length" == "null" ] || \
   [ -z "$filename" ] || [ "$filename" == "null" ]; then
    echo "Incomplete metadata received. Exiting."
    exit 1
fi
# Display the metadata
echo "Metadata fetched:"
echo "$metadata"
# Step 2: Download the WARC segment using the offset and length
warc_gz_file="/tmp/output.warc.gz"

# Check that the download succeeded: -f makes curl fail on HTTP errors
# instead of saving the error page as if it were WARC data
if ! curl -sf -r "${offset}-$((offset + length - 1))" -o "$warc_gz_file" "https://data.commoncrawl.org/$filename"; then
    echo "Failed to download the WARC file."
    exit 1
fi
# Step 3: Extract the HTML content from the WARC file using warcio
# Step 3a: Check if warcio is installed
if ! python3 -c "import warcio" &> /dev/null; then
    echo "warcio module not found. Installing it..."
    pip3 install warcio
fi
# Step 3b: Extract HTML using Python with content stream handling
rm -f "$output_filename"  # remove any stale output from a previous run
python3 - <<EOF
from warcio.archiveiterator import ArchiveIterator

input_file = "$warc_gz_file"
output_html = "$output_filename"

with open(input_file, 'rb') as stream:
    found_html = False
    for record in ArchiveIterator(stream):
        if record.rec_type == 'response':
            content_type = record.http_headers.get_header('Content-Type')
            if content_type and 'text/html' in content_type:
                with open(output_html, 'wb') as f:
                    # Properly read the entire content stream
                    html_content = record.content_stream().read()
                    if html_content:
                        f.write(html_content)
                        found_html = True
                        print("HTML content successfully extracted.")
                    else:
                        print("Error: Content stream is empty.")
                break
    if not found_html:
        print("No HTML content found in the WARC file.")
EOF
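# Note: content_stream() (rather than warcio's raw_stream) undoes any chunked
# transfer encoding or content encoding recorded with the response, so the
# bytes written above should be plain HTML.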
# Check if the HTML was successfully extracted (-s also rejects an empty file)
if [ ! -s "$output_filename" ]; then
    echo "Failed to extract HTML content from the WARC file."
    exit 1
fi
echo "HTML has been extracted to $output_filename"
# Step 4: Ask if the user wants to view the HTML in the browser
read -p "Do you want to view the HTML in your browser? (y/n): " view_in_browser
if [ "$view_in_browser" == "y" ]; then
    # 'open' is macOS-only; fall back to xdg-open on Linux desktops
    if command -v open &> /dev/null; then
        open "$output_filename"
    else
        xdg-open "$output_filename"
    fi
fi