Created: October 23, 2024 20:11
-
-
Save thunderpoot/71f8b9cbcf32ff4b8392805c4404209a to your computer and use it in GitHub Desktop.
A shell script to retrieve a single HTML page from a Common Crawl archive
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Retrieve a single HTML page from a Common Crawl archive.
#
# Steps:
#   1. Query the Common Crawl index for metadata about the requested URL.
#   2. Download only the relevant byte range of the matching WARC file.
#   3. Extract the HTML payload using Python's warcio library.
#   4. Optionally open the result in the user's default browser.
#
# Usage: ./script.sh [URL] [optional: crawl name]
# If no crawl name is provided, the latest crawl is automatically selected.
# Requirements: curl, jq, python3 (the warcio module is installed on demand).
# Determine which crawl to query: use $2 when given, otherwise ask the
# collinfo.json endpoint for the most recent crawl id.
if [ -z "${2:-}" ]; then
  echo "Fetching the latest crawl info..."
  # collinfo.json lists collections newest-first, so .[0].id is the latest.
  latest_crawl=$(curl -sf https://index.commoncrawl.org/collinfo.json | jq -r '.[0].id')
  # A curl/jq failure leaves the variable empty, and jq -r prints the
  # literal string "null" for a missing key — catch both before continuing.
  if [ -z "$latest_crawl" ] || [ "$latest_crawl" = "null" ]; then
    echo "Error: could not determine the latest crawl id." >&2
    exit 1
  fi
  crawl=$latest_crawl
  echo "Using latest crawl: $crawl"
else
  crawl=$2
fi
# The first positional argument is the page URL; refuse to continue
# without it, printing usage help for the caller.
url=${1:-}
if [ -z "$url" ]; then
  echo "Error: No URL provided. Please provide a URL as the first argument."
  echo "Usage: ./script.sh [URL] [optional: crawl name]"
  exit 1
fi
# Extracted HTML lands here in the current working directory.
output_filename='output.html'
# Step 1: Fetch the index metadata for the given URL.
# -G + --data-urlencode properly encodes the URL in the query string.
# The index returns one JSON object per captured record, so keep only the
# first line — otherwise jq '.offset' would print one value per record and
# corrupt the byte-range request below.
metadata=$(curl -sf -G "https://index.commoncrawl.org/${crawl}-index" \
  --data-urlencode "url=${url}" \
  --data-urlencode "output=json" | head -n 1)
# Check if metadata was retrieved
if [ -z "$metadata" ]; then
  echo "Failed to retrieve metadata. Please check the URL and crawl name."
  exit 1
fi
# Extract the byte range and target WARC file from the metadata
offset=$(echo "$metadata" | jq -r '.offset')
length=$(echo "$metadata" | jq -r '.length')
filename=$(echo "$metadata" | jq -r '.filename')
# Ensure required data is present. jq -r emits the literal string "null"
# for a missing key, so an emptiness test alone is not enough.
for field in "$offset" "$length" "$filename"; do
  if [ -z "$field" ] || [ "$field" = "null" ]; then
    echo "Incomplete metadata received. Exiting."
    exit 1
  fi
done
# Display the metadata
echo "Metadata fetched:"
echo "$metadata"
# Step 2: Download only the record's byte range from the WARC file.
warc_gz_file="/tmp/output.warc.gz"
range_end=$((offset + length - 1))
# Check curl's exit status (-f makes HTTP errors fail) instead of testing
# for the file's existence: curl -o creates the output file even when the
# transfer fails, so [ -f ... ] would always succeed.
if ! curl -sf -r "${offset}-${range_end}" -o "$warc_gz_file" \
    "https://data.commoncrawl.org/$filename"; then
  echo "Failed to download the WARC file."
  exit 1
fi
# Also guard against a zero-byte download (e.g. a rejected range request).
if [ ! -s "$warc_gz_file" ]; then
  echo "Failed to download the WARC file."
  exit 1
fi
# Step 3: Extract the HTML content from the WARC file using warcio
# Step 3a: Ensure the warcio module is available; abort if install fails,
# otherwise the Python step below would die with a raw traceback.
if ! python3 -c "import warcio" &> /dev/null; then
  echo "warcio module not found. Installing it..."
  if ! pip3 install warcio; then
    echo "Failed to install warcio. Exiting." >&2
    exit 1
  fi
fi
# Step 3b: Remove any stale output so the success check after this step
# cannot be fooled by a file left over from a previous run.
rm -f -- "$output_filename"
# Step 3c: Extract the first HTML response record with warcio.
python3 - <<EOF
from warcio.archiveiterator import ArchiveIterator

input_file = "$warc_gz_file"
output_html = "$output_filename"

with open(input_file, 'rb') as stream:
    found_html = False
    for record in ArchiveIterator(stream):
        if record.rec_type != 'response':
            continue
        content_type = record.http_headers.get_header('Content-Type')
        if content_type and 'text/html' in content_type:
            # Read the payload first and only create the output file when
            # there is actual content, so an empty stream does not leave a
            # bogus empty output file behind.
            html_content = record.content_stream().read()
            if html_content:
                with open(output_html, 'wb') as f:
                    f.write(html_content)
                found_html = True
                print("HTML content successfully extracted.")
            else:
                print("Error: Content stream is empty.")
            break
    if not found_html:
        print("No HTML content found in the WARC file.")
EOF
# Check if the HTML was successfully extracted
if [ ! -f "$output_filename" ]; then
  echo "Failed to extract HTML content from the WARC file."
  exit 1
fi
echo "HTML has been extracted to $output_filename"
# Step 4: Ask if the user wants to view the HTML in the browser.
# -r stops read from mangling backslashes in the answer.
read -r -p "Do you want to view the HTML in your browser? (y/n): " view_in_browser
if [ "$view_in_browser" = "y" ]; then
  # 'open' exists on macOS; fall back to xdg-open on Linux desktops.
  if command -v open > /dev/null 2>&1; then
    open "$output_filename"
  elif command -v xdg-open > /dev/null 2>&1; then
    xdg-open "$output_filename"
  else
    echo "No browser opener found; please view $output_filename manually." >&2
  fi
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment