This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE IF NOT EXISTS commoncrawl_index -- let’s create a new table with the following columns: | |
( | |
url_surtkey STRING, -- Sort-friendly URI Reordering Transform | |
url STRING, -- the URL (duh) including protocol (http or https) | |
url_host_name STRING, -- the hostname, including subdomain(s) | |
url_host_tld STRING, -- the top-level domain such as `.org` | |
url_host_registered_domain STRING, -- the registered domain name | |
url_host_private_domain STRING, -- private domain such as `example.com` | |
url_host_public_suffix STRING, -- public suffix of the domain such as `.co.uk` or `.edu` | |
url_protocol STRING, -- the transfer protocol used, (http or https) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Shell script using curl and jq to retrieve all subdomains for a given domain | |
# from Common Crawl's most recent index or a specified crawl ID. This script | |
# dynamically retrieves the latest crawl ID if none is provided, fetches data | |
# (across multiple pages if necessary), retries failed requests, and extracts | |
# unique subdomains. | |
# Usage: | |
# bash fetch_subdomains.sh <domain> [crawl_id] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# This script retrieves WARC (Web ARChive) data from Common Crawl based on a specified URL. | |
# It fetches the metadata for the URL, downloads the relevant segment of the WARC file, and extracts the HTML content. | |
# The script can also fetch the latest crawl data from Common Crawl's collection info. | |
# It uses Python's warcio library to extract HTML content and can open the result in the user's default browser. | |
# Usage: ./script.sh [URL] [optional: crawl name] | |
# If no crawl name is provided, the latest crawl is automatically selected. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# _ _ | |
# __ __ (_) _ __ _ _ | | _ __ _ _ | |
# \ \ / / | | | '_ \ | | | | | | | '_ \ | | | | | |
# \ V / | | | | | | | |_| | | | _ | |_) | | |_| | | |
# \_/ |_| |_| |_| \__, | |_| (_) | .__/ \__, | | |
# |___/ |_| |___/ | |
# This command-line program allows you to change the playback speed of an |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pyarrow.parquet as pq | |
def describe_parquet(file_path): | |
file_size = os.path.getsize(file_path) | |
print(f"File Size: {file_size} bytes") | |
table = pq.read_table(file_path) | |
columns = table.column_names |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# You know that really annoying message that pops up... | |
# Mosh: You have 3 detached Mosh sessions on this server, with PIDs: | |
# - mosh [2294539] | |
# - mosh [1874313] | |
# - mosh [2294805] | |
# I often find myself copying this list of PIDs in order to kill them manually |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
# For parsing URLs: | |
from urllib.parse import quote_plus | |
# For parsing WARC records: | |
from warcio.archiveiterator import ArchiveIterator | |
# The URL of the Common Crawl Index server |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# This script is useful when proofing with only some glyphs completed… | |
# Usage example: | |
# $ perl findwords.pl qwertyasdf | |
# Searching for words in /usr/share/dict/words containing only q, w, e, r, t, y, a, s, d, f | |
# westerwards | |
# afterstate | |
# aftertaste |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
if [ $# -eq 0 ] | |
then | |
echo "%usage: $0 <id> [options]" | |
exit | |
fi | |
echo "[$0] 🌶 Now servin' up hot GIFs!" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import random | |
import time | |
import sys | |
class Unbuffered( object ) : | |
def __init__( self, stream ) : | |
self.stream = stream | |
def write( self, data ) : |
NewerOlder