Skip to content

Instantly share code, notes, and snippets.

@lmmx
Created August 12, 2025 20:42
Show Gist options
  • Save lmmx/15c8d3b5f3671a1a787ff8abfee69fe2 to your computer and use it in GitHub Desktop.
Save lmmx/15c8d3b5f3671a1a787ff8abfee69fe2 to your computer and use it in GitHub Desktop.
import datetime as dt
import random
from functools import wraps
import polars as pl
from narlogs import print_step
def load_dog_data(n_rows=100_000):
    """Build a synthetic dog-registry DataFrame.

    Every column is produced by cycling through a fixed set of values using
    the row index, so the generated data is the same on every call.

    Args:
        n_rows: Number of rows to generate. Defaults to 100_000.

    Returns:
        A ``pl.DataFrame`` with the columns ``dog_id``, ``breed``, ``age``,
        ``weight``, ``color`` and ``city``.
    """
    # Nothing below draws from `random`; the seed call is kept only so the
    # function's effect on the global RNG state stays as it was.
    random.seed(42)
    breeds = [
        "Golden Retriever",
        "German Shepherd",
        "Labrador",
        "Bulldog",
        "Poodle",
        "Beagle",
        "Rottweiler",
        "Husky",
    ]
    colors = ["Brown", "Black", "White", "Golden", "Gray", "Mixed"]
    cities = ["Boston", "NYC", "Chicago", "LA"]
    return pl.DataFrame(
        {
            "dog_id": list(range(1, n_rows + 1)),
            "breed": [breeds[i % len(breeds)] for i in range(n_rows)],
            "age": [1 + (i % 15) for i in range(n_rows)],  # 1-15 years
            "weight": [10 + (i % 90) for i in range(n_rows)],  # 10-99 lbs
            "color": [colors[i % len(colors)] for i in range(n_rows)],
            # Cycle by index like the other columns so the length is always
            # n_rows; repeating the list (n_rows // 4) times comes up short
            # whenever n_rows is not a multiple of 4.
            "city": [cities[i % len(cities)] for i in range(n_rows)],
        }
    )
@print_step
def filter_adult_dogs(df):
    """Keep only the rows whose ``age`` is at least 2."""
    is_adult = pl.col("age") >= 2
    return df.filter(is_adult)
@print_step
def add_size_category(df):
    """Append a ``size_category`` column derived from ``weight``.

    Weights above 70 are labelled "Large", weights above 40 are labelled
    "Medium", and every remaining row is labelled "Small".
    """
    weight = pl.col("weight")
    # Branches are checked in order, so "Medium" only applies to rows that
    # did not already match the "Large" threshold.
    size_label = (
        pl.when(weight > 70)
        .then(pl.lit("Large"))
        .when(weight > 40)
        .then(pl.lit("Medium"))
        .otherwise(pl.lit("Small"))
    )
    return df.with_columns(size_label.alias("size_category"))
@print_step(shape=False)
def calculate_breed_stats(df):
    """Aggregate one summary row per ``breed``.

    The output carries the mean ``age`` (``avg_age``), the mean ``weight``
    (``avg_weight``), the count of ``dog_id`` values (``dog_count``) and the
    first entry of the ``color`` mode (``most_common_color``).
    """
    mean_age = pl.col("age").mean().alias("avg_age")
    mean_weight = pl.col("weight").mean().alias("avg_weight")
    n_dogs = pl.col("dog_id").count().alias("dog_count")
    # mode() can yield several values; only the first one is kept.
    top_color = pl.col("color").mode().first().alias("most_common_color")

    per_breed = df.group_by("breed")
    return per_breed.agg(mean_age, mean_weight, n_dogs, top_color)
@print_step(shape=False)
def rank_breeds_by_popularity(df):
    """Add a ``popularity_rank`` column and sort the rows by it.

    The rank is computed over ``dog_count`` in descending order with the
    "ordinal" method, so the largest count receives rank 1.
    """
    rank_expr = pl.col("dog_count").rank(method="ordinal", descending=True)
    ranked = df.with_columns(rank_expr.alias("popularity_rank"))
    return ranked.sort("popularity_rank")
@print_step(shape=False)
def get_top_breeds(df, n=5):
    """Return the first ``n`` rows of ``df`` (5 by default)."""
    leading_rows = df.head(n)
    return leading_rows
def add_profiling_stats(profile_df):
    """Extend a profile frame with ``duration`` and ``percent_total`` columns.

    ``duration`` is ``end - start`` for each row; ``percent_total`` is that
    row's duration as a percentage of the summed duration of all rows.
    """
    elapsed = pl.col("end") - pl.col("start")
    with_duration = profile_df.with_columns(elapsed.alias("duration"))

    # Computed in a second step because it reads the column added above.
    share = pl.col("duration") / pl.col("duration").sum() * 100
    return with_duration.with_columns(share.alias("percent_total"))
if __name__ == "__main__":
    # Eager stage: generate the registry, drop the youngest dogs and label
    # each remaining dog with a size category.
    eager_frame = load_dog_data().pipe(filter_adult_dogs).pipe(add_size_category)

    # Lazy stage: the per-breed aggregation, ranking and top-N selection are
    # chained on a LazyFrame and only executed by profile() below.
    pipeline = (
        eager_frame.lazy()
        .pipe(calculate_breed_stats)
        .pipe(rank_breeds_by_popularity)
        .pipe(get_top_breeds, n=8)
    )

    print("Executing dog breed analysis pipeline with profiling...")
    # profile() yields the query result together with a timing frame.
    result, profile_df = pipeline.profile()
    timings = add_profiling_stats(profile_df)

    print(f"\nResult: {result}")
    for heading, frame in (
        ("\nTop breed statistics:", result),
        ("\nProfiling timings:", timings),
    ):
        print(heading)
        print(frame)
@lmmx
Copy link
Author

lmmx commented Aug 12, 2025

{"step":"filter_adult_dogs","time":"a moment","n_obs":93333,"n_col":6,"dtypes":{"dog_id":"Int64","breed":"String","age":"Int64","weight":"Int64","color":"String","city":"String"},"func_call":"filter_adult_dogs()"}
{"step":"add_size_category","time":"a moment","n_obs":93333,"n_col":7,"dtypes":{"dog_id":"Int64","breed":"String","age":"Int64","weight":"Int64","color":"String","city":"String","size_category":"String"},"func_call":"add_size_category()"}
{"step":"calculate_breed_stats","time":"a moment","dtypes":{"breed":"String","avg_age":"Float64","avg_weight":"Float64","dog_count":"UInt32","most_common_color":"String"},"func_call":"calculate_breed_stats()"}
{"step":"rank_breeds_by_popularity","time":"a moment","dtypes":{"breed":"String","avg_age":"Float64","avg_weight":"Float64","dog_count":"UInt32","most_common_color":"String","popularity_rank":"UInt32"},"func_call":"rank_breeds_by_popularity()"}
{"step":"get_top_breeds","time":"a moment","dtypes":{"breed":"String","avg_age":"Float64","avg_weight":"Float64","dog_count":"UInt32","most_common_color":"String","popularity_rank":"UInt32"},"func_call":"get_top_breeds(n = 8)"}
Executing dog breed analysis pipeline with profiling...

Result: shape: (8, 6)
┌──────────────────┬──────────┬────────────┬───────────┬───────────────────┬─────────────────┐
│ breed            ┆ avg_age  ┆ avg_weight ┆ dog_count ┆ most_common_color ┆ popularity_rank │
│ ---              ┆ ---      ┆ ---        ┆ ---       ┆ ---               ┆ ---             │
│ str              ┆ f64      ┆ f64        ┆ u32       ┆ str               ┆ u32             │
╞══════════════════╪══════════╪════════════╪═══════════╪═══════════════════╪═════════════════╡
│ Poodle           ┆ 8.500214 ┆ 54.994429  ┆ 11667     ┆ Gray              ┆ 1               │
│ Beagle           ┆ 8.500643 ┆ 54.992286  ┆ 11667     ┆ Mixed             ┆ 2               │
│ German Shepherd  ┆ 8.498929 ┆ 55.000857  ┆ 11667     ┆ Black             ┆ 3               │
│ Labrador         ┆ 8.499357 ┆ 54.998714  ┆ 11667     ┆ Gray              ┆ 4               │
│ Bulldog          ┆ 8.499786 ┆ 54.996572  ┆ 11667     ┆ Mixed             ┆ 5               │
│ Golden Retriever ┆ 8.499143 ┆ 54.999143  ┆ 11666     ┆ White             ┆ 6               │
│ Rottweiler       ┆ 8.500429 ┆ 54.994     ┆ 11666     ┆ White             ┆ 7               │
│ Husky            ┆ 8.499571 ┆ 54.993142  ┆ 11666     ┆ Black             ┆ 8               │
└──────────────────┴──────────┴────────────┴───────────┴───────────────────┴─────────────────┘

Top breed statistics:
shape: (8, 6)
┌──────────────────┬──────────┬────────────┬───────────┬───────────────────┬─────────────────┐
│ breed            ┆ avg_age  ┆ avg_weight ┆ dog_count ┆ most_common_color ┆ popularity_rank │
│ ---              ┆ ---      ┆ ---        ┆ ---       ┆ ---               ┆ ---             │
│ str              ┆ f64      ┆ f64        ┆ u32       ┆ str               ┆ u32             │
╞══════════════════╪══════════╪════════════╪═══════════╪═══════════════════╪═════════════════╡
│ Poodle           ┆ 8.500214 ┆ 54.994429  ┆ 11667     ┆ Gray              ┆ 1               │
│ Beagle           ┆ 8.500643 ┆ 54.992286  ┆ 11667     ┆ Mixed             ┆ 2               │
│ German Shepherd  ┆ 8.498929 ┆ 55.000857  ┆ 11667     ┆ Black             ┆ 3               │
│ Labrador         ┆ 8.499357 ┆ 54.998714  ┆ 11667     ┆ Gray              ┆ 4               │
│ Bulldog          ┆ 8.499786 ┆ 54.996572  ┆ 11667     ┆ Mixed             ┆ 5               │
│ Golden Retriever ┆ 8.499143 ┆ 54.999143  ┆ 11666     ┆ White             ┆ 6               │
│ Rottweiler       ┆ 8.500429 ┆ 54.994     ┆ 11666     ┆ White             ┆ 7               │
│ Husky            ┆ 8.499571 ┆ 54.993142  ┆ 11666     ┆ Black             ┆ 8               │
└──────────────────┴──────────┴────────────┴───────────┴───────────────────┴─────────────────┘

Profiling timings:
shape: (4, 5)
┌──────────────────────────────┬───────┬──────┬──────────┬───────────────┐
│ node                         ┆ start ┆ end  ┆ duration ┆ percent_total │
│ ---                          ┆ ---   ┆ ---  ┆ ---      ┆ ---           │
│ str                          ┆ u64   ┆ u64  ┆ u64      ┆ f64           │
╞══════════════════════════════╪═══════╪══════╪══════════╪═══════════════╡
│ optimization                 ┆ 0     ┆ 118  ┆ 118      ┆ 3.75557       │
│ group_by(breed)              ┆ 118   ┆ 3024 ┆ 2906     ┆ 92.488861     │
│ with_column(popularity_rank) ┆ 3034  ┆ 3092 ┆ 58       ┆ 1.845958      │
│ sort(popularity_rank)        ┆ 3095  ┆ 3155 ┆ 60       ┆ 1.909612      │
└──────────────────────────────┴───────┴──────┴──────────┴───────────────┘

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment