Created
August 12, 2025 20:42
-
-
Save lmmx/15c8d3b5f3671a1a787ff8abfee69fe2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime as dt | |
import random | |
from functools import wraps | |
import polars as pl | |
from narlogs import print_step | |
def load_dog_data(n_rows=100_000):
    """Load sample dog registry data.

    Builds a synthetic registry in which every column is derived from the
    row index by cycling through a fixed set of values, so the same
    ``n_rows`` always yields the same frame.

    Args:
        n_rows: Number of rows to generate. Defaults to 100,000.

    Returns:
        A ``pl.DataFrame`` with the columns ``dog_id``, ``breed``, ``age``,
        ``weight``, ``color`` and ``city``, each of length ``n_rows``.
    """
    # The columns below are index-derived and never draw from ``random``;
    # the seed call is kept so the global RNG state seen by callers after
    # this function runs is unchanged.
    random.seed(42)  # For reproducible results
    breeds = [
        "Golden Retriever",
        "German Shepherd",
        "Labrador",
        "Bulldog",
        "Poodle",
        "Beagle",
        "Rottweiler",
        "Husky",
    ]
    colors = ["Brown", "Black", "White", "Golden", "Gray", "Mixed"]
    cities = ["Boston", "NYC", "Chicago", "LA"]
    return pl.DataFrame(
        {
            "dog_id": list(range(1, n_rows + 1)),
            "breed": [breeds[i % len(breeds)] for i in range(n_rows)],
            "age": [1 + (i % 15) for i in range(n_rows)],  # 1-15 years
            "weight": [10 + (i % 90) for i in range(n_rows)],  # 10-99 lbs
            "color": [colors[i % len(colors)] for i in range(n_rows)],
            # Cycle by index like the other columns so the length is always
            # exactly n_rows, even when n_rows is not a multiple of 4.
            "city": [cities[i % len(cities)] for i in range(n_rows)],
        }
    )
@print_step
def filter_adult_dogs(df):
    """Keep only the rows whose ``age`` is at least 2."""
    is_adult = pl.col("age") >= 2
    return df.filter(is_adult)
@print_step
def add_size_category(df):
    """Append a ``size_category`` column derived from ``weight``.

    Weights above 70 are "Large", weights above 40 (up to 70) are
    "Medium", and everything else is "Small".
    """
    weight = pl.col("weight")
    size_label = (
        pl.when(weight > 70)
        .then(pl.lit("Large"))
        .when(weight > 40)
        .then(pl.lit("Medium"))
        .otherwise(pl.lit("Small"))
    )
    return df.with_columns(size_label.alias("size_category"))
@print_step(shape=False)
def calculate_breed_stats(df):
    """Calculate statistics per breed.

    Groups by ``breed`` and produces one row per breed with:

    - ``avg_age``: mean of ``age``
    - ``avg_weight``: mean of ``weight``
    - ``dog_count``: count of ``dog_id`` values
    - ``most_common_color``: the most frequent ``color``; when several
      colors tie, the alphabetically first one is chosen
    """
    # maintain_order=True keeps groups in first-appearance order so the
    # output (and any order-sensitive step applied to it afterwards) is
    # reproducible from run to run.
    return df.group_by("breed", maintain_order=True).agg(
        [
            pl.col("age").mean().alias("avg_age"),
            pl.col("weight").mean().alias("avg_weight"),
            pl.col("dog_id").count().alias("dog_count"),
            # mode() may return several tied values in unspecified order;
            # sorting first makes the tie-break deterministic.
            pl.col("color").mode().sort().first().alias("most_common_color"),
        ]
    )
@print_step(shape=False)
def rank_breeds_by_popularity(df):
    """Add a ``popularity_rank`` column and sort by it.

    The rank is an ordinal ranking of ``dog_count`` in descending order,
    so the breed with the highest count gets rank 1.
    """
    by_count_desc = pl.col("dog_count").rank(method="ordinal", descending=True)
    ranked = df.with_columns(by_count_desc.alias("popularity_rank"))
    return ranked.sort("popularity_rank")
@print_step(shape=False)
def get_top_breeds(df, n=5):
    """Return the first ``n`` rows of ``df`` (5 by default).

    The frame is taken as-is, so it yields the most popular breeds only if
    the input is already ordered by popularity.
    """
    leading_rows = df.head(n)
    return leading_rows
def add_profiling_stats(profile_df):
    """Extend a profile frame with ``duration`` and ``percent_total``.

    ``duration`` is ``end - start`` for each row; ``percent_total`` is that
    row's duration as a percentage of the summed durations.
    """
    elapsed = pl.col("end") - pl.col("start")
    timed = profile_df.with_columns([elapsed.alias("duration")])

    # Computed in a second step because it reads the "duration" column
    # created just above.
    share_of_total = pl.col("duration") / pl.col("duration").sum() * 100
    return timed.with_columns([share_of_total.alias("percent_total")])
if __name__ == "__main__":
    # Eager part of the dog breed analysis: these steps run immediately
    # on the DataFrame returned by load_dog_data().
    adult_dogs = load_dog_data().pipe(filter_adult_dogs)
    sized_dogs = adult_dogs.pipe(add_size_category)

    # Lazy part: switch to a LazyFrame, then queue up the aggregation,
    # ranking and top-N steps without executing them yet.
    pipeline = (
        sized_dogs.lazy()
        .pipe(calculate_breed_stats)
        .pipe(rank_breeds_by_popularity)
        .pipe(get_top_breeds, n=8)
    )

    print("Executing dog breed analysis pipeline with profiling...")
    # profile() executes the lazy query and hands back both the result
    # and a frame of per-node timings.
    result, profile_df = pipeline.profile()

    # Derive duration / percentage columns from the raw timings.
    timings = add_profiling_stats(profile_df)

    print(f"\nResult: {result}")

    print("\nTop breed statistics:")
    print(result)

    print("\nProfiling timings:")
    print(timings)
Author
lmmx
commented
Aug 12, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment