lmmx · August 12, 2025 20:42 · lmmx · Aug 12, 2025
diff --git a/lf_log.py b/lf_log.py
 import datetime as dt
 import random
 from functools import wraps

 import polars as pl
 from narlogs import print_step


 def load_dog_data(n_rows=100_000):
    """Build a synthetic dog-registry table.

    Every column is derived from the row index with modular arithmetic, so
    the output is identical on every call with the same ``n_rows``.

    Args:
        n_rows: Number of rows to generate. Defaults to 100_000.

    Returns:
        A ``pl.DataFrame`` with the columns ``dog_id``, ``breed``, ``age``,
        ``weight``, ``color`` and ``city``, each holding ``n_rows`` values.
    """
    # Kept so the global RNG state after this call is unchanged for callers;
    # note that nothing below actually draws from ``random``.
    random.seed(42)

    breeds = [
        "Golden Retriever",
        "German Shepherd",
        "Labrador",
        "Bulldog",
        "Poodle",
        "Beagle",
        "Rottweiler",
        "Husky",
    ]
    colors = ["Brown", "Black", "White", "Golden", "Gray", "Mixed"]
    cities = ["Boston", "NYC", "Chicago", "LA"]

    def cycle(values):
        """Repeat ``values`` round-robin until there are ``n_rows`` entries."""
        return [values[i % len(values)] for i in range(n_rows)]

    return pl.DataFrame(
        {
            "dog_id": list(range(1, n_rows + 1)),
            "breed": cycle(breeds),
            "age": [1 + (i % 15) for i in range(n_rows)],  # 1-15 years
            "weight": [10 + (i % 90) for i in range(n_rows)],  # 10-99 lbs
            "color": cycle(colors),
            # Built per row (not ``list * (n_rows // 4)``) so the length
            # matches the other columns even when n_rows is not a multiple of 4.
            "city": cycle(cities),
        }
    )


 @print_step
 def filter_adult_dogs(df):
    """Keep only the rows whose ``age`` is at least 2."""
    is_adult = pl.col("age") >= 2
    return df.filter(is_adult)


 @print_step
 def add_size_category(df):
    """Append a ``size_category`` column derived from ``weight``.

    Rows with weight above 70 are labelled "Large", rows above 40 (and not
    above 70) are labelled "Medium", and every other row is labelled "Small".
    """
    weight = pl.col("weight")
    # Branches are evaluated in order, so the > 70 test wins over > 40.
    size = (
        pl.when(weight > 70)
        .then(pl.lit("Large"))
        .when(weight > 40)
        .then(pl.lit("Medium"))
        .otherwise(pl.lit("Small"))
    )
    return df.with_columns(size.alias("size_category"))


 @print_step(shape=False)
 def calculate_breed_stats(df):
    """Aggregate per-breed statistics.

    Groups ``df`` by ``breed`` and produces one row per breed with:

    * ``avg_age`` - mean of ``age``
    * ``avg_weight`` - mean of ``weight``
    * ``dog_count`` - count of ``dog_id`` values
    * ``most_common_color`` - the most frequent ``color``; when several
      colors tie, the one that sorts first is reported
    """
    return df.group_by("breed").agg(
        [
            pl.col("age").mean().alias("avg_age"),
            pl.col("weight").mean().alias("avg_weight"),
            pl.col("dog_id").count().alias("dog_count"),
            # ``mode()`` can return several values on a tie; sorting before
            # ``first()`` makes the reported color reproducible across runs.
            pl.col("color").mode().sort().first().alias("most_common_color"),
        ]
    )


 @print_step(shape=False)
 def rank_breeds_by_popularity(df):
    """Attach a ``popularity_rank`` column and order the rows by it.

    The rank is computed from ``dog_count`` in descending order with the
    "ordinal" method, so the largest count receives rank 1.
    """
    popularity = pl.col("dog_count").rank(method="ordinal", descending=True)
    ranked = df.with_columns(popularity.alias("popularity_rank"))
    return ranked.sort("popularity_rank")


 @print_step(shape=False)
 def get_top_breeds(df, n=5):
    """Return the first ``n`` rows of ``df`` (5 unless ``n`` is given).

    These are the most popular breeds only when ``df`` is already ordered by
    popularity; this function does no sorting of its own.
    """
    leading_rows = df.head(n)
    return leading_rows


 def add_profiling_stats(profile_df):
    """Extend a profile table with ``duration`` and ``percent_total``.

    ``duration`` is ``end - start`` for each row, and ``percent_total`` is
    that row's duration divided by the sum of all durations, times 100.
    """
    elapsed = (pl.col("end") - pl.col("start")).alias("duration")
    with_duration = profile_df.with_columns([elapsed])

    # Second step is separate because it reads the column created above.
    duration = pl.col("duration")
    share = (duration / duration.sum() * 100).alias("percent_total")
    return with_duration.with_columns([share])


 if __name__ == "__main__":
    # Eager stages: generate the registry, drop dogs under 2, label sizes.
    labelled = add_size_category(filter_adult_dogs(load_dog_data()))

    # Lazy stages: from here on the work is only described, and is executed
    # (and timed) by ``profile()`` below.
    breed_stats = calculate_breed_stats(labelled.lazy())
    ranked = rank_breeds_by_popularity(breed_stats)
    pipeline = get_top_breeds(ranked, n=8)

    print("Executing dog breed analysis pipeline with profiling...")
    result, profile_df = pipeline.profile()

    # Derive per-step duration and share of total time from the raw profile.
    timings = add_profiling_stats(profile_df)

    print(f"\nResult: {result}")
    print("\nTop breed statistics:")
    print(result)
    print("\nProfiling timings:")
    print(timings)
	import datetime as dt
	import random
	from functools import wraps

	import polars as pl
	from narlogs import print_step


	def load_dog_data(n_rows=100_000):
	    """Load sample dog registry data.

	    All columns are computed from the row index, so repeated calls with the
	    same ``n_rows`` return the same table.

	    Args:
	        n_rows: Number of rows to generate. Defaults to 100_000.

	    Returns:
	        A ``pl.DataFrame`` with columns ``dog_id``, ``breed``, ``age``,
	        ``weight``, ``color`` and ``city``, each of length ``n_rows``.
	    """
	    # Retained for its effect on the global RNG; no value below uses it.
	    random.seed(42)

	    breeds = [
	        "Golden Retriever",
	        "German Shepherd",
	        "Labrador",
	        "Bulldog",
	        "Poodle",
	        "Beagle",
	        "Rottweiler",
	        "Husky",
	    ]
	    colors = ["Brown", "Black", "White", "Golden", "Gray", "Mixed"]
	    cities = ["Boston", "NYC", "Chicago", "LA"]

	    return pl.DataFrame(
	        {
	            "dog_id": list(range(1, n_rows + 1)),
	            "breed": [breeds[i % len(breeds)] for i in range(n_rows)],
	            "age": [1 + (i % 15) for i in range(n_rows)],  # 1-15 years
	            "weight": [10 + (i % 90) for i in range(n_rows)],  # 10-99 lbs
	            "color": [colors[i % len(colors)] for i in range(n_rows)],
	            # Indexed per row so the column length always equals n_rows,
	            # including when n_rows is not a multiple of 4.
	            "city": [cities[i % len(cities)] for i in range(n_rows)],
	        }
	    )


	@print_step
	def filter_adult_dogs(df):
	    """Filter to dogs whose ``age`` is 2 or greater."""
	    return df.filter(pl.col("age") >= 2)


	@print_step
	def add_size_category(df):
	    """Add a ``size_category`` column based on ``weight``.

	    "Large" for weight above 70, "Medium" for weight above 40 (checked
	    after the "Large" test), and "Small" for all remaining rows.
	    """
	    return df.with_columns(
	        pl.when(pl.col("weight") > 70)
	        .then(pl.lit("Large"))
	        .when(pl.col("weight") > 40)
	        .then(pl.lit("Medium"))
	        .otherwise(pl.lit("Small"))
	        .alias("size_category")
	    )


	@print_step(shape=False)
	def calculate_breed_stats(df):
	    """Calculate statistics per breed.

	    Returns one row per ``breed`` with the mean ``age`` (``avg_age``), the
	    mean ``weight`` (``avg_weight``), the count of ``dog_id`` values
	    (``dog_count``) and the most frequent ``color``
	    (``most_common_color``). Tied colors resolve to the one that sorts first.
	    """
	    return df.group_by("breed").agg(
	        [
	            pl.col("age").mean().alias("avg_age"),
	            pl.col("weight").mean().alias("avg_weight"),
	            pl.col("dog_id").count().alias("dog_count"),
	            # Sort the (possibly multiple) modes so ties pick a stable value.
	            pl.col("color").mode().sort().first().alias("most_common_color"),
	        ]
	    )


	@print_step(shape=False)
	def rank_breeds_by_popularity(df):
	    """Rank rows by ``dog_count`` and sort by the resulting rank.

	    Adds ``popularity_rank`` using a descending "ordinal" rank of
	    ``dog_count``, then returns the rows ordered by that column.
	    """
	    return df.with_columns(
	        pl.col("dog_count")
	        .rank(method="ordinal", descending=True)
	        .alias("popularity_rank")
	    ).sort("popularity_rank")


	@print_step(shape=False)
	def get_top_breeds(df, n=5):
	    """Return the first ``n`` rows of ``df`` (default 5).

	    The rows are the top breeds only if ``df`` is already sorted by
	    popularity; no sorting happens here.
	    """
	    return df.head(n)


	def add_profiling_stats(profile_df):
	    """Add ``duration`` and ``percent_total`` columns to a profile table.

	    ``duration`` is ``end - start``; ``percent_total`` is each duration
	    divided by the sum of all durations, multiplied by 100.
	    """
	    return profile_df.with_columns(
	        [(pl.col("end") - pl.col("start")).alias("duration")]
	    ).with_columns(
	        # Separate call because it reads the ``duration`` column added above.
	        [(pl.col("duration") / pl.col("duration").sum() * 100).alias("percent_total")]
	    )


	if __name__ == "__main__":
	    # Build a dog breed analysis pipeline
	    pipeline = (
	        load_dog_data()
	        .pipe(filter_adult_dogs)
	        .pipe(add_size_category)
	        .lazy()  # Remaining steps are lazy and run when profile() is called
	        .pipe(calculate_breed_stats)
	        .pipe(rank_breeds_by_popularity)
	        .pipe(get_top_breeds, n=8)
	    )

	    print("Executing dog breed analysis pipeline with profiling...")
	    result, profile_df = pipeline.profile()

	    # Add timing analysis
	    timings = add_profiling_stats(profile_df)

	    print(f"\nResult: {result}")
	    print("\nTop breed statistics:")
	    print(result)
	    print("\nProfiling timings:")
	    print(timings)