Skip to content

Instantly share code, notes, and snippets.

@James-Rocker
Created September 17, 2024 12:44
Show Gist options
  • Save James-Rocker/560a52240274c1cb80e1a4dd87bfcf49 to your computer and use it in GitHub Desktop.
Save James-Rocker/560a52240274c1cb80e1a4dd87bfcf49 to your computer and use it in GitHub Desktop.
pydantic vs pandera performance while working with dataframes
import pandas as pd
import time
from pydantic import BaseModel, ValidationError, Field
import pandera as pa
from pandera import Column, DataFrameSchema
# Generate a synthetic DataFrame
def generate_data(n: int) -> pd.DataFrame:
    """Build a synthetic DataFrame with ``n`` rows for the validation benchmark.

    Columns produced, for ``i`` running from 1 to ``n`` inclusive:
        id:     ``i``
        name:   ``"name_<i>"``
        age:    ``i % 100``
        salary: ``i * 1000.0``

    Args:
        n: Number of rows to generate. A value of 0 or less yields an
            empty frame that still has the four columns.

    Returns:
        A DataFrame with columns ``id``, ``name``, ``age`` and ``salary``.
    """
    ids = range(1, n + 1)  # built once and reused for every column
    frame = pd.DataFrame(
        {
            "id": list(ids),
            "name": [f"name_{i}" for i in ids],
            "age": [i % 100 for i in ids],
            "salary": [i * 1000.0 for i in ids],
        }
    )
    # Pin the numeric dtypes explicitly: with n <= 0 the empty lists would be
    # inferred as object dtype, so the empty frame would not have the same
    # column types as a populated one.
    return frame.astype({"id": "int64", "age": "int64", "salary": "float64"})
# Pydantic model for validating a row
class RowModel(BaseModel):
    """Pydantic model describing one row of the benchmark DataFrame.

    All four fields are required. The numeric bounds are the same ones the
    pandera ``schema`` in this file applies column-wise.
    """

    # Row identifier; must be >= 1.
    id: int = Field(..., ge=1)
    # Free-form label; only the type is checked.
    name: str
    # Must lie in the closed interval [0, 100].
    age: int = Field(..., ge=0, le=100)
    # Must be non-negative.
    salary: float = Field(..., ge=0.0)
# Pandera schema for validating the entire DataFrame
# Column-wise rules; the bounds match the per-row RowModel fields.
schema = DataFrameSchema(
    columns={
        "id": Column(int, checks=[pa.Check.ge(1)]),
        "name": Column(str),
        "age": Column(int, checks=[pa.Check.ge(0), pa.Check.le(100)]),
        "salary": Column(float, checks=[pa.Check.ge(0.0)]),
    }
)
# Validate rows using Pydantic
def validate_with_pydantic(df: pd.DataFrame) -> bool:
    """Validate ``df`` one row at a time by constructing a ``RowModel`` per row.

    Every row is checked even after a failure; each ``ValidationError`` is
    printed as it is encountered.

    Args:
        df: Frame whose records are passed to ``RowModel`` as keyword
            arguments.

    Returns:
        ``True`` if no row raised ``ValidationError``, otherwise ``False``.
    """
    failure_count = 0
    for record in df.to_dict(orient="records"):
        try:
            RowModel(**record)
        except ValidationError as exc:
            failure_count += 1
            print(f"Pydantic Validation Error: {exc}")
    return failure_count == 0
# Validate entire DataFrame using Pandera
def validate_with_pandera(df: pd.DataFrame) -> bool:
    """Validate the whole of ``df`` against the module-level ``schema``.

    Args:
        df: Frame to validate.

    Returns:
        ``True`` if ``schema.validate`` completes without raising, ``False``
        if it raises ``pa.errors.SchemaError`` (the error is printed).
    """
    try:
        schema.validate(df)
    except pa.errors.SchemaError as exc:
        print(f"Pandera Validation Error: {exc}")
        return False
    return True
# Benchmark function
def benchmark(n: int) -> None:
    """Time pydantic and pandera validation on the same ``n``-row frame.

    The frame is generated once, then validated first with
    ``validate_with_pydantic`` and afterwards with ``validate_with_pandera``.
    Each validator runs a single time, and the outcome and elapsed time of
    both are printed.

    Args:
        n: Number of rows in the synthetic frame.
    """
    df = generate_data(n)

    # time.perf_counter() is used instead of time.time(): it is monotonic and
    # is the highest-resolution clock available for measuring short
    # durations, whereas the wall clock can be adjusted mid-measurement.

    # Benchmark Pydantic validation
    start_time = time.perf_counter()
    pydantic_valid = validate_with_pydantic(df)
    pydantic_time = time.perf_counter() - start_time

    # Benchmark Pandera validation
    start_time = time.perf_counter()
    pandera_valid = validate_with_pandera(df)
    pandera_time = time.perf_counter() - start_time

    # Results
    print(f"Pydantic validation passed: {pydantic_valid}, Time taken: {pydantic_time:.6f} seconds")
    print(f"Pandera validation passed: {pandera_valid}, Time taken: {pandera_time:.6f} seconds")
# Run the benchmark with 10000 rows
if __name__ == "__main__":
    # Executed only when the file is run as a script, not on import.
    benchmark(10_000)
@James-Rocker
Copy link
Author

Pydantic validation passed: True, Time taken: 0.021296 seconds
Pandera validation passed: True, Time taken: 0.012582 seconds

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment