Created
August 25, 2023 09:59
-
-
Save Cuahchic/6703428e0d2e9b1153c988a4b65a88bf to your computer and use it in GitHub Desktop.
Using SQL with Pandas DataFrames
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% Imports | |
import numpy as np | |
import pandas as pd | |
import sqlite3 | |
# %% Generate some data | |
data_people1 = { | |
'idx': [1, 2, 3, 4], | |
'age': [6, 39, 69, 41], | |
'name': ['John', 'Sue', 'James', 'Agatha'] | |
} | |
data_people2 = { | |
'idx': [5, 6], | |
'age': [73, 18], | |
'name': ['Dorothy', 'Peter'] | |
} | |
data_people_heights = { | |
'name': ['John', 'Sue', 'James', 'Agatha'], | |
'height_m': [0.97, 1.49, 1.58, 1.71] | |
} | |
data_people_status = { | |
'age_min': [0, 18, 65], | |
'age_max': [18, 65, 100], | |
'status': ['child', 'working age', 'retired'] | |
} | |
# Create DataFrames | |
df_people1 = pd.DataFrame(data_people1) | |
df_people2 = pd.DataFrame(data_people2) | |
df_people_heights = pd.DataFrame(data_people_heights) | |
df_people_status = pd.DataFrame(data_people_status) | |
# %% Step 1: Add heights to the people DataFrame | |
# Python way | |
df_people_python = df_people1.merge(right=df_people_heights, how="left", left_on="name", right_on="name") # Obviously you can modify the "how" to be inner, left, right etc for different types of joins | |
# SQL way | |
with sqlite3.connect(':memory:') as conn: # Create context manager for connection | |
# Write tables into database | |
df_people1.to_sql(name='people', con=conn, index=False) # See docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html | |
df_people_heights.to_sql(name='people_heights', con=conn, index=False) | |
# Get cursor for doing stuff | |
cur = conn.cursor() | |
# Index to speed up join | |
cur.execute('CREATE INDEX index_people__name ON people(name ASC);') # Not necessary for such a small table in this example but can improve performance on large DataFrames/tables | |
cur.close() | |
# Get data back to Python, docs: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html | |
# NOTE: In the docs the parse_dates parameter can be used to convert timestamps to pd.Timestamp data types | |
df_people_sql = pd.read_sql_query( | |
sql='SELECT p.*, ph.height_m FROM people as p LEFT JOIN people_heights as ph ON p.name = ph.name;', | |
con=conn | |
) | |
# Check they are the same | |
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same" | |
# %% Step 2: Add some new rows to the people DataFrame | |
# Python way | |
df_people_python = df_people1.append(other=df_people2, ignore_index=True) | |
# SQL way | |
with sqlite3.connect(':memory:') as conn: # Create context manager for connection | |
# Write tables into database | |
df_people1.to_sql(name='people1', con=conn, index=False) | |
df_people2.to_sql(name='people2', con=conn, index=False) | |
# Get data back to Python | |
df_people_sql = pd.read_sql_query( | |
sql='SELECT * FROM people1 UNION ALL SELECT * FROM people2;', | |
con=conn | |
) | |
# Check they are the same | |
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same" | |
# %% Step 3: Add status based on between style logic which is more awkward in Python | |
# Python way, from: https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas | |
i, j = np.where((df_people1.age.values[:, None] >= df_people_status.age_min.values) & (df_people1.age.values[:, None] < df_people_status.age_max.values)) | |
df_people_python = pd.concat([ | |
df_people1.loc[i, :].reset_index(drop=True), | |
df_people_status.loc[j, ["status"]].reset_index(drop=True) | |
], axis=1) | |
# SQL way | |
with sqlite3.connect(':memory:') as conn: # Create context manager for connection | |
# Write tables into database | |
df_people1.to_sql(name='people', con=conn, index=False) | |
df_people_status.to_sql(name='people_status', con=conn, index=False) | |
# Create query to join tables | |
sql = ''' | |
SELECT p.*, ps.status | |
FROM people p | |
LEFT JOIN | |
people_status ps | |
ON p.age BETWEEN ps.age_min AND ps.age_max | |
''' | |
# Get data back to Python | |
df_people_sql = pd.read_sql_query(sql=sql, con=conn) | |
# Check they are the same | |
assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment