Cuahchic · August 25, 2023 09:59
diff --git a/using_sql_with_pandas.py b/using_sql_with_pandas.py
 # %% Imports
 import numpy as np
 import pandas as pd
 import sqlite3

 # %% Generate some data
 data_people1 = {
        'idx': [1, 2, 3, 4],
        'age': [6, 39, 69, 41],
        'name': ['John', 'Sue', 'James', 'Agatha']
    }

 data_people2 = {
        'idx': [5, 6],
        'age': [73, 18],
        'name': ['Dorothy', 'Peter']
    }

 data_people_heights = {
        'name': ['John', 'Sue', 'James', 'Agatha'],
        'height_m': [0.97, 1.49, 1.58, 1.71]
    }

 data_people_status = {
        'age_min': [0, 18, 65],
        'age_max': [18, 65, 100],
        'status': ['child', 'working age', 'retired']
    }

 # Create DataFrames
 df_people1 = pd.DataFrame(data_people1)
 df_people2 = pd.DataFrame(data_people2)
 df_people_heights = pd.DataFrame(data_people_heights)
 df_people_status = pd.DataFrame(data_people_status)


 # %% Step 1: Add heights to the people DataFrame
 # Python way
 df_people_python = df_people1.merge(right=df_people_heights, how="left", left_on="name", right_on="name") # Obviously you can modify the "how" to be inner, left, right etc for different types of joins

 # SQL way
 with sqlite3.connect(':memory:') as conn: # Create context manager for connection    
    # Write tables into database
    df_people1.to_sql(name='people', con=conn, index=False) # See docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
    df_people_heights.to_sql(name='people_heights', con=conn, index=False)
    
    # Get cursor for doing stuff
    cur = conn.cursor()
    
    # Index to speed up join
    cur.execute('CREATE INDEX index_people__name ON people(name ASC);') # Not necessary for such a small table in this example but can improve performance on large DataFrames/tables
    cur.close()
    
    # Get data back to Python, docs: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html
    # NOTE: In the docs the parse_dates parameter can be used to convert timestamps to pd.Timestamp data types
    df_people_sql = pd.read_sql_query(
            sql='SELECT p.*, ph.height_m FROM people as p LEFT JOIN people_heights as ph ON p.name = ph.name;', 
            con=conn
        )

 # Check they are the same
 assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

 # %% Step 2: Add some new rows to the people DataFrame
 # Python way
 df_people_python = df_people1.append(other=df_people2, ignore_index=True)

 # SQL way
 with sqlite3.connect(':memory:') as conn: # Create context manager for connection    
    # Write tables into database
    df_people1.to_sql(name='people1', con=conn, index=False)
    df_people2.to_sql(name='people2', con=conn, index=False)
    
    # Get data back to Python
    df_people_sql = pd.read_sql_query(
            sql='SELECT * FROM people1 UNION ALL SELECT * FROM people2;', 
            con=conn
        )

 # Check they are the same
 assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

 # %% Step 3: Add status based on between style logic which is more awkward in Python
 # Python way, from: https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas
 i, j = np.where((df_people1.age.values[:, None] >= df_people_status.age_min.values) & (df_people1.age.values[:, None] < df_people_status.age_max.values))

 df_people_python = pd.concat([
        df_people1.loc[i, :].reset_index(drop=True),
        df_people_status.loc[j, ["status"]].reset_index(drop=True)
    ], axis=1)

 # SQL way
 with sqlite3.connect(':memory:') as conn: # Create context manager for connection    
    # Write tables into database
    df_people1.to_sql(name='people', con=conn, index=False)
    df_people_status.to_sql(name='people_status', con=conn, index=False)
    
    # Create query to join tables
    sql = '''
        SELECT  p.*, ps.status
        FROM    people p
                LEFT JOIN
                people_status ps
                ON p.age BETWEEN ps.age_min AND ps.age_max
        '''
        
    # Get data back to Python
    df_people_sql = pd.read_sql_query(sql=sql, con=conn)
    
 # Check they are the same
 assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"
	# %% Imports
	import numpy as np
	import pandas as pd
	import sqlite3

	# %% Generate some data
	data_people1 = {
	'idx': [1, 2, 3, 4],
	'age': [6, 39, 69, 41],
	'name': ['John', 'Sue', 'James', 'Agatha']
	}

	data_people2 = {
	'idx': [5, 6],
	'age': [73, 18],
	'name': ['Dorothy', 'Peter']
	}

	data_people_heights = {
	'name': ['John', 'Sue', 'James', 'Agatha'],
	'height_m': [0.97, 1.49, 1.58, 1.71]
	}

	data_people_status = {
	'age_min': [0, 18, 65],
	'age_max': [18, 65, 100],
	'status': ['child', 'working age', 'retired']
	}

	# Create DataFrames
	df_people1 = pd.DataFrame(data_people1)
	df_people2 = pd.DataFrame(data_people2)
	df_people_heights = pd.DataFrame(data_people_heights)
	df_people_status = pd.DataFrame(data_people_status)


	# %% Step 1: Add heights to the people DataFrame
	# Python way
	df_people_python = df_people1.merge(right=df_people_heights, how="left", left_on="name", right_on="name") # Obviously you can modify the "how" to be inner, left, right etc for different types of joins

	# SQL way
	with sqlite3.connect(':memory:') as conn: # Create context manager for connection
	# Write tables into database
	df_people1.to_sql(name='people', con=conn, index=False) # See docs here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
	df_people_heights.to_sql(name='people_heights', con=conn, index=False)

	# Get cursor for doing stuff
	cur = conn.cursor()

	# Index to speed up join
	cur.execute('CREATE INDEX index_people__name ON people(name ASC);') # Not necessary for such a small table in this example but can improve performance on large DataFrames/tables
	cur.close()

	# Get data back to Python, docs: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html
	# NOTE: In the docs the parse_dates parameter can be used to convert timestamps to pd.Timestamp data types
	df_people_sql = pd.read_sql_query(
	sql='SELECT p.*, ph.height_m FROM people as p LEFT JOIN people_heights as ph ON p.name = ph.name;',
	con=conn
	)

	# Check they are the same
	assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

	# %% Step 2: Add some new rows to the people DataFrame
	# Python way
	df_people_python = df_people1.append(other=df_people2, ignore_index=True)

	# SQL way
	with sqlite3.connect(':memory:') as conn: # Create context manager for connection
	# Write tables into database
	df_people1.to_sql(name='people1', con=conn, index=False)
	df_people2.to_sql(name='people2', con=conn, index=False)

	# Get data back to Python
	df_people_sql = pd.read_sql_query(
	sql='SELECT * FROM people1 UNION ALL SELECT * FROM people2;',
	con=conn
	)

	# Check they are the same
	assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"

	# %% Step 3: Add status based on between style logic which is more awkward in Python
	# Python way, from: https://stackoverflow.com/questions/44367672/best-way-to-join-merge-by-range-in-pandas
	i, j = np.where((df_people1.age.values[:, None] >= df_people_status.age_min.values) & (df_people1.age.values[:, None] < df_people_status.age_max.values))

	df_people_python = pd.concat([
	df_people1.loc[i, :].reset_index(drop=True),
	df_people_status.loc[j, ["status"]].reset_index(drop=True)
	], axis=1)

	# SQL way
	with sqlite3.connect(':memory:') as conn: # Create context manager for connection
	# Write tables into database
	df_people1.to_sql(name='people', con=conn, index=False)
	df_people_status.to_sql(name='people_status', con=conn, index=False)

	# Create query to join tables
	sql = '''
	SELECT p.*, ps.status
	FROM people p
	LEFT JOIN
	people_status ps
	ON p.age BETWEEN ps.age_min AND ps.age_max
	'''

	# Get data back to Python
	df_people_sql = pd.read_sql_query(sql=sql, con=conn)

	# Check they are the same
	assert df_people_python.compare(other=df_people_sql).empty, "The DataFrames are not the same"