Created
May 19, 2022 21:59
-
-
Save wolfiex/8255e8295cc803e94497aada4a93def2 to your computer and use it in GitHub Desktop.
Simulate Dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
A script to generate a synthetic dataset replacing the original one.
Author: danielellisresearch.com
'''
# !pip install synthia pandas | |
import pandas as pd | |
import numpy as np | |
import synthia as syn | |
import argparse | |
import warnings, pickle, sys | |
warnings.filterwarnings('ignore') | |
# Command-line interface: one or more CSV files, plus a switch that selects
# the copula used to model the numeric columns.
parser = argparse.ArgumentParser(
    description='Generate a synthetic replacement for each given CSV file.'
)
parser.add_argument(
    'files',
    type=argparse.FileType('r'),
    nargs='+',
    help='CSV file(s) to synthesise',
)
# The historical spelling ``--independant`` is kept so existing invocations
# still work; ``--independent`` is accepted as an alias for the same option.
parser.add_argument(
    '-i', '--independant', '--independent',
    dest='independant',
    action='store_false',
    help='model every numeric column independently instead of with a Gaussian copula',
)
args = parser.parse_args()
# ``store_false`` means the attribute is True unless the flag is passed, so it
# reads as "columns are coupled" despite the option's name.
coupled = args.independant
def _synthetic_path(fname):
    """Return the output path for *fname* with ``_synthetic`` added to the file name."""
    import os

    directory, base = os.path.split(os.fspath(fname))
    stem, marker, tail = base.rpartition('.csv')
    if marker:
        # Keep whatever follows ``.csv`` (for example ``.gz``) untouched.
        base = f'{stem}_synthetic.csv{tail}'
    else:
        # No ``.csv`` in the file name: still derive a distinct name so the
        # input file is never overwritten by its synthetic copy.
        root, ext = os.path.splitext(base)
        base = f'{root}_synthetic{ext}'
    return os.path.join(directory, base)


def get_synthetic_data(fname):
    """Write a synthetic stand-in for the CSV file *fname*.

    The numeric columns are fitted with a synthia copula generator and
    replaced by newly sampled values; all other columns are copied over
    unchanged. The result is saved beside the input, with ``_synthetic``
    inserted before the ``.csv`` extension.

    The module-level flag ``coupled`` selects the model: a Gaussian copula
    with a 100-quantile parameterizer when true, an independence copula
    otherwise.

    Args:
        fname: Path of the CSV file to read; its first column is used as
            the index.
    """
    # Load the original data and remember each column's dtype
    data = pd.read_csv(fname, index_col=0)
    dtypes = data.dtypes
    # Names of the columns with numeric types
    numeric = data.columns[dtypes.apply(pd.api.types.is_numeric_dtype)]
    # Numeric subset, with missing values replaced by 0 before fitting
    subset = data.loc[:, numeric].replace(np.nan, 0)

    # Create the generator, then define copula and parameterizer
    generator = syn.CopulaDataGenerator()
    if coupled:
        parameterizer = syn.QuantileParameterizer(n_quantiles=100)
        generator.fit(subset, copula=syn.GaussianCopula(), parameterize_by=parameterizer)
    else:
        generator.fit(subset, copula=syn.IndependenceCopula())
    print(f'Storage size: {len(pickle.dumps(generator))} bytes')

    # Draw as many samples as the original data has rows
    samples = generator.generate(n_samples=len(subset), uniformization_ratio=0, stretch_factor=1)
    synthetic = pd.DataFrame(samples, columns=subset.columns, index=subset.index)

    # Work on an explicit copy so the loaded frame is never modified.
    update = data.copy()
    # Assign whole columns instead of writing through ``.loc``: the float
    # samples would otherwise have to be squeezed into the original (possibly
    # integer) columns in place, which recent pandas versions warn about or
    # reject. The original dtypes are restored on the next line.
    update[numeric] = synthetic[numeric]
    update = update.astype(dtypes)

    # Save the new dataframe under a name distinct from the input
    update.to_csv(_synthetic_path(fname))
if __name__ == '__main__':
    # Script entry point: synthesise every file named on the command line.
    for handle in args.files:
        path = handle.name
        # argparse.FileType already opened the file, but only its name is
        # needed; release the handle instead of leaving it open for the
        # rest of the run. Standard input ('-') is left alone.
        if handle is not sys.stdin:
            handle.close()
        print(f'Processing {path}')
        get_synthetic_data(path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment