Created
December 8, 2023 07:13
-
-
Save leipzig/639442de640427e883db722a9f8273d0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tiledb | |
import tiledbvcf | |
import numpy as np | |
cloud_uri = "tiledb://TileDB-Inc/vcf-1kg-dragen-v376" | |
# This is the config I need to pass | |
cloud_config = tiledb.Config() | |
cloud_cfg = tiledbvcf.ReadConfig(tiledb_config=cloud_config) | |
# Open the VCF dataset in read mode passing the config | |
ds = tiledbvcf.Dataset(uri=cloud_uri, mode="r", cfg=cloud_cfg) | |
seenit = {} | |
newkeys_list = [] | |
samples = ds.samples() | |
# 500 randomly sampled samples no replacment | |
random_samples = np.random.choice(samples, 500, replace=False) | |
for sample in random_samples: | |
# read and return Pandas dataframe | |
df = ds.read( | |
samples=[sample], | |
regions=["chr21:1-50000000"], | |
attrs=["contig", "pos_start", "alleles", "fmt_GT"], | |
) | |
# add a new column with chr:pos:ref:alt | |
df["chrposrefalt"] = ( | |
df["contig"].astype(str) | |
+ ":" | |
+ df["pos_start"].astype(str) | |
+ ":" | |
+ df["alleles"].str[0] | |
+ ":" | |
+ df["alleles"].str[1] | |
) | |
# count how many new keys we have | |
newkeys = len(df[~df["chrposrefalt"].isin(seenit)]) | |
# add the new keys to the seenit dict | |
seenit.update(dict.fromkeys(df["chrposrefalt"].values)) | |
# append newkeys to the list | |
newkeys_list.append(newkeys) | |
print(newkeys_list) | |
#do this in ggplot | |
#df<-data.frame(x=1:500,y=listofnumbers) | |
#plot these on the y axis and their corresponding x axis values on the x axis, y axis in log scale, add a smooth line | |
#label the x axis sample and the y axis number of novel variants | |
#ggplot(df,aes(x=x,y=y))+geom_point()+scale_y_log10()+geom_smooth(method="loess")+xlab("Sample")+ylab("Number of novel variants") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment