Created
June 8, 2022 21:39
-
-
Save ckandoth/92e74fc299d9c9c8e141eb8a74f08c02 to your computer and use it in GitHub Desktop.
Smaller gnomAD 3.1.2 VCF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a single, smaller gnomAD 3.1.2 sites VCF from the per-chromosome
# release files by dropping most INFO fields.
# Requires gsutil, bcftools and tabix on PATH. The commands are meant to be run
# in order from the directory that should hold the "gnomad" folder.

# Fetch the WGS gnomAD 3.1.2 per-chrom VCFs and their tabix indexes
# (the large size is mostly due to INFO fields).
# -p keeps a re-run from failing when the folder already exists.
mkdir -p gnomad
# The gs:// wildcards are quoted so gsutil, not the local shell, expands them.
gsutil -m cp 'gs://gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr*.vcf.bgz' gnomad
gsutil -m cp 'gs://gcp-public-data--gnomad/release/3.1.2/vcf/genomes/gnomad.genomes.v3.1.2.sites.chr*.vcf.bgz.tbi' gnomad

# Shortlist INFO fields we want to keep when merging these into a single VCF of
# reduced file size. This step is interactive: it pages through the INFO header
# lines of one chromosome, hiding the subset-specific and VEP fields.
bcftools view -h gnomad/gnomad.genomes.v3.1.2.sites.chr21.vcf.bgz \
  | grep '^##INFO' \
  | cut -f3- -d= \
  | grep -Ev "controls|non_cancer|non_neuro|non_topmed|non_v2|vep" \
  | sort \
  | less -S
# These annotation-score fields are among those kept by the merge step below:
#   cadd_phred
#   cadd_raw_score
#   primate_ai_score
#   revel_score
#   splice_ai_consequence
#   splice_ai_max_ds

# Make a merged 3.1.2 VCF listing only the shortlisted INFO fields.
# The leading ^ in --remove inverts the list: every INFO field except the ones
# named is removed. The shell glob hands the per-chrom files to bcftools concat
# directly instead of going through ls | xargs.
bcftools concat --threads 16 -n gnomad/gnomad.genomes.v3.1.2.sites.chr*.vcf.bgz \
  | bcftools annotate --threads 16 \
      --remove '^INFO/AC,INFO/AN,INFO/AF,INFO/AC_raw,INFO/nhomalt,INFO/popmax,INFO/AC_popmax,INFO/nhomalt_popmax,INFO/MQ,INFO/cadd_phred,INFO/cadd_raw_score,INFO/primate_ai_score,INFO/revel_score,INFO/splice_ai_consequence,INFO/splice_ai_max_ds' \
      --output-type z \
      --output gnomad/gnomad.genomes.v3.1.2.sites.vcf.bgz
# Index the merged file so it can be queried by region.
tabix -p vcf gnomad/gnomad.genomes.v3.1.2.sites.vcf.bgz
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment