Commit cb431c22 authored by Kevin Kunzmann's avatar Kevin Kunzmann

scaling up to full vcf

parent 93f95db1
......@@ -10,9 +10,9 @@ localrules: impute, clean, download_container, generate_samples_file
rule impute:
input:
"container.sif",
expand("{output_dir}/gene_expression/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
output:
expand("{output_dir}/gene_expressions_combined.rds", output_dir = config['output_dir'])
expand("{output_dir}/imputed-gene-expressionss_combined.rds", output_dir = config['output_dir'])
singularity:
"container.sif"
shell:
......@@ -29,6 +29,22 @@ rule clean:
rm -rf nohup.out
"""
# download vcf files using gsutil (needs to be set up and configured!
rule download_imputed_genotype_chromosome:
output:
"output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
shell:
"""
mkdir -p output/imputed-genotypes
gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_190829/Imputed_data/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz \
output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz
"""
rule download_imputed_genotypes:
input:
expand("output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz",
i = list(map(str, range(1, 23))) + ['X']
)
# download the required singularity container from zenodo.org
rule download_container:
output:
......@@ -42,7 +58,7 @@ rule download_container:
rule vcf_to_dosages:
input:
"container.sif",
config['input_vcf_gz_file']
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
output:
"{output_dir}/dosages/chr{i}.dosage.txt.gz"
singularity:
......@@ -51,11 +67,8 @@ rule vcf_to_dosages:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
echo "extracting chromosome {wildcards.i} ..."
bcftools view -r chr{wildcards.i} {config[input_vcf_gz_file]} > $prefix/chr{wildcards.i}_.vcf
echo "computing MAFs ..."
bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
rm $prefix/chr{wildcards.i}_.vcf
echo "extracting and computing MAFs ..."
bcftools +fill-tags {inputs.vcf_gz_file} > $prefix/chr{wildcards.i}.vcf
echo 'querying dosages ...'
bcftools query -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
echo 'compressing ...'
......@@ -64,11 +77,13 @@ rule vcf_to_dosages:
printf "done.\n\r\n\r"
"""
bcftools query -e 'MAF[0]<0.01 | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
# extract sample file for PrediXcan
rule generate_samples_file:
input:
"container.sif",
config['input_vcf_gz_file']
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr1.vcf.gz"
output:
"{output_dir}/dosages/samples.txt"
singularity:
......@@ -79,32 +94,32 @@ rule generate_samples_file:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
bcftools query -l {config[input_vcf_gz_file]} >> $prefix/samples_.txt
bcftools query -l {inputs.vcf_gz_file} >> $prefix/samples_.txt
# family ID = individual ID
awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
rm $prefix/samples_.txt
"""
# run PrediXcan to impute gene expression for individual tissue type
rule impute_gene_expression:
rule impute_gene_expressions:
input:
"container.sif",
samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir'])
output:
"{output_dir}/gene_expression/{region}.expression.txt"
"{output_dir}/imputed-gene-expressions/{region}.expression.txt"
singularity:
"container.sif"
shell:
"""
mkdir -p {wildcards.output_dir}/gene_expression
mkdir -p {wildcards.output_dir}/imputed-gene-expressions
predixcan \
--predict \
--dosages {config[output_dir]}/dosages \
--dosages_prefix chr \
--samples samples.txt \
--weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
--output_prefix {wildcards.output_dir}/gene_expression/{wildcards.region}
mv {wildcards.output_dir}/gene_expression/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/gene_expression/{wildcards.region}.expression.txt
--output_prefix {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}
mv {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
"""
# input genotype file
input_vcf_gz_file: 'test.vcf.gz'
# where to put things
output_dir: 'output'
......
......@@ -15,7 +15,7 @@ col_types <- cols(
config$brain_regions %>%
purrr::map(
function(x) {
sprintf("%s/gene_expression/%s.expression.txt", config$output_dir, x) %>%
sprintf("%s/imputed-gene-expressions/%s.expression.txt", config$output_dir, x) %>%
read_tsv(col_types = col_types, progress = FALSE) %>%
tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID) %>%
mutate(tissue = x) %>%
......@@ -26,4 +26,4 @@ config$brain_regions %>%
# make missing genes explicit
tidyr::spread(ensembl_gene_id, expression, fill = NA_real_) %>%
tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID, -tissue) %>%
write_rds(sprintf('%s/gene_expressions_combined.rds', config$output_dir), compress = 'gz')
write_rds(sprintf('%s/imputed-gene-expressionss_combined.rds', config$output_dir), compress = 'gz')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment