1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# edit configuration here (see cluster.json for running on slurm scheduler)
configfile: "config.yml"
localrules: impute, clean, download_container, generate_samples_file
# this rule runs the entire pipeline and creates a R data set file (rds)
# containing all imputed expression values
rule impute:
input:
"container.sif",
expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
output:
expand("{output_dir}/imputed-gene-expressionss_combined.rds", output_dir = config['output_dir'])
singularity:
"container.sif"
shell:
"""
Rscript scripts/combine_expression_data.R
"""
# delete output and logs (if run on slurm cluster)
rule clean:
shell:
"""
rm -rf {config[output_dir]}
rm -rf logs
rm -rf nohup.out
"""
# download vcf files using gsutil (needs to be set up and configured!
rule download_imputed_genotype_chromosome:
output:
"output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
singularity:
"container.sif"
shell:
"""
mkdir -p output/imputed-genotypes
gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_190829/Imputed_data/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz \
output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz
"""
rule download_imputed_genotypes:
input:
expand("output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz",
i = list(map(str, range(1, 23))) + ['X']
)
# download the required singularity container from zenodo.org
rule download_container:
output:
"container.sif"
shell:
"""
bash scripts/download_container.sh
"""
# extract dosages in custom PrediXcan format from vcf file
rule vcf_to_dosages:
input:
"container.sif",
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
output:
"{output_dir}/dosages/chr{i}.dosage.txt.gz"
singularity:
"container.sif"
shell:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
echo "extracting and computing MAFs ..."
bcftools +fill-tags {inputs.vcf_gz_file} > $prefix/chr{wildcards.i}.vcf
echo 'querying dosages ...'
bcftools query -e 'MAF[0]>{config[min_MAF]} | INFO>{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
echo 'compressing ...'
gzip $prefix/chr{wildcards.i}.dosage.txt
rm $prefix/chr{wildcards.i}.vcf
printf "done.\n\r\n\r"
"""
# extract sample file for PrediXcan
rule generate_samples_file:
input:
"container.sif",
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr1.vcf.gz"
output:
"{output_dir}/dosages/samples.txt"
singularity:
"container.sif"
params:
format = "\'{ printf(\"%s %s\\n\", $1, $1); }\'"
shell:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
bcftools query -l {inputs.vcf_gz_file} >> $prefix/samples_.txt
# family ID = individual ID
awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
rm $prefix/samples_.txt
"""
# run PrediXcan to impute gene expression for individual tissue type
rule impute_gene_expressions:
input:
"container.sif",
samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir'])
output:
"{output_dir}/imputed-gene-expressions/{region}.expression.txt"
singularity:
"container.sif"
shell:
"""
mkdir -p {wildcards.output_dir}/imputed-gene-expressions
predixcan \
--predict \
--dosages {config[output_dir]}/dosages \
--dosages_prefix chr \
--samples samples.txt \
--weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
--output_prefix {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}
mv {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
"""