1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# define the singularity (https://sylabs.io/) container image to use
# see also https://cloud.sylabs.io/library/kkmann/default/center-grex-imputation#
singularity: "library://kkmann/default/center-grex-imputation:sha256.321833b357b20feda51877dcef6e9bac2dfa706e9ed6c32da39baaa8ead9e1f7"
# modify this file to define global parameters
# (e.g. set of tissues to impute genetically regulated expression levels for)
configfile: "config/parameters.yml"
# these rules are computationally less expensive and run locally, even in
# cluster execution mode
localrules: all, container, download_genome, download_weights, samples_file,
model_variant_lookup_table, mapping_report
# this rule is exected by default, i.e.
# > snakemake
# is equivalent to
# > snakemake all
# the target imputes gene expression for all tissues listed in the config file
# and the report on mapping mopdel variants to available dosage information.
rule all:
input:
expand(
"output/imputed-grex/{tissue}.igrex.txt.gz",
tissue=config["tissues"]
),
"output/mapping-report.html"
# dummy rule doing nothing; can be used to download the required container;
# this might be helpful to set up authentification with the data servers
# (requires an interactive step); see readme.md for details
rule container:
shell:
"""
echo done.
"""
# download vcf files using gsutil (needs to be set up and configured!)
rule download_genome:
output:
"output/imputed-genotypes/chromosome-{chr}.vcf.gz",
"output/imputed-genotypes/chromosome-{chr}.vcf.gz.tbi"
shell:
"""
set -ex
mkdir -p output/imputed-genotypes
FILEDIR=gs://fimm-horizon-outgoing-data/genetic-data/imputed-genomes/{config[data_date]}/{config[cohort]}
gsutil cp $FILEDIR/chromosome-{wildcards.chr}.vcf.gz \
output/imputed-genotypes/chromosome-{wildcards.chr}.vcf.gz
gsutil cp $FILEDIR/chromosome-{wildcards.chr}.vcf.gz.tbi \
output/imputed-genotypes/chromosome-{wildcards.chr}.vcf.gz.tbi
"""
# download PrediXcan EQTL weights
rule download_weights:
output:
directory("output/weights")
shell:
"""
set -ex
mkdir -p logs
mkdir -p output/weights
# wget https://zenodo.org/record/3518299/files/mashr_eqtl.tar?download=1 \
-O output/weights/eqtl.tar
wget https://zenodo.org/record/3519321/files/elastic_net_eqtl.tar?download=1 \
-O output/weights/eqtl.tar
tar -C output/weights -xvf output/weights/eqtl.tar
rm output/weights/eqtl.tar
mv output/weights/elastic_net_models/* output/weights/
rm -rf output/weights/elastic_net_models
"""
# generate a table with all model variants
rule model_variant_lookup_table:
input:
rules.download_weights.output
output:
"output/model-variants-lookup-table.csv.gz"
shell:
"""
set -ex
Rscript scripts/get-model-variants.R
"""
# generate the samples file in the format required by PrediXcan
rule samples_file:
input:
"output/imputed-genotypes/chromosome-1.vcf.gz"
output:
"output/dosages/samples.txt"
shell:
"""
set -ex
mkdir -p output/dosages
# query sample ids from first chromosome
bcftools query -l {input} > output/samples.txt
# predixcan allows family and sample id, duplicate column and overwrite
Rscript -e "library(dplyr); readr::read_tsv('output/samples.txt', col_names = 'SID', col_types = 'c') %>% dplyr::mutate(FID = SID) %>% readr::write_tsv(path = 'output/dosages/samples.txt', col_names = FALSE)"
"""
# extract and map dosage information to model variants by position and alleles
# (only works with hg38 inputs, otherwise a liftover needs to be performed!)
rule dosage:
input:
vcf = "output/imputed-genotypes/chromosome-{chr}.vcf.gz",
index = "output/imputed-genotypes/chromosome-{chr}.vcf.gz.tbi",
lookup = "output/model-variants-lookup-table.csv.gz",
samples = "output/dosages/samples.txt"
output:
"output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
"output/dosages/unmatched-chromosome-{chr}.csv.gz"
shell:
"""
# setup
set -ex
mkdir -p output/dosages
# create file with chromosomal positions for all model variants
Rscript scripts/get-model-variant-positions.R {wildcards.chr}
nlines=$(wc -l < "output/model-variant-positions-chromosome-{wildcards.chr}.txt" | bc)
if [[ $nlines == 0 ]]
then
touch "output/dosages/matched-chromosome-{wildcards.chr}.dosage.txt.gz"
touch "output/dosages/unmatched-chromosome-{wildcards.chr}.csv.gz"
else
# filter for model variant positions and extract dosage information
bcftools query \
-o "output/dosages/chromosome-{wildcards.chr}.dosage.csv" \
-R "output/model-variant-positions-chromosome-{wildcards.chr}.txt" \
-f "%CHROM,%ID,%POS,%REF,%ALT,%INFO/MAF[,%DS]\n\r" \
{input.vcf}
# match model variants with extracted dosage
Rscript scripts/match-variants.R {wildcards.chr}
# cleanup
rm "output/dosages/chromosome-{wildcards.chr}.dosage.csv"
fi
# cleanup
rm "output/model-variant-positions-chromosome-{wildcards.chr}.txt"
"""
# build an overview over the found model SNPS per chromosome
rule mapping_report:
input:
dosage_files = expand(
"output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
chr=range(1,24)
)
output:
"output/mapping-report.html"
shell:
"""
set -ex
Rscript -e "rmarkdown::render('scripts/mapping-report.Rmd', output_dir = 'output', output_file = 'mapping-report.html', knit_root_dir = '$PWD')"
"""
# impute GREx for a specific tissue (sums up the mapped dosages times the
# PrediXcan weights)
rule grex:
input:
samples_file = "output/dosages/samples.txt",
dosage_files = expand(
"output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
chr=range(1,24)
)
output:
"output/imputed-grex/{tissue}.igrex.txt.gz"
shell:
"""
mkdir -p output/imputed-grex
predixcan \
--predict \
--dosages output/dosages \
--dosages_prefix matched-chromosome- \
--samples samples.txt \
--weights output/weights/en_{wildcards.tissue}.db \
--output_prefix output/imputed-grex/{wildcards.tissue} \
--model_variant_id varID
mv output/imputed-grex/{wildcards.tissue}_predicted_expression.txt \
output/imputed-grex/{wildcards.tissue}.igrex.txt
gzip output/imputed-grex/{wildcards.tissue}.igrex.txt
"""