Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
impute-gene-expression
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Kevin Kunzmann
impute-gene-expression
Commits
cb431c22
Commit
cb431c22
authored
Sep 02, 2019
by
Kevin Kunzmann
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
scaling up to full vcf
parent
93f95db1
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
33 additions
and
21 deletions
+33
-21
Snakefile
Snakefile
+31
-16
config.yml
config.yml
+0
-3
scripts/combine_expression_data.R
scripts/combine_expression_data.R
+2
-2
No files found.
Snakefile
View file @
cb431c22
...
...
@@ -10,9 +10,9 @@ localrules: impute, clean, download_container, generate_samples_file
rule impute:
input:
"container.sif",
expand("{output_dir}/
gene_expression
/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
expand("{output_dir}/
imputed-gene-expressions
/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
output:
expand("{output_dir}/
gene_expression
s_combined.rds", output_dir = config['output_dir'])
expand("{output_dir}/
imputed-gene-expressions
s_combined.rds", output_dir = config['output_dir'])
singularity:
"container.sif"
shell:
...
...
@@ -29,6 +29,22 @@ rule clean:
rm -rf nohup.out
"""
# download vcf files using gsutil (needs to be set up and configured!)
# Fetch one per-chromosome imputed-genotype VCF from the Google Cloud Storage
# bucket into output/imputed-genotypes/. The wildcard {i} is the chromosome
# label embedded in the file name (the part after "chr").
# Requires a working, authenticated `gsutil` on the PATH.
rule download_imputed_genotype_chromosome:
    output:
        "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
    shell:
        """
        # derive the target directory and the destination from the declared
        # output so the local path is spelled out in exactly one place
        mkdir -p "$(dirname {output})"
        gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_190829/Imputed_data/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz \
            {output}
        """
# Aggregate target: requesting this rule pulls in the per-chromosome
# imputed-genotype VCFs for chromosomes 1-22 and X.
rule download_imputed_genotypes:
    input:
        expand(
            "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz",
            i = [str(chrom) for chrom in range(1, 23)] + ['X']
        )
# download the required singularity container from zenodo.org
rule download_container:
output:
...
...
@@ -42,7 +58,7 @@ rule download_container:
rule vcf_to_dosages:
input:
"container.sif",
config['input_vcf_gz_file']
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
output:
"{output_dir}/dosages/chr{i}.dosage.txt.gz"
singularity:
...
...
@@ -51,11 +67,8 @@ rule vcf_to_dosages:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
echo "extracting chromosome {wildcards.i} ..."
bcftools view -r chr{wildcards.i} {config[input_vcf_gz_file]} > $prefix/chr{wildcards.i}_.vcf
echo "computing MAFs ..."
bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
rm $prefix/chr{wildcards.i}_.vcf
echo "extracting and computing MAFs ..."
bcftools +fill-tags {inputs.vcf_gz_file} > $prefix/chr{wildcards.i}.vcf
echo 'querying dosages ...'
bcftools query -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
echo 'compressing ...'
...
...
@@ -64,11 +77,13 @@ rule vcf_to_dosages:
printf "done.\n\r\n\r"
"""
bcftools query -e 'MAF[0]<0.01 | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
# extract sample file for PrediXcan
rule generate_samples_file:
input:
"container.sif",
config['input_vcf_gz_file']
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr1.vcf.gz"
output:
"{output_dir}/dosages/samples.txt"
singularity:
...
...
@@ -79,32 +94,32 @@ rule generate_samples_file:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
bcftools query -l {
config[input_vcf_gz_file]
} >> $prefix/samples_.txt
bcftools query -l {
inputs.vcf_gz_file
} >> $prefix/samples_.txt
# family ID = individual ID
awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
rm $prefix/samples_.txt
"""
# run PrediXcan to impute gene expression for an individual tissue type
rule impute_gene_expression:
rule impute_gene_expression
s
:
input:
"container.sif",
samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir'])
output:
"{output_dir}/
gene_expression
/{region}.expression.txt"
"{output_dir}/
imputed-gene-expressions
/{region}.expression.txt"
singularity:
"container.sif"
shell:
"""
mkdir -p {wildcards.output_dir}/
gene_expression
mkdir -p {wildcards.output_dir}/
imputed-gene-expressions
predixcan \
--predict \
--dosages {config[output_dir]}/dosages \
--dosages_prefix chr \
--samples samples.txt \
--weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
--output_prefix {wildcards.output_dir}/
gene_expression
/{wildcards.region}
mv {wildcards.output_dir}/
gene_expression
/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/
gene_expression
/{wildcards.region}.expression.txt
--output_prefix {wildcards.output_dir}/
imputed-gene-expressions
/{wildcards.region}
mv {wildcards.output_dir}/
imputed-gene-expressions
/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/
imputed-gene-expressions
/{wildcards.region}.expression.txt
"""
config.yml
View file @
cb431c22
# input genotype file
input_vcf_gz_file
:
'
test.vcf.gz'
# where to put things
output_dir
:
'
output'
...
...
scripts/combine_expression_data.R
View file @
cb431c22
...
...
@@ -15,7 +15,7 @@ col_types <- cols(
config
$
brain_regions
%>%
purrr
::
map
(
function
(
x
)
{
sprintf
(
"%s/
gene_expression
/%s.expression.txt"
,
config
$
output_dir
,
x
)
%>%
sprintf
(
"%s/
imputed-gene-expressions
/%s.expression.txt"
,
config
$
output_dir
,
x
)
%>%
read_tsv
(
col_types
=
col_types
,
progress
=
FALSE
)
%>%
tidyr
::
gather
(
'ensembl_gene_id'
,
'expression'
,
-
FID
,
-
IID
)
%>%
mutate
(
tissue
=
x
)
%>%
...
...
@@ -26,4 +26,4 @@ config$brain_regions %>%
# make missing genes explicit
tidyr
::
spread
(
ensembl_gene_id
,
expression
,
fill
=
NA_real_
)
%>%
tidyr
::
gather
(
'ensembl_gene_id'
,
'expression'
,
-
FID
,
-
IID
,
-
tissue
)
%>%
write_rds
(
sprintf
(
'%s/
gene_expression
s_combined.rds'
,
config
$
output_dir
),
compress
=
'gz'
)
write_rds
(
sprintf
(
'%s/
imputed-gene-expressions
s_combined.rds'
,
config
$
output_dir
),
compress
=
'gz'
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment