Commit cb431c22 authored by Kevin Kunzmann's avatar Kevin Kunzmann

scaling up to full vcf

parent 93f95db1
...@@ -10,9 +10,9 @@ localrules: impute, clean, download_container, generate_samples_file ...@@ -10,9 +10,9 @@ localrules: impute, clean, download_container, generate_samples_file
rule impute: rule impute:
input: input:
"container.sif", "container.sif",
expand("{output_dir}/gene_expression/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir']) expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
output: output:
expand("{output_dir}/gene_expressions_combined.rds", output_dir = config['output_dir']) expand("{output_dir}/imputed-gene-expressionss_combined.rds", output_dir = config['output_dir'])
singularity: singularity:
"container.sif" "container.sif"
shell: shell:
...@@ -29,6 +29,22 @@ rule clean: ...@@ -29,6 +29,22 @@ rule clean:
rm -rf nohup.out rm -rf nohup.out
""" """
# download vcf files using gsutil (needs to be set up and configured!
rule download_imputed_genotype_chromosome:
output:
"output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
shell:
"""
mkdir -p output/imputed-genotypes
gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_190829/Imputed_data/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz \
output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz
"""
rule download_imputed_genotypes:
input:
expand("output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz",
i = list(map(str, range(1, 23))) + ['X']
)
# download the required singularity container from zenodo.org # download the required singularity container from zenodo.org
rule download_container: rule download_container:
output: output:
...@@ -42,7 +58,7 @@ rule download_container: ...@@ -42,7 +58,7 @@ rule download_container:
rule vcf_to_dosages: rule vcf_to_dosages:
input: input:
"container.sif", "container.sif",
config['input_vcf_gz_file'] vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
output: output:
"{output_dir}/dosages/chr{i}.dosage.txt.gz" "{output_dir}/dosages/chr{i}.dosage.txt.gz"
singularity: singularity:
...@@ -51,11 +67,8 @@ rule vcf_to_dosages: ...@@ -51,11 +67,8 @@ rule vcf_to_dosages:
""" """
export prefix={wildcards.output_dir}/dosages export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix mkdir -p $prefix
echo "extracting chromosome {wildcards.i} ..." echo "extracting and computing MAFs ..."
bcftools view -r chr{wildcards.i} {config[input_vcf_gz_file]} > $prefix/chr{wildcards.i}_.vcf bcftools +fill-tags {inputs.vcf_gz_file} > $prefix/chr{wildcards.i}.vcf
echo "computing MAFs ..."
bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
rm $prefix/chr{wildcards.i}_.vcf
echo 'querying dosages ...' echo 'querying dosages ...'
bcftools query -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt bcftools query -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
echo 'compressing ...' echo 'compressing ...'
...@@ -64,11 +77,13 @@ rule vcf_to_dosages: ...@@ -64,11 +77,13 @@ rule vcf_to_dosages:
printf "done.\n\r\n\r" printf "done.\n\r\n\r"
""" """
bcftools query -e 'MAF[0]<0.01 | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
# extract sample file for PrediXcan # extract sample file for PrediXcan
rule generate_samples_file: rule generate_samples_file:
input: input:
"container.sif", "container.sif",
config['input_vcf_gz_file'] vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr1.vcf.gz"
output: output:
"{output_dir}/dosages/samples.txt" "{output_dir}/dosages/samples.txt"
singularity: singularity:
...@@ -79,32 +94,32 @@ rule generate_samples_file: ...@@ -79,32 +94,32 @@ rule generate_samples_file:
""" """
export prefix={wildcards.output_dir}/dosages export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix mkdir -p $prefix
bcftools query -l {config[input_vcf_gz_file]} >> $prefix/samples_.txt bcftools query -l {inputs.vcf_gz_file} >> $prefix/samples_.txt
# family ID = individual ID # family ID = individual ID
awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
rm $prefix/samples_.txt rm $prefix/samples_.txt
""" """
# run PrediXcan to impute gene expression for individual tissue type # run PrediXcan to impute gene expression for individual tissue type
rule impute_gene_expression: rule impute_gene_expressions:
input: input:
"container.sif", "container.sif",
samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']), samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir']) dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir'])
output: output:
"{output_dir}/gene_expression/{region}.expression.txt" "{output_dir}/imputed-gene-expressions/{region}.expression.txt"
singularity: singularity:
"container.sif" "container.sif"
shell: shell:
""" """
mkdir -p {wildcards.output_dir}/gene_expression mkdir -p {wildcards.output_dir}/imputed-gene-expressions
predixcan \ predixcan \
--predict \ --predict \
--dosages {config[output_dir]}/dosages \ --dosages {config[output_dir]}/dosages \
--dosages_prefix chr \ --dosages_prefix chr \
--samples samples.txt \ --samples samples.txt \
--weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \ --weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
--output_prefix {wildcards.output_dir}/gene_expression/{wildcards.region} --output_prefix {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}
mv {wildcards.output_dir}/gene_expression/{wildcards.region}_predicted_expression.txt \ mv {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/gene_expression/{wildcards.region}.expression.txt {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
""" """
# input genotype file
input_vcf_gz_file: 'test.vcf.gz'
# where to put things # where to put things
output_dir: 'output' output_dir: 'output'
......
...@@ -15,7 +15,7 @@ col_types <- cols( ...@@ -15,7 +15,7 @@ col_types <- cols(
config$brain_regions %>% config$brain_regions %>%
purrr::map( purrr::map(
function(x) { function(x) {
sprintf("%s/gene_expression/%s.expression.txt", config$output_dir, x) %>% sprintf("%s/imputed-gene-expressions/%s.expression.txt", config$output_dir, x) %>%
read_tsv(col_types = col_types, progress = FALSE) %>% read_tsv(col_types = col_types, progress = FALSE) %>%
tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID) %>% tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID) %>%
mutate(tissue = x) %>% mutate(tissue = x) %>%
...@@ -26,4 +26,4 @@ config$brain_regions %>% ...@@ -26,4 +26,4 @@ config$brain_regions %>%
# make missing genes explicit # make missing genes explicit
tidyr::spread(ensembl_gene_id, expression, fill = NA_real_) %>% tidyr::spread(ensembl_gene_id, expression, fill = NA_real_) %>%
tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID, -tissue) %>% tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID, -tissue) %>%
write_rds(sprintf('%s/gene_expressions_combined.rds', config$output_dir), compress = 'gz') write_rds(sprintf('%s/imputed-gene-expressionss_combined.rds', config$output_dir), compress = 'gz')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment