Commit ccb8b3be authored by Kevin Kunzmann's avatar Kevin Kunzmann

refactor

parent 7cfd7495
......@@ -5,3 +5,4 @@ logs
nohup.out
container.sif
.DS_Store
.Rproj.user
# edit configuration here (see cluster.json for running on slurm scheduler)
configfile: "config.yml"
configfile: "config/parameters.yml"
singularity: "container.sif"
localrules: impute, generate_samples_file
localrules: impute, clean, download_container, generate_samples_file
# this rule runs the entire pipeline and creates a R data set file (rds)
# containing all imputed expression values
rule impute:
input:
"container.sif",
expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt.gz", region = config['brain_regions'], output_dir = config['output_dir'])
# delete output and logs (if run on slurm cluster)
rule clean:
shell:
"""
rm -rf {config[output_dir]}
rm -rf logs
rm -rf nohup.out
"""
# download vcf files using gsutil (needs to be set up and configured!
rule download_imputed_genotype_chromosome:
output:
"output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz"
singularity:
"container.sif"
"output/imputed-genotypes/chromosome-{i}.vcf.gz"
shell:
"""
setr -ex
mkdir -p output/imputed-genotypes
gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_191014/Imputed_data/CENTER.TBI_imputed_N3741_hg19_HRC_MAC1_freeze_191014_chr{wildcards.i}.vcf.gz \
output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{wildcards.i}.vcf.gz
FILEDIR=gs://fimm-horizon-outgoing-data/20201002-center-tbi-genetic-data/genome-wide-imputation-data
gsutil cp $FILEDIR/chromosome-{wildcards.i}.vcf.gz {output}
"""
rule download_imputed_genotypes:
input:
expand("output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz",
i = list(map(str, range(1, 23))) + ['X']
)
# download the required singularity container from zenodo.org
rule download_container:
output:
"container.sif"
shell:
"""
bash scripts/download_container.sh
"""
# extract dosages in custom PrediXcan format from vcf file
rule vcf_to_dosages:
rule dosages:
input:
"container.sif",
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz"
expand("{out}/imputed-genotypes/chromosome-22.vcf.gz", out=config["output_dir"])
output:
"{output_dir}/dosages/chr{i}.dosage.txt.gz"
singularity:
"container.sif"
expand("{out}/imputed-genotypes/chromosome-22.dosage.txt.gz", out=config["output_dir"])
shell:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
echo "decompress and filter out: 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' ..."
bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' {input.vcf_gz_file} > $prefix/chr{wildcards.i}_.vcf
echo "add MAFs ..."
bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
rm $prefix/chr{wildcards.i}_.vcf
echo 'querying dosages ...'
bcftools query -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
rm $prefix/chr{wildcards.i}.vcf
echo 'compressing ...'
gzip $prefix/chr{wildcards.i}.dosage.txt
printf "done.\n\r\n\r"
set -ex
mkdir -p {config[output_dir]}/dosages
pv {input} | \
bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
bcftools +fill-tags | \
bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
{config[output_dir]}/dosages/chromosome-22.dosage.txt
gzip {config[output_dir]}/dosages/chromosome-22.dosage.txt
# convert locations to GTEX v8 by hg38 position
"""
# extract sample file for PrediXcan
rule generate_samples_file:
rule samples_file:
input:
"container.sif",
vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr1.vcf.gz"
expand("{out}/imputed-genotypes/chromosome-22.vcf", out=config["output_dir"])
output:
"{output_dir}/dosages/samples.txt"
singularity:
"container.sif"
params:
format = "\'{ printf(\"%s %s\\n\", $1, $1); }\'"
expand("{out}/dosages/samples.txt", out=config["output_dir"])
shell:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
bcftools query -l {input.vcf_gz_file} >> $prefix/samples_.txt
# family ID = individual ID
awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
rm $prefix/samples_.txt
set -ex
mkdir -p {config[output_dir]}/dosages
# query sample ids from first chromosome
bcftools query -l {input} >> {config[output_dir]}/dosages/samples.txt
# predixcan allows family and sample id, duplicate column and overwrite
Rscript -e "library(dplyr); readr::read_tsv('output/dosages/samples.txt', col_names = 'SID', col_types = 'c') %>% dplyr::mutate(FID = SID) %>% readr::write_tsv(path = 'output/dosages/samples.txt', col_names = FALSE)"
"""
# run PrediXcan to impute gene expression for individual tissue type
rule impute_gene_expressions:
input:
"container.sif",
......
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
......@@ -39,6 +39,7 @@ From: ubuntu:18.04
chmod +x /usr/bin/predixcan
pip install \
argparse datetime numpy
# download, extract and store (brain) weights
mkdir /usr/predixcan
wget https://s3.amazonaws.com/predictdb2/GTEx-V7_HapMap-2017-11-29.tar.gz -O /usr/predixcan/GTEx-V7_HapMap-2017-11-29.tar.gz
......@@ -48,8 +49,9 @@ From: ubuntu:18.04
tar -xvz -f GTEx-V7_HapMap-2017-11-29.tar.gz -C GTEx-V7_HapMap-2017-11-29 --wildcards "*_Brain_*"; \
rm GTEx-V7_HapMap-2017-11-29.tar.gz \
)
# predixcan connects to the weights database with sql, needs write permission
# even if the file system will be read only for the cvontainer
# even if the file system will be read only for the container
chmod -R 777 /usr/predixcan
# install R and packages
......
---
title: "Mapping hg38 to GTEx variants"
author: "Kevin Kunzmann"
date: "`r Sys.time()`"
output:
html_document:
code_folding: hide
params:
chromosome: 22
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE
)
pander::panderOptions("table.split.table", 200)
options(
tidyverse.quite = TRUE,
width = 90
)
library(tidyverse, warn.conflicts = FALSE)
library(glue, warn.conflicts = FALSE)
set.seed(42)
```
```{r load-data}
dosages_file <- glue(
"output/dosages/chromosome-{params$chromosome}.dosage.txt.gz"
)
out_file <- str_replace(
dosages_file,
"(?<=-)([0-9]{1,2})(?=.dosage.txt)",
"\\1-gtex-v8"
)
tbl_gtex_lookup <- vroom::vroom(
"GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz"
) %>%
filter(
# to make sure we do not lose corner cases,
# also load neighboring chromosomes
chr %in% glue("chr{(params$chromosome - 1):(params$chromosome + 1)}"),
nchar(ref) == 1,
nchar(alt) == 1
) %>%
mutate(
variant_pos = as.integer(variant_pos)
) %>%
arrange(variant_pos)
sample_ids <- read_delim(
"output/dosages/samples.txt",
delim = "\t",
col_types = "cc",
col_names = c("FID", "IID")
)
tbl_dosage <- vroom::vroom(
dosages_file,
delim = " ",
col_names = c(
"dummy_",
"chromosome",
"rsid",
"variant_pos",
"ref",
"alt",
"MAF",
sample_ids$IID
),
col_types = cols(
.default = col_double(),
dummy_ = col_character(),
chromosome = col_integer(),
rsid = col_character(),
variant_pos = col_integer(),
ref = col_character(),
alt = col_character()
),
na = "."
) %>%
select(-dummy_) %>%
arrange(variant_pos)
```
```{r}
tbl_dosage <- left_join(
tbl_dosage,
tbl_gtex_lookup %>% transmute(
variant_pos,
gtex_id = variant_id,
ref_gtex = ref,
alt_gtex = alt
),
by = "variant_pos"
) %>%
select(
chromosome, variant_pos, gtex_id, rsid,
ref, alt, ref_gtex, alt_gtex,
MAF,
everything()
)
```
```{r}
tbl_dosage %>%
filter(
is.na(ref_gtex) | is.na(alt_gtex)
) %>%
{n_mis <<- nrow(.); .} %>%
ggplot() +
aes(variant_pos) +
geom_histogram(
fill = "black",
color = NA,
bins = ceiling(n_mis / 33)
) +
theme_bw()
```
```{r}
tbl_dosage %>%
filter(is.na(ref_gtex) | is.na(alt_gtex)) %>%
ggplot() +
aes(variant_pos, y = 0) +
geom_point(
alpha = 0.2,
shape = 16,
position = position_jitter(width = 0, height = 1)
) +
theme_bw()
```
## Session info
```{r, echo=FALSE}
sessionInfo() %>%
pander::pander()
```
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment