Commit 421e49a2 authored by kevin's avatar kevin

initial commit

parents
singularity: "container.sif"
configfile: "config.yml"
localrules: impute, clean, generate_samples_file
rule impute:
input:
expand("{output_dir}/gene_expression/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
output:
expand("{output_dir}/gene_expressions_combined.rds", output_dir = config['output_dir'])
shell:
"""
Rscript scripts/combine_expression_data.R
"""
rule vcf_to_dosages:
input:
config['input_vcf_gz_file']
output:
"{output_dir}/dosages/chr{i}.dosage.txt.gz"
shell:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
echo "extracting chromosome {wildcards.i} ..."
bcftools view -r chr{wildcards.i} {config[input_vcf_gz_file]} > $prefix/chr{wildcards.i}_.vcf
echo "computing MAFs ..."
bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
rm $prefix/chr{wildcards.i}_.vcf
echo 'querying dosages ...'
bcftools query -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
echo 'compressing ...'
gzip $prefix/chr{wildcards.i}.dosage.txt
rm $prefix/chr{wildcards.i}.vcf
printf "done.\n\r\n\r"
"""
rule generate_samples_file:
input:
config['input_vcf_gz_file']
output:
"{output_dir}/dosages/samples.txt"
params:
format = "\'{ printf(\"%s %s\\n\", $1, $1); }\'"
shell:
"""
export prefix={wildcards.output_dir}/dosages
mkdir -p $prefix
bcftools query -l {config[input_vcf_gz_file]} >> $prefix/samples_.txt
# family ID = individual ID
awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
rm $prefix/samples_.txt
"""
rule impute_gene_expression:
input:
samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir'])
output:
"{output_dir}/gene_expression/{region}.expression.txt"
shell:
"""
mkdir -p {wildcards.output_dir}/gene_expression
predixcan \
--predict \
--dosages {config[output_dir]}/dosages \
--dosages_prefix chr \
--samples samples.txt \
--weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
--output_prefix {wildcards.output_dir}/gene_expression/{wildcards.region}
mv {wildcards.output_dir}/gene_expression/{wildcards.region}_predicted_expression.txt \
{wildcards.output_dir}/gene_expression/{wildcards.region}.expression.txt
"""
rule clean:
shell:
"""
rm -rf {config[output_dir]}
rm -rf logs
rm -rf nohup.out
"""
{
"__default__" :
{
"account" : "MRC-BSU-SL2-CPU",
"partition" : "skylake",
"time" : "00:10:00",
"ntasks" : "1",
"nodes" : "1",
"ncpu" : "1",
"name" : "{rule}__{wildcards}",
"output" : "logs/{rule}__{wildcards}.out",
"error" : "logs/{rule}__{wildcards}.out"
}
}
# input genotype file
input_vcf_gz_file: 'test.vcf.gz'
# where to put things
output_dir: 'output'
# suggested filtering options for PrediXcan
min_MAF: '0.01'
min_INFO: '0.8'
# brain regions to impute expression for
brain_regions:
- 'Amygdala'
- 'Anterior_cingulate_cortex_BA24'
- 'Caudate_basal_ganglia'
- 'Cerebellar_Hemisphere'
- 'Cerebellum'
- 'Cortex'
- 'Frontal_Cortex_BA9'
- 'Hippocampus'
- 'Hypothalamus'
- 'Nucleus_accumbens_basal_ganglia'
- 'Putamen_basal_ganglia'
- 'Spinal_cord_cervical_c-1'
- 'Substantia_nigra'
Bootstrap: docker
From: ubuntu:18.04
%post
# non-interactive debconf
export DEBIAN_FRONTEND=noninteractive DEBCONF_NONINTERACTIVE_SEEN=true
# update apt
apt-get update
# install google cloud services to download vcf genotype files
apt-get -y install \
curl gnupg2
echo "deb http://packages.cloud.google.com/apt cloud-sdk-stretch main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
apt-get update && apt-get -y install google-cloud-sdk
# install python3 and snakemake
apt-get -y install \
python3 python3-pip
pip3 install snakemake
# install bcftools
apt-get -y install \
gcc wget make zlib1g zlib1g-dev libbz2-dev liblzma-dev libcurl4-openssl-dev
wget https://github.com/samtools/bcftools/releases/download/1.9/bcftools-1.9.tar.bz2
tar -xvjf bcftools-1.9.tar.bz2
cd bcftools-1.9
./configure --prefix=/usr/bcftools
make
make install
(cd /usr/bin; ln -s /usr/bcftools/bin/bcftools bcftools)
# install PrediXcan and python dependencies (uses python 2.7)
apt-get -y install \
wget python-pip
wget https://raw.githubusercontent.com/hakyimlab/PrediXcan/master/Software/PrediXcan.py -O /usr/bin/predixcan
chmod +x /usr/bin/predixcan
pip install \
argparse datetime numpy
# download, extract and store (brain) weights
mkdir /usr/predixcan
wget https://s3.amazonaws.com/predictdb2/GTEx-V7_HapMap-2017-11-29.tar.gz -O /usr/predixcan/GTEx-V7_HapMap-2017-11-29.tar.gz
(cd /usr/predixcan; \
mkdir GTEx-V7_HapMap-2017-11-29; \
# we only need the brain tissue weights:
tar -xvz -f GTEx-V7_HapMap-2017-11-29.tar.gz -C GTEx-V7_HapMap-2017-11-29 --wildcards "*_Brain_*"; \
rm GTEx-V7_HapMap-2017-11-29.tar.gz \
)
# predixcan connects to the weights database with sql, needs write permission
# even if the file system will be read only for the cvontainer
chmod -R 777 /usr/predixcan
# install R and packages
apt-get -y install r-base
Rscript -e "install.packages('dplyr')"
Rscript -e "install.packages('yaml')"
Rscript -e "install.packages('purrr')"
Rscript -e "install.packages('readr')"
Rscript -e "install.packages('tidyr')"
#!/usr/bin/env Rscript
library(dplyr, warn.conflicts = FALSE)
library(readr, warn.conflicts = FALSE)
config <- yaml::read_yaml('config.yml')
col_types <- cols(
.default = col_double(),
FID = col_character(),
IID = col_character()
)
# read and combine individual expression data in long format
config$brain_regions %>%
purrr::map(
function(x) {
sprintf("%s/gene_expression/%s.expression.txt", config$output_dir, x) %>%
read_tsv(col_types = col_types, progress = FALSE) %>%
tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID) %>%
mutate(tissue = x) %>%
select(tissue, everything())
}
) %>%
{do.call(rbind, .)} %>%
# make missing genes explicit
tidyr::spread(ensembl_gene_id, expression, fill = NA_real_) %>%
tidyr::gather('ensembl_gene_id', 'expression', -FID, -IID, -tissue) %>%
write_rds(sprintf('%s/gene_expressions_combined.rds', config$output_dir), compress = 'gz')
mkdir logs
nohup snakemake $1 \
--jobs 999 \
--use-singularity \
--cluster-config cluster.json \
--cluster "sbatch -A {cluster.account} -p {cluster.partition} --ntasks {cluster.ntasks} --cpus-per-task {cluster.ncpu} --nodes {cluster.nodes} -t {cluster.time} --job-name {cluster.name} --output {cluster.output} --error {cluster.error}" &
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment