# define the singularity (https://sylabs.io/) container image to use
# see also https://cloud.sylabs.io/library/kkmann/default/center-grex-imputation
singularity: "library://kkmann/default/center-grex-imputation:sha256.811ce3ed0a73d21dc54f9aeceb94185e3804b83a30ab07a0142ab8da86802d6a"

# modify this file to define global parameters
# (e.g. set of tissues to impute genetically regulated expression levels for)
configfile: "config/parameters.yml"

# these rules are computationally less expensive and run locally, even in
# cluster execution mode
localrules: all, container, download_genome, download_weights, samples_file, model_variant_lookup_table, mapping_report

# this rule is executed by default, i.e.
# > snakemake
# is equivalent to
# > snakemake all
# the target imputes gene expression for all tissues listed in the config file
# and builds the report on mapping model variants to the available dosage
# information.
rule all:
    input:
        expand(
            "output/imputed-grex/{tissue}.igrex.txt.gz",
            tissue=config["tissues"]
        ),
        "output/mapping-report.html"

# dummy rule doing nothing; can be used to download the required container;
# this might be helpful to set up authentication with the data servers
# (requires an interactive step); see readme.md for details
rule container:
    shell:
        """
        echo done.
        """

# download vcf files using gsutil (needs to be set up and configured!)
rule download_genome:
    output:
        "output/imputed-genotypes/chromosome-{chr}.vcf.gz",
        "output/imputed-genotypes/chromosome-{chr}.vcf.gz.tbi"
    shell:
        """
        set -ex
        mkdir -p output/imputed-genotypes
        FILEDIR=gs://fimm-horizon-outgoing-data/genetic-data/imputed-genomes/{config[data_date]}/{config[cohort]}
        gsutil cp $FILEDIR/chromosome-{wildcards.chr}.vcf.gz \
            output/imputed-genotypes/chromosome-{wildcards.chr}.vcf.gz
        gsutil cp $FILEDIR/chromosome-{wildcards.chr}.vcf.gz.tbi \
            output/imputed-genotypes/chromosome-{wildcards.chr}.vcf.gz.tbi
        """

# download PrediXcan EQTL weights
rule download_weights:
    output:
        directory("output/weights")
    shell:
        """
        set -ex
        mkdir -p logs
        mkdir -p output/weights
        # wget https://zenodo.org/record/3518299/files/mashr_eqtl.tar?download=1 \
        #     -O output/weights/eqtl.tar
        wget https://zenodo.org/record/3519321/files/elastic_net_eqtl.tar?download=1 \
            -O output/weights/eqtl.tar
        tar -C output/weights -xvf output/weights/eqtl.tar
        rm output/weights/eqtl.tar
        mv output/weights/elastic_net_models/* output/weights/
        rm -rf output/weights/elastic_net_models
        """

# generate a table with all model variants
rule model_variant_lookup_table:
    input:
        rules.download_weights.output
    output:
        "output/model-variants-lookup-table.csv.gz"
    shell:
        """
        set -ex
        Rscript scripts/get-model-variants.R
        """

# generate the samples file in the format required by PrediXcan
rule samples_file:
    input:
        "output/imputed-genotypes/chromosome-1.vcf.gz"
    output:
        "output/dosages/samples.txt"
    shell:
        """
        set -ex
        mkdir -p output/dosages
        # query sample ids from first chromosome
        bcftools query -l {input} > output/samples.txt
        # predixcan allows family and sample id, duplicate column and overwrite
        Rscript -e "library(dplyr); readr::read_tsv('output/samples.txt', col_names = 'SID', col_types = 'c') %>% dplyr::mutate(FID = SID) %>% readr::write_tsv(path = 'output/dosages/samples.txt', col_names = FALSE)"
        """

# extract and map dosage information to model variants by position and alleles
# (only works with hg38 inputs, otherwise a liftover needs to be performed!)
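#
# for orientation, the matched dosage files written by this rule are assumed to
# follow the plain-text PrediXcan dosage layout, one variant per row:
#
#   chromosome variant_id position ref_allele alt_allele MAF dosage_sample_1 ... dosage_sample_N
#
# the column order is inferred from the bcftools query below; the actual
# matching and formatting is done by scripts/match-variants.R, which is not
# part of this file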
rule dosage:
    input:
        vcf = "output/imputed-genotypes/chromosome-{chr}.vcf.gz",
        index = "output/imputed-genotypes/chromosome-{chr}.vcf.gz.tbi",
        lookup = "output/model-variants-lookup-table.csv.gz",
        samples = "output/dosages/samples.txt"
    output:
        "output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
        "output/dosages/unmatched-chromosome-{chr}.csv.gz"
    shell:
        """
        # setup
        set -ex
        mkdir -p output/dosages
        # create file with chromosomal positions for all model variants
        Rscript scripts/get-model-variant-positions.R {wildcards.chr}
        nlines=$(wc -l < "output/model-variant-positions-chromosome-{wildcards.chr}.txt" | bc)
        if [[ $nlines == 0 ]]
        then
            touch "output/dosages/matched-chromosome-{wildcards.chr}.dosage.txt.gz"
            touch "output/dosages/unmatched-chromosome-{wildcards.chr}.csv.gz"
        else
            # filter for model variant positions and extract dosage information
            bcftools query \
                -o "output/dosages/chromosome-{wildcards.chr}.dosage.csv" \
                -R "output/model-variant-positions-chromosome-{wildcards.chr}.txt" \
                -f "%CHROM,%ID,%POS,%REF,%ALT,%INFO/MAF[,%DS]\n\r" \
                {input.vcf}
            # match model variants with extracted dosage
            Rscript scripts/match-variants.R {wildcards.chr}
            # cleanup
            rm "output/dosages/chromosome-{wildcards.chr}.dosage.csv"
        fi
        # cleanup
        rm "output/model-variant-positions-chromosome-{wildcards.chr}.txt"
        """

# build an overview of the matched model SNPs per chromosome
rule mapping_report:
    input:
        dosage_files = expand(
            "output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
            chr=range(1, 24)
        )
    output:
        "output/mapping-report.html"
    shell:
        """
        set -ex
        Rscript -e "rmarkdown::render('scripts/mapping-report.Rmd', output_dir = 'output', output_file = 'mapping-report.html', knit_root_dir = '$PWD')"
        """

# impute GREx for a specific tissue (sums up the mapped dosages times the
# PrediXcan weights)
rule grex:
    input:
        samples_file = "output/dosages/samples.txt",
        dosage_files = expand(
            "output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
            chr=range(1, 24)
        )
    output:
        "output/imputed-grex/{tissue}.igrex.txt.gz"
    shell:
        """
        mkdir -p output/imputed-grex
        predixcan \
            --predict \
            --dosages output/dosages \
            --dosages_prefix matched-chromosome- \
            --samples samples.txt \
            --weights output/weights/en_{wildcards.tissue}.db \
            --output_prefix output/imputed-grex/{wildcards.tissue} \
            --model_variant_id varID
        mv output/imputed-grex/{wildcards.tissue}_predicted_expression.txt \
            output/imputed-grex/{wildcards.tissue}.igrex.txt
        gzip output/imputed-grex/{wildcards.tissue}.igrex.txt
        """
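
# example invocation (a sketch, not part of the pipeline itself; it assumes
# gsutil authentication and the singularity image are already set up, and that
# "Whole_Blood" is one of the tissues listed in config/parameters.yml):
#
#   # impute GREx for all configured tissues and build the mapping report
#   snakemake --use-singularity --cores 4
#
#   # or request a single tissue only
#   snakemake --use-singularity --cores 4 output/imputed-grex/Whole_Blood.igrex.txt.gz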