Snakefile 6.85 KB
# define the singularity (https://sylabs.io/) container image to use
# see also https://cloud.sylabs.io/library/kkmann/default/center-grex-imputation#
singularity: "library://kkmann/default/center-grex-imputation:sha256.321833b357b20feda51877dcef6e9bac2dfa706e9ed6c32da39baaa8ead9e1f7"

# modify this file to define global parameters
# (e.g. set of tissues to impute genetically regulated expression levels for)
configfile:  "config/parameters.yml"

# these rules are computationally less expensive and run locally, even in
# cluster execution mode
localrules:  all, container, download_genome, download_weights, samples_file,
    model_variant_lookup_table, mapping_report

# this rule is exected by default, i.e.
# > snakemake
# is equivalent to
# > snakemake all
# the target imputes gene expression for all tissues listed in the config file
# and the report on mapping mopdel variants to available dosage information.
rule all:
    input:
        expand(
            "output/imputed-grex/{tissue}.igrex.txt.gz",
            tissue=config["tissues"]
        ),
        "output/mapping-report.html"



# dummy rule doing nothing; can be used to download the required container;
# this might be helpful to set up authentification with the data servers
# (requires an interactive step); see readme.md for details
rule container:
    shell:
        """
        echo done.
        """

# download vcf files using gsutil (needs to be set up and configured!)
rule download_genome:
    output:
        "output/imputed-genotypes/chromosome-{chr}.vcf.gz",
        "output/imputed-genotypes/chromosome-{chr}.vcf.gz.tbi"
    shell:
        """
        set -ex
        mkdir -p output/imputed-genotypes
        FILEDIR=gs://fimm-horizon-outgoing-data/genetic-data/imputed-genomes/{config[data_date]}/{config[cohort]}
        gsutil cp $FILEDIR/chromosome-{wildcards.chr}.vcf.gz \
                output/imputed-genotypes/chromosome-{wildcards.chr}.vcf.gz
        gsutil cp $FILEDIR/chromosome-{wildcards.chr}.vcf.gz.tbi \
                output/imputed-genotypes/chromosome-{wildcards.chr}.vcf.gz.tbi
        """

# download PrediXcan EQTL weights
rule download_weights:
    output:
        directory("output/weights")
    shell:
        """
        set -ex
        mkdir -p logs
        mkdir -p output/weights
        # wget https://zenodo.org/record/3518299/files/mashr_eqtl.tar?download=1 \
            -O output/weights/eqtl.tar
        wget https://zenodo.org/record/3519321/files/elastic_net_eqtl.tar?download=1 \
            -O output/weights/eqtl.tar
        tar -C output/weights -xvf output/weights/eqtl.tar
        rm output/weights/eqtl.tar
        mv output/weights/elastic_net_models/* output/weights/
        rm -rf output/weights/elastic_net_models
        """

# generate a table with all model variants
rule model_variant_lookup_table:
    input:
        rules.download_weights.output
    output:
        "output/model-variants-lookup-table.csv.gz"
    shell:
        """
        set -ex
        Rscript scripts/get-model-variants.R
        """

# generate the samples file in the format required by PrediXcan
rule samples_file:
    input:
        "output/imputed-genotypes/chromosome-1.vcf.gz"
    output:
        "output/dosages/samples.txt"
    shell:
        """
        set -ex
        mkdir -p output/dosages
        # query sample ids from first chromosome
        bcftools query -l {input} > output/samples.txt
        # predixcan allows family and sample id, duplicate column and overwrite
        Rscript -e "library(dplyr); readr::read_tsv('output/samples.txt', col_names = 'SID', col_types = 'c') %>% dplyr::mutate(FID = SID) %>% readr::write_tsv(path = 'output/dosages/samples.txt', col_names = FALSE)"
        """

# extract and map dosage information to model variants by position and alleles
# (only works with hg38 inputs, otherwise a liftover needs to be performed!)
rule dosage:
    input:
        vcf     = "output/imputed-genotypes/chromosome-{chr}.vcf.gz",
        index   = "output/imputed-genotypes/chromosome-{chr}.vcf.gz.tbi",
        lookup  = "output/model-variants-lookup-table.csv.gz",
        samples = "output/dosages/samples.txt"
    output:
        "output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
        "output/dosages/unmatched-chromosome-{chr}.csv.gz"
    shell:
        """
        # setup
        set -ex
        mkdir -p output/dosages

        # create file with chromosomal positions for all model variants
        Rscript scripts/get-model-variant-positions.R {wildcards.chr}

        nlines=$(wc -l < "output/model-variant-positions-chromosome-{wildcards.chr}.txt" | bc)
        if [[ $nlines == 0 ]]
        then
            touch "output/dosages/matched-chromosome-{wildcards.chr}.dosage.txt.gz"
            touch "output/dosages/unmatched-chromosome-{wildcards.chr}.csv.gz"
        else
            # filter for model variant positions and extract dosage information
            bcftools query \
                -o "output/dosages/chromosome-{wildcards.chr}.dosage.csv" \
                -R "output/model-variant-positions-chromosome-{wildcards.chr}.txt" \
                -f "%CHROM,%ID,%POS,%REF,%ALT,%INFO/MAF[,%DS]\n\r" \
                {input.vcf}

            # match model variants with extracted dosage
            Rscript scripts/match-variants.R {wildcards.chr}

            # cleanup
            rm "output/dosages/chromosome-{wildcards.chr}.dosage.csv"
        fi

        # cleanup
        rm "output/model-variant-positions-chromosome-{wildcards.chr}.txt"
        """

# build an overview over the found model SNPS per chromosome
rule mapping_report:
    input:
        dosage_files = expand(
            "output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
            chr=range(1,24)
        )
    output:
        "output/mapping-report.html"
    shell:
        """
        set -ex
        Rscript -e "rmarkdown::render('scripts/mapping-report.Rmd', output_dir = 'output', output_file = 'mapping-report.html', knit_root_dir = '$PWD')"
        """

# impute GREx for a specific tissue (sums up the mapped dosages times the
# PrediXcan weights)
rule grex:
    input:
        samples_file = "output/dosages/samples.txt",
        dosage_files = expand(
            "output/dosages/matched-chromosome-{chr}.dosage.txt.gz",
            chr=range(1,24)
        )
    output:
        "output/imputed-grex/{tissue}.igrex.txt.gz"
    shell:
        """
        mkdir -p output/imputed-grex
        predixcan \
            --predict \
            --dosages output/dosages \
            --dosages_prefix matched-chromosome- \
            --samples samples.txt \
            --weights output/weights/en_{wildcards.tissue}.db \
            --output_prefix output/imputed-grex/{wildcards.tissue} \
            --model_variant_id varID
        mv output/imputed-grex/{wildcards.tissue}_predicted_expression.txt \
            output/imputed-grex/{wildcards.tissue}.igrex.txt
        gzip output/imputed-grex/{wildcards.tissue}.igrex.txt
        """