Snakefile 4.69 KB
# edit configuration here (see cluster.json for running on slurm scheduler)
configfile: "config.yml"

localrules: impute, clean, download_container, generate_samples_file



# this rule runs the entire pipeline and creates a R data set file (rds)
# containing all imputed expression values
rule impute:
    input:
        "container.sif",
        expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt", region = config['brain_regions'], output_dir = config['output_dir'])
    output:
        expand("{output_dir}/imputed-gene-expressionss_combined.rds", output_dir = config['output_dir'])
    singularity:
        "container.sif"
    shell:
        """
        Rscript scripts/combine_expression_data.R
        """

# delete output and logs (if run on slurm cluster)
rule clean:
    shell:
        """
        rm -rf {config[output_dir]}
        rm -rf logs
        rm -rf nohup.out
        """

# download vcf files using gsutil (needs to be set up and configured!
rule download_imputed_genotype_chromosome:
    output:
        "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
    singularity:
        "container.sif"
    shell:
        """
        mkdir -p output/imputed-genotypes
        gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_190829/Imputed_data/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz \
            output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{wildcards.i}.vcf.gz
        """
rule download_imputed_genotypes:
    input:
        expand("output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz",
            i = list(map(str, range(1, 23))) + ['X']
        )

# download the required singularity container from zenodo.org
rule download_container:
    output:
        "container.sif"
    shell:
        """
        bash scripts/download_container.sh
        """

# extract dosages in custom PrediXcan format from vcf file
rule vcf_to_dosages:
    input:
        "container.sif",
        vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr{i}.vcf.gz"
    output:
        "{output_dir}/dosages/chr{i}.dosage.txt.gz"
    singularity:
        "container.sif"
    shell:
        """
        export prefix={wildcards.output_dir}/dosages
        mkdir -p $prefix
        echo "extracting and computing MAFs ..."
        bcftools +fill-tags {inputs.vcf_gz_file} > $prefix/chr{wildcards.i}.vcf
        echo 'querying dosages ...'
        bcftools query -e 'MAF[0]>{config[min_MAF]} | INFO>{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
        echo 'compressing ...'
        gzip $prefix/chr{wildcards.i}.dosage.txt
        rm $prefix/chr{wildcards.i}.vcf
        printf "done.\n\r\n\r"
        """

# extract sample file for PrediXcan
rule generate_samples_file:
    input:
        "container.sif",
        vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_3695_1K_MAC1_freeze_190829_chr1.vcf.gz"
    output:
        "{output_dir}/dosages/samples.txt"
    singularity:
        "container.sif"
    params:
        format = "\'{ printf(\"%s %s\\n\", $1, $1); }\'"
    shell:
        """
        export prefix={wildcards.output_dir}/dosages
        mkdir -p $prefix
        bcftools query -l {inputs.vcf_gz_file} >> $prefix/samples_.txt
        # family ID = individual ID
        awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
        rm $prefix/samples_.txt
        """

# run PrediXcan to impute gene expression for individual tissue type
rule impute_gene_expressions:
    input:
        "container.sif",
        samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
        dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz", i = range(1, 24), output_dir = config['output_dir'])
    output:
        "{output_dir}/imputed-gene-expressions/{region}.expression.txt"
    singularity:
        "container.sif"
    shell:
        """
        mkdir -p {wildcards.output_dir}/imputed-gene-expressions
        predixcan \
            --predict \
            --dosages {config[output_dir]}/dosages \
            --dosages_prefix chr \
            --samples samples.txt \
            --weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
            --output_prefix {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}
        mv {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}_predicted_expression.txt \
            {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
        """