Snakefile 4.73 KB
# edit configuration here (see cluster.json for running on slurm scheduler)
configfile: "config.yml"

localrules: impute, clean, download_container, generate_samples_file



# this rule runs the entire pipeline and creates a R data set file (rds)
# containing all imputed expression values
rule impute:
    input:
        "container.sif",
        expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt.gz", region = config['brain_regions'], output_dir = config['output_dir'])

# delete output and logs (if run on slurm cluster)
rule clean:
    shell:
        """
        rm -rf {config[output_dir]}
        rm -rf logs
        rm -rf nohup.out
        """

# download vcf files using gsutil (needs to be set up and configured!
rule download_imputed_genotype_chromosome:
    output:
        "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz"
    singularity:
        "container.sif"
    shell:
        """
        mkdir -p output/imputed-genotypes
        gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_191014/Imputed_data/CENTER.TBI_imputed_N3741_hg19_HRC_MAC1_freeze_191014_chr1.vcf.gz \
            output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{wildcards.i}.vcf.gz
        """

rule download_imputed_genotypes:
    input:
        expand("output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz",
            i = list(map(str, range(1, 23))) + ['X']
        )

# download the required singularity container from zenodo.org
rule download_container:
    output:
        "container.sif"
    shell:
        """
        bash scripts/download_container.sh
        """

# extract dosages in custom PrediXcan format from vcf file
rule vcf_to_dosages:
    input:
        "container.sif",
        vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz"
    output:
        "{output_dir}/dosages/chr{i}.dosage.txt.gz"
    singularity:
        "container.sif"
    shell:
        """
        export prefix={wildcards.output_dir}/dosages
        mkdir -p $prefix
        echo "decompress and filter out: 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' ..."
        bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' {input.vcf_gz_file} > $prefix/chr{wildcards.i}_.vcf
        echo "add MAFs ..."
        bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
        rm $prefix/chr{wildcards.i}_.vcf
        echo 'querying dosages ...'
        bcftools query -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
        rm $prefix/chr{wildcards.i}.vcf
        echo 'compressing ...'
        gzip $prefix/chr{wildcards.i}.dosage.txt
        printf "done.\n\r\n\r"
        """

# extract sample file for PrediXcan
rule generate_samples_file:
    input:
        "container.sif",
        vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr1.vcf.gz"
    output:
        "{output_dir}/dosages/samples.txt"
    singularity:
        "container.sif"
    params:
        format = "\'{ printf(\"%s %s\\n\", $1, $1); }\'"
    shell:
        """
        export prefix={wildcards.output_dir}/dosages
        mkdir -p $prefix
        bcftools query -l {input.vcf_gz_file} >> $prefix/samples_.txt
        # family ID = individual ID
        awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
        rm $prefix/samples_.txt
        """

# run PrediXcan to impute gene expression for individual tissue type
rule impute_gene_expressions:
    input:
        "container.sif",
        samples_file = expand("{output_dir}/dosages/samples.txt", output_dir = config['output_dir']),
        dosage_files = expand("{output_dir}/dosages/chr{i}.dosage.txt.gz",
            i = list(map(str, range(1, 23))) + ['X'],
            output_dir = config['output_dir']
        )
    output:
        "{output_dir}/imputed-gene-expressions/{region}.expression.txt.gz"
    singularity:
        "container.sif"
    shell:
        """
        mkdir -p {wildcards.output_dir}/imputed-gene-expressions
        predixcan \
            --predict \
            --dosages {config[output_dir]}/dosages \
            --dosages_prefix chr \
            --samples samples.txt \
            --weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
            --output_prefix {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}
        mv {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}_predicted_expression.txt \
            {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
        gzip {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
        """