# Workflow-wide settings.
#   configfile  - supplies the `config` dict (output_dir, brain_regions,
#                 min_MAF, min_INFO are read by the rules in this file).
#   singularity - default container image for rules executed in a container.
#   localrules  - rules that run on the submitting host instead of being sent
#                 to the cluster; the names must match rules defined in this
#                 workflow (the samples rule is called `samples_file`).
configfile: "config/parameters.yml"
singularity: "container.sif"
localrules: impute, samples_file



# Aggregate target: requests one imputed gene-expression table for every
# brain region listed in the configuration. It has no action of its own.
rule impute:
    input:
        expand(
            "{output_dir}/imputed-gene-expressions/{region}.expression.txt.gz",
            region = config['brain_regions'],
            output_dir = config['output_dir']
        )


# Download the imputed-genotype VCF of one chromosome (wildcard `i`) with
# gsutil. gsutil must already be installed and configured with credentials
# that can read the bucket.
# The target lives under config["output_dir"] so that it matches the paths
# requested by the rules that consume the downloaded VCF files.
rule download_imputed_genotype_chromosome:
    output:
        config["output_dir"] + "/imputed-genotypes/chromosome-{i}.vcf.gz"
    shell:
        """
        set -ex
        mkdir -p {config[output_dir]}/imputed-genotypes
        FILEDIR=gs://fimm-horizon-outgoing-data/20201002-center-tbi-genetic-data/genome-wide-imputation-data
        gsutil cp "$FILEDIR/chromosome-{wildcards.i}.vcf.gz" {output}
        """


# Convert the imputed VCF of one chromosome (wildcard `i`) into a gzipped,
# space-separated dosage table with one variant per line:
#   CHROM ID POS REF ALT MAF dosage_sample_1 dosage_sample_2 ...
# Variants are dropped when they fall below the configured MAF / INFO
# thresholds, are not SNPs, or are not bi-allelic.
# The output name (<output_dir>/dosages/chr<i>.dosage.txt.gz) is the one
# requested by rule impute_gene_expressions.
rule dosages:
    input:
        config["output_dir"] + "/imputed-genotypes/chromosome-{i}.vcf.gz"
    output:
        config["output_dir"] + "/dosages/chr{i}.dosage.txt.gz"
    shell:
        """
        set -ex
        mkdir -p {config[output_dir]}/dosages
        # pv only reports progress; the table is compressed on the fly and
        # written straight to the declared output, so no uncompressed
        # intermediate file is left behind.
        pv {input} | \
            bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
            bcftools +fill-tags | \
            bcftools query -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\\n' | \
            gzip -c > {output}
        # TODO: convert locations to GTEX v8 by hg38 position
        """


# Write the samples file used for expression prediction: one line per
# sample, tab-separated, with the sample id repeated in both columns
# (the id doubles as the family id), and no header line.
# Sample ids are taken from the chromosome 22 VCF.
# NOTE(review): assumes every chromosome file lists the same samples - confirm.
rule samples_file:
    input:
        expand("{out}/imputed-genotypes/chromosome-22.vcf.gz", out=config["output_dir"])
    output:
        expand("{out}/dosages/samples.txt", out=config["output_dir"])
    shell:
        """
        set -ex
        mkdir -p {config[output_dir]}/dosages
        # list the sample ids and duplicate each one into a second column;
        # writing {output} in one pass avoids re-reading a file that is
        # being overwritten and keeps the path tied to config[output_dir]
        bcftools query -l {input} | \
            awk 'BEGIN {{ OFS = "\\t" }} {{ print $0, $0 }}' > {output}
        """


# Predict gene expression for one brain region (wildcard `region`) with
# predixcan, using the dosage files and samples file found under
# <output_dir>/dosages, then rename and gzip the predicted-expression table
# so that it matches the declared output.
rule impute_gene_expressions:
    input:
        # container image the rule runs in
        "container.sif",
        samples_file = expand(
            "{output_dir}/dosages/samples.txt",
            output_dir = config['output_dir']
        ),
        # one dosage file per autosome (1-22) plus chromosome X
        dosage_files = expand(
            "{output_dir}/dosages/chr{i}.dosage.txt.gz",
            i = [str(n) for n in range(1, 23)] + ['X'],
            output_dir = config['output_dir']
        )
    output:
        "{output_dir}/imputed-gene-expressions/{region}.expression.txt.gz"
    singularity:
        "container.sif"
    shell:
        """
        mkdir -p {wildcards.output_dir}/imputed-gene-expressions
        predixcan \
            --predict \
            --dosages {config[output_dir]}/dosages \
            --dosages_prefix chr \
            --samples samples.txt \
            --weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
            --output_prefix {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}
        mv {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}_predicted_expression.txt \
            {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
        gzip {wildcards.output_dir}/imputed-gene-expressions/{wildcards.region}.expression.txt
        """
