configfile:  "config/parameters.yml"
singularity: "container.sif"
localrules:  data


# download vcf files using gsutil (needs to be set up and configured!) and
# variant id <-> position mapping for GTEx v8 (based on hg38p7 assembly)
rule data:
    output:
        expand(
            "{output_dir}/imputed-genotypes/chromosome-{i}.vcf.gz",
            output_dir=config["output_dir"],
            i=range(1,24)
        ),
        expand(
            "{output_dir}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz",
            output_dir=config["output_dir"]
        )
    shell:
        """
        set -ex
        mkdir -p {config[output_dir]}/imputed-genotypes
        FILEDIR=gs://fimm-horizon-outgoing-data/genetic-data/imputed-genomes/20200306/all-acgm-filtered
        for i in {{1..23}}
        do
            gsutil cp $FILEDIR/chromosome-$i.vcf.gz \
                {config[output_dir]}/imputed-genotypes/chromosome-$i.vcf.gz
        done
        # download official GTEx version 8 mapping of variants to hg38 positions
        # from https://gtexportal.org/home/datasets, 'reference', 2020-03-06
        wget https://storage.googleapis.com/gtex_analysis_v8/reference/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz \
            -O {config[output_dir]}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz
        """


rule dosage:
    input:
        "{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
        "{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
    output:
        "{out}/dosages/chromosome-{i}.dosage.txt.gz"
    shell:
        """
        set -ex
        mkdir -p {config[output_dir]}/dosages
        # filter for SNPs of defined quality and extract dosage
        pv {input} | \
            bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
            bcftools +fill-tags | \
            bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
            {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt
        # compress
        gzip {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt
        # convert locations to GTEx v8 by hg38 position
        """

# compute dosage for all 23 chromosomes
rule dosages:
    input:
        rules.data.output,
        expand(
            "{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
            out=config["output_dir"],
            i=range(1,24)
        )



rule samples_file:
    input:
        expand("{out}/imputed-genotypes/chromosome-22.vcf", out=config["output_dir"])
    output:
        expand("{out}/dosages/samples.txt", out=config["output_dir"])
    shell:
        """
        set -ex
        mkdir -p {config[output_dir]}/dosages
        # query sample ids from first chromosome
        bcftools query -l {input} >> {config[output_dir]}/dosages/samples.txt
        # predixcan allows family and sample id, duplicate column and overwrite
        Rscript -e "library(dplyr); readr::read_tsv('output/dosages/samples.txt', col_names = 'SID', col_types = 'c') %>% dplyr::mutate(FID = SID) %>% readr::write_tsv(path = 'output/dosages/samples.txt', col_names = FALSE)"
        """


rule grex:
    input:
        samples_file = expand(
            "{output_dir}/dosages/samples.txt",
            output_dir = config['output_dir']
        ),
        dosage_files = expand(
            "{output_dir}/dosages/chromosome-{i}.dosage.txt.gz",
            i=range(1,24) ,
            output_dir=config["output_dir"]
        )
    output:
        "{output_dir}/imputed-grex/{region}.igrex.txt.gz"
    shell:
        """
        mkdir -p {wildcards.output_dir}/imputed-grex
        predixcan \
            --predict \
            --dosages {config[output_dir]}/dosages \
            --dosages_prefix chromosome- \
            --samples samples.txt \
            --weights /usr/predixcan/GTEx-V7_HapMap-2017-11-29/gtex_v7_Brain_{wildcards.region}_imputed_europeans_tw_0.5_signif.db \
            --output_prefix {wildcards.output_dir}/imputed-grex/{wildcards.region}
        mv {wildcards.output_dir}/imputed-grex/{wildcards.region}_predicted_expression.txt \
            {wildcards.output_dir}/imputed-grex/{wildcards.region}.igrex.txt
        gzip {wildcards.output_dir}/imputed-grex/{wildcards.region}.igrex.txt
        """
