# Snakefile 5.41 KB
# Container image for the workflow, pinned by sha256 digest rather than a
# mutable tag. Snakemake only uses it when invoked with --use-singularity.
singularity: "docker://kkmann/gose-6mo-imputation@sha256:02a8f55c53d6f4917193abda823938e1ad7e8891fd6c392bd94f509e058eb34b"

# Workflow parameters; the rules below read mi_m, mi_maxiter, folds, seed,
# max_lab_days and stan.chains from the resulting `config` dictionary.
configfile: "config.yml"





# Runs scripts/import_neurobot_data.R to create the five raw .rds data frames
# for one data {version}.
# The rule declares no inputs, so Snakemake cannot detect changes in whatever
# files the script reads from data/{version}.
rule import_neurobot_csv:
    output:
        "data/{version}/df_baseline.rds",
        "data/{version}/df_ctmri.rds",
        "data/{version}/df_imaging.rds",
        "data/{version}/df_labs.rds",
        "data/{version}/df_gose.rds"
    shell:
        # data/{version} is passed twice — presumably as the script's input and
        # output directory; confirm against import_neurobot_data.R
        """
        Rscript scripts/import_neurobot_data.R data/{wildcards.version} data/{wildcards.version}
        """





# Renders reports/prepare_data.Rmd for one data {version} and collects the
# artefacts it leaves behind under output/{version}.
# The render receives the data path plus max_lab_days and seed from config.
rule prepare_data:
    input:
        rules.import_neurobot_csv.output,
        markdown = "reports/prepare_data.Rmd"
    output:
        "output/{version}/data/df_gose.rds",
        "output/{version}/data/df_baseline.rds",
        "output/{version}/prepare_data.pdf",
        figures = "output/{version}/prepare_data_figures.zip"
    shell:
        # The mv commands expect the render to leave *.rds files and figures.zip
        # in reports/; they are then moved to the declared output locations.
        # NOTE(review): datapath is given as '../data/...', presumably relative
        # to the reports/ directory in which the Rmd is knit — confirm.
        # NOTE(review): reports/ is a shared scratch location, so two versions
        # rendered concurrently could pick up each other's files — confirm.
        """
        mkdir -p output/{wildcards.version}/data
        Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(datapath = '../data/{wildcards.version}', max_lab_days = {config[max_lab_days]}, seed = {config[seed]}))"
        mv reports/*.rds output/{wildcards.version}/data
        mv reports/figures.zip {output.figures}
        """





# Runs scripts/impute_baseline.R on the prepared baseline data frame and
# expects config["mi_m"] numbered result files in data/mi_baseline.
rule impute_baseline:
    input:
        rules.prepare_data.output
    output:
        # one file per index 1..config["mi_m"]
        [
            "output/{version}/data/mi_baseline/df_baseline_mi_%i.rds" % mi_idx
            for mi_idx in range(1, config["mi_m"] + 1)
        ]
    shell:
        # script arguments: baseline data, output directory, mi_m, mi_maxiter, seed
        """
        mkdir -p output/{wildcards.version}/data/mi_baseline
        Rscript scripts/impute_baseline.R output/{wildcards.version}/data/df_baseline.rds output/{wildcards.version}/data/mi_baseline {config[mi_m]} {config[mi_maxiter]} {config[seed]}
        """





# Runs scripts/generate_validation_data.R to write train/test data sets for
# every combination of imputation index (1..mi_m) and fold (1..folds).
rule generate_validation_data:
    input:
        rules.prepare_data.output,
        rules.impute_baseline.output
    output:
        # 2 * mi_m * folds files: df_<split>_mi_<index>_fold_<fold>.rds
        [
            "output/{version}/data/validation/df_%s_mi_%i_fold_%i.rds" % (split, mi_idx, fold_idx)
            for split in ("train", "test")
            for mi_idx in range(1, config["mi_m"] + 1)
            for fold_idx in range(1, config["folds"] + 1)
        ]
    shell:
        # script arguments: data directory, mi_m, folds, seed
        """
        mkdir -p output/{wildcards.version}/data/validation
        Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
        """


def get_rule_threads(wildcards):
    """Return the thread count for a model-fitting job.

    The "locf" and "msm" models get a single thread; every other model gets
    config["stan"]["chains"] threads.

    Args:
        wildcards: Snakemake wildcards object; only `.model` is read.

    Returns:
        1 for "locf"/"msm", otherwise the configured number of Stan chains.
    """
    single_threaded_models = ("locf", "msm")
    if wildcards.model not in single_threaded_models:
        return config["stan"]["chains"]
    return 1

# Fits one {model} to one training data set (imputation index {i}, fold {j})
# by calling models/{model}/fit.R, which is expected to write the posterior
# file given as its second argument.
rule fit_model_validation_set:
    input:
        # input[0]: listed so that the job depends on the configuration file
        "config.yml",
        # input[1]: the training data handed to fit.R
        "output/{version}/data/validation/df_train_mi_{i}_fold_{j}.rds"
    output:
        "output/{version}/data/validation/posteriors/{model}/df_posterior_mi_{i}_fold_{j}.rds"
    threads:
        # resolved per job from the {model} wildcard
        get_rule_threads
    shell:
        """
        mkdir -p output/{wildcards.version}/data/validation/posteriors/{wildcards.model}
        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
        """




# Convenience target: requests the posterior file of every model for every
# imputation index / fold combination. The data version is fixed to v1.1
# here because a target rule cannot carry wildcards.
rule model_posteriors:
    input:
        [
            "output/v1.1/data/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds" % (model_name, mi_idx, fold_idx)
            for model_name in ("locf", "msm", "gp", "gp_nb", "mm", "mm_nb")
            for mi_idx in range(1, config["mi_m"] + 1)
            for fold_idx in range(1, config["folds"] + 1)
        ]

# Renders reports/model_assessment.Rmd for one data {version}; the report is
# pointed at output/{version}/data and at config.yml.
rule model_assessment:
    input:
        pop_report = rules.prepare_data.output,
        # Posteriors of every model / imputation index / fold for the
        # *requested* {version}. Reusing rules.model_posteriors.input here
        # would pin this to v1.1 regardless of which version is being built.
        posteriors = [
            "output/{version}/data/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds" % (m, i, j)
            for m in ("locf", "msm", "gp", "gp_nb", "mm", "mm_nb")
            for i in range(1, config["mi_m"] + 1)
            for j in range(1, config["folds"] + 1)
        ],
        markdown   = "reports/model_assessment.Rmd"
    output:
        pdf     = "output/{version}/model_assessment.pdf",
        figures = "output/{version}/model_assessment_figures.zip"
    shell:
        # The mv expects the render to leave figures.zip in reports/.
        """
        mkdir -p output/{wildcards.version}
        Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(data_dir = '../output/{wildcards.version}/data', config_file = '../config.yml'))"
        mv reports/figures.zip {output.figures}
        """





# Runs scripts/generate_imputation_data.R to write one full data set per
# imputation index (1..mi_m) into data/imputation.
rule generate_imputation_data:
    input:
        rules.prepare_data.output,
        rules.impute_baseline.output
    output:
        [
            "output/{version}/data/imputation/df_mi_%i.rds" % mi_idx
            for mi_idx in range(1, config["mi_m"] + 1)
        ]
    shell:
        # script arguments: data directory, mi_m
        """
        mkdir -p output/{wildcards.version}/data/imputation
        Rscript scripts/generate_imputation_data.R output/{wildcards.version}/data {config[mi_m]}
        """





# Rules for imputing on the entire data set.
# Calls models/{model}/fit.R on the data set for imputation index {i}; the
# script is expected to write the file given as its second argument.
rule model_impute:
    input:
        # input[0]: listed so that the job depends on the configuration file
        "config.yml",
        # input[1]: the data set handed to fit.R
        "output/{version}/data/imputation/df_mi_{i}.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed_mi_{i}.rds"
    threads:
        # resolved per job from the {model} wildcard
        get_rule_threads
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
        """

# Final reported values are a combination of the imputed and per-protocol
# observed ones.
# Runs scripts/post_process_imputations.R on the directory holding the
# per-index imputation files of one {model} and writes a single CSV.
rule post_process_imputations:
    input:
        "config.yml",
        [
            "output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
            for i in range(1, config["mi_m"] + 1)
        ],
        # The script also reads this data frame; declaring it makes the
        # dependency visible to Snakemake instead of being an untracked path
        # in the shell command.
        gose = "output/{version}/data/df_gose.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
    shell:
        # script arguments: directory with imputed files, df_gose.rds, output CSV
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} {input.gose} {output}
        """