Commit 38279fd2 authored by Kevin

major snakemake refactor

parent 94d21ac2
...@@ -2,194 +2,34 @@ singularity: "docker://kkmann/gose-6mo-imputation@sha256:85724229d8f4243aaebd622 ...@@ -2,194 +2,34 @@ singularity: "docker://kkmann/gose-6mo-imputation@sha256:85724229d8f4243aaebd622
configfile: "config.yml" configfile: "config.yml"
include: "rules/fit_model_validation.rule"
include: "rules/final_imputation_report.rule"
# Download the five raw tables of one data release. The {version} wildcard
# selects which scripts/download_<version>.sh script is run.
rule download_data:
    output:
        expand(
            "data/{{version}}/df_{table}.rds",
            table=["baseline", "ctmri", "imaging", "labs", "gose"]
        )
    shell:
        """
        bash scripts/download_{wildcards.version}.sh
        """
# Target rule without wildcards: requests all raw tables under data/v1.1/.
rule download_data_v1_1:
    input:
        expand(
            "data/v1.1/df_{table}.rds",
            table=["baseline", "ctmri", "imaging", "labs", "gose"]
        )
# Render reports/prepare_data.Rmd for one data release.
# The report is rendered into output/{version}; afterwards the .rds files and
# the figures archive that end up in reports/ are moved to their declared
# output locations.
rule prepare_data:
    input:
        rules.download_data.output,
        markdown = "reports/prepare_data.Rmd"
    output:
        "output/{version}/data/df_gose.rds",
        "output/{version}/data/df_baseline.rds",
        "output/{version}/prepare_data.html",
        figures = "output/{version}/prepare_data_figures.zip"
    shell:
        """
        mkdir -p output/{wildcards.version}/data
        Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/{wildcards.version}\\", params = list(datapath = \\"../data/{wildcards.version}\\", max_lab_days = {config[max_lab_days]}, seed = {config[seed]}, age_min = {config[age_min]}, age_max = {config[age_max]}))"
        mv reports/*.rds output/{wildcards.version}/data
        mv reports/figures.zip {output.figures}
        """
# Run scripts/impute_baseline.R on the prepared baseline table.
# One output file is declared per imputation index 1..config["mi_m"].
rule impute_baseline:
    input:
        rules.prepare_data.output
    output:
        expand(
            "output/{{version}}/data/mi_baseline/df_baseline_mi_{i}.rds",
            i=range(1, config["mi_m"] + 1)
        )
    shell:
        """
        mkdir -p output/{wildcards.version}/data/mi_baseline
        Rscript scripts/impute_baseline.R output/{wildcards.version}/data/df_baseline.rds output/{wildcards.version}/data/mi_baseline {config[mi_m]} {config[mi_maxiter]} {config[seed]}
        """
# Run scripts/generate_validation_data.R on the prepared and baseline-imputed
# data. A train and a test file is declared for every combination of
# imputation index (1..config["mi_m"]) and fold (1..config["folds"]).
rule generate_validation_data:
    input:
        rules.prepare_data.output,
        rules.impute_baseline.output
    output:
        expand(
            "output/{{version}}/data/validation/df_{s}_mi_{i}_fold_{j}.rds",
            s=("train", "test"),
            i=range(1, config["mi_m"] + 1),
            j=range(1, config["folds"] + 1)
        )
    shell:
        """
        mkdir -p output/{wildcards.version}/data/validation
        Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
        """
# adjust threads by model type
def get_rule_threads(wildcards):
    """Return the thread count to request for a model-fitting job.

    Args:
        wildcards: the job's wildcards object; only ``wildcards.model`` is read.

    Returns:
        1 for the models "locf", "msm" and "msm_age"; otherwise the value of
        ``config["stan"]["chains"]``.
    """
    single_threaded_models = {"locf", "msm", "msm_age"}
    if wildcards.model in single_threaded_models:
        return 1
    # All remaining models get as many threads as there are configured chains.
    return config["stan"]["chains"]
# Fit one model on one training fold of one imputed data set.
# input[0] is config.yml (listed so that config changes trigger a re-run);
# input[1], the training data, is what gets passed to models/{model}/fit.R
# together with the output path.
rule fit_model_validation_set:
    input:
        "config.yml",
        "output/{version}/data/validation/df_train_mi_{i}_fold_{j}.rds"
    output:
        "output/{version}/data/validation/posteriors/{model}/df_posterior_mi_{i}_fold_{j}.rds"
    threads:
        get_rule_threads
    shell:
        """
        mkdir -p output/{wildcards.version}/data/validation/posteriors/{wildcards.model}
        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
        """
# NOTE(review): the first six lines below appear to be rows of a side-by-side
# diff in which two columns were fused onto one line. The left column looks
# like the removed rule `generate_imputation_data`; the right column looks like
# the start of an added rule `create_manuscript_v1_1`. Confirm against the
# repository before treating any of this as runnable Snakemake.
# Left column: runs scripts/generate_imputation_data.R and declares one
# df_mi_<i>.rds output per imputation index 1..config["mi_m"].
# rules for imputing on entire dataset rule create_manuscript_v1_1:
rule generate_imputation_data:
input: input:
rules.prepare_data.output, pop_report = rules.prepare_data_v1_1.output,
rules.impute_baseline.output posteriors = rules.fit_models_validation_v1_1.input,
output: markdown = "manuscript/manuscript.Rmd"
["output/{version}/data/imputation/df_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1)
]
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation
Rscript scripts/generate_imputation_data.R output/{wildcards.version}/data {config[mi_m]}
"""
# Fit one model on one complete (non-fold) imputed data set.
# input[0] is config.yml; input[1], the data file, is handed to
# models/{model}/fit.R together with the output path.
rule model_impute:
    input:
        "config.yml",
        "output/{version}/data/imputation/df_mi_{i}.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed_mi_{i}.rds"
    threads:
        get_rule_threads
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
        """
# final reported values are a combination of the imputed and per-protocol
# observed ones
#
# Combines the per-imputation model results of one model into a single CSV by
# running scripts/post_process_imputations.R.
rule post_process_imputations:
    input:
        "config.yml",
        ["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
            for i in range(1, config["mi_m"] + 1)
        ],
        "data/{version}/df_baseline.rds",
        # The script is also handed this file on its command line, so it is
        # declared here to make the dependency explicit to Snakemake.
        "output/{version}/data/df_gose.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds data/{wildcards.version}/df_baseline.rds {output}
        """
# NOTE(review): several lines below appear to be rows of a side-by-side diff
# with two columns fused onto one line. The left column looks like the removed
# rule `imputation_report` (renders reports/imputations.Rmd and moves the HTML
# and figures archive to the declared outputs); the right column looks like the
# output/shell part of an added manuscript rule. Confirm against the repository.
# NOTE(review): in the left column the `imputations` parameter uses a literal
# "v1.1" path while `data_dir` uses {wildcards.version} - verify this is intended.
rule imputation_report:
input:
"config.yml",
rules.post_process_imputations.output,
markdown = "reports/imputations.Rmd"
output: output:
html = "output/{version}/gose_imputations_{model}.html", pdf = "output/v1.1/manuscript.docx",
figures = "output/{version}/gose_imputations_{model}_figures.zip" figures = "output/v1.1/manuscript_figures.zip"
shell: shell:
""" """
mkdir -p output/{wildcards.version} mkdir -p output/v1.1
Rscript -e "rmarkdown::render(\\"{input.markdown}\\", params = list(data_dir = \\"../output/{wildcards.version}/data\\", imputations = \\"../output/v1.1/data/imputation/{wildcards.model}/df_gose_imputed.csv\\"))" Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/v1.1\\", params = list(data_dir = \\"../output/v1.1/data\\", config_file = \\"../config.yml\\"))"
mv reports/imputations.html {output.html} mv manuscript/figures.zip {output.figures}
mv reports/figures.zip {output.figures}
""" """
# Wildcard-free target rule for convenience: requests the imputation report
# and its figures archive for the "msm" model on release v1.1.
rule impute_msm_v1_1:
    input:
        html = "output/v1.1/gose_imputations_msm.html",
        figures = "output/v1.1/gose_imputations_msm_figures.zip"
# Helper target: requests the posterior file of every listed model for every
# imputation index (1..config["mi_m"]) and fold (1..config["folds"]) on v1.1.
rule model_posteriors_v1_1:
    input:
        expand(
            "output/v1.1/data/validation/posteriors/{m}/df_posterior_mi_{i}_fold_{j}.rds",
            m=("locf", "msm", "msm_age", "gp", "gp_nb", "mm", "mm_nb"),
            i=range(1, config["mi_m"] + 1),
            j=range(1, config["folds"] + 1)
        )
# NOTE(review): the first four lines below appear to be rows of a side-by-side
# diff with two columns fused onto one line. The left column looks like the
# removed rule `manuscript_v1_1` (renders manuscript/manuscript.Rmd into
# output/v1.1 and moves the figures archive); the right column looks like an
# added target rule `impute_population_wide_msm_v1_1`. Confirm against the
# repository.
# NOTE(review): the outputs of `manuscript_v1_1` contain no {version} wildcard,
# yet its shell block refers to {wildcards.version}; verify whether a literal
# "output/v1.1" was intended there.
rule manuscript_v1_1: rule impute_population_wide_msm_v1_1:
input: input:
pop_report = rules.prepare_data.output, "output/v1.1/gose_imputations_msm.html",
posteriors = rules.model_posteriors_v1_1.input, "output/v1.1/gose_imputations_msm_figures.zip"
markdown = "manuscript/manuscript.Rmd"
output:
pdf = "output/v1.1/manuscript.docx",
figures = "output/v1.1/manuscript_figures.zip"
shell:
"""
mkdir -p output/{wildcards.version}
Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/v1.1\\", params = list(data_dir = \\"../output/v1.1/data\\", config_file = \\"../config.yml\\"))"
mv manuscript/figures.zip {output.figures}
"""
# Obtain the raw tables (baseline, ctmri, imaging, labs, gose) of one data
# release by running scripts/download_<version>.sh.
rule download_data:
    output:
        expand(
            "data/{{version}}/df_{name}.rds",
            name=("baseline", "ctmri", "imaging", "labs", "gose")
        )
    shell:
        """
        bash scripts/download_{wildcards.version}.sh
        """
include: "post_process_final_imputations.rule"
# Render reports/imputations.Rmd for one model ({model}) and data release
# ({version}), then move the rendered HTML and the figures archive from
# reports/ to the declared output locations.
rule final_imputation_report:
    input:
        "config.yml",
        rules.post_process_final_imputations.output,
        markdown = "reports/imputations.Rmd"
    output:
        html = "output/{version}/gose_imputations_{model}.html",
        figures = "output/{version}/gose_imputations_{model}_figures.zip"
    shell:
        # The imputations path follows the {version} wildcard (it used to be
        # pinned to v1.1) so the report reads the CSV of the release it is
        # built for, matching data_dir and the declared input.
        """
        mkdir -p output/{wildcards.version}
        Rscript -e "rmarkdown::render(\\"{input.markdown}\\", params = list(data_dir = \\"../output/{wildcards.version}/data\\", imputations = \\"../output/{wildcards.version}/data/imputation/{wildcards.model}/df_gose_imputed.csv\\"))"
        mv reports/imputations.html {output.html}
        mv reports/figures.zip {output.figures}
        """
include: "generate_input_data_for_population_wide_imputation.rule"
# Fit one model on one complete imputed data set.
# input[0] is config.yml; only input[1], the data file, is passed to
# models/{model}/fit.R, followed by the output path.
# Thread count is chosen per model by get_rule_threads.
rule fit_model_population_wide:
    input:
        "config.yml",
        "output/{version}/data/imputation/df_mi_{i}.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed_mi_{i}.rds"
    threads:
        get_rule_threads
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
        """
# final reported values are a combination of the imputed and per-protocol
# observed ones
#
# Runs scripts/post_process_imputations.R to merge the per-imputation results
# of one model into a single CSV.
# NOTE(review): the rule `post_process_final_imputations` declares the same
# output pattern; if both rules are loaded into one workflow Snakemake will
# treat them as ambiguous - confirm only one of them is included.
rule post_process_imputations:
    input:
        "config.yml",
        ["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
            for i in range(1, config["mi_m"] + 1)
        ],
        "data/{version}/df_baseline.rds",
        # Passed to the script on the command line below, so declare it as an
        # input rather than relying on it existing by coincidence.
        "output/{version}/data/df_gose.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds data/{wildcards.version}/df_baseline.rds {output}
        """
include: "generate_folds.rule"
# adjust threads by model type
def get_rule_threads(wildcards):
    """Pick the number of threads for a model-fitting job.

    Args:
        wildcards: the job's wildcards object; only ``wildcards.model`` is read.

    Returns:
        1 when the model is "locf", "msm" or "msm_age"; for any other model
        the value stored under ``config["stan"]["chains"]``.
    """
    if wildcards.model in ("locf", "msm", "msm_age"):
        return 1
    # Every other model is given one thread per configured chain.
    return config["stan"]["chains"]
# Fit one model on the training part of one fold of one imputed data set and
# store the result under validation/posteriors/{model}/.
# input[0] is config.yml; input[1], the training data, is the file handed to
# models/{model}/fit.R.
rule fit_validation_model:
    input:
        "config.yml",
        "output/{version}/data/validation/df_train_mi_{i}_fold_{j}.rds"
    output:
        "output/{version}/data/validation/posteriors/{model}/df_posterior_mi_{i}_fold_{j}.rds"
    threads:
        get_rule_threads
    shell:
        """
        mkdir -p output/{wildcards.version}/data/validation/posteriors/{wildcards.model}
        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
        """
# Helper target for release v1.1: requests the posterior file for each listed
# model, each imputation index (1..config["mi_m"]) and each fold
# (1..config["folds"]).
rule fit_models_validation_v1_1:
    input:
        expand(
            "output/v1.1/data/validation/posteriors/{m}/df_posterior_mi_{i}_fold_{j}.rds",
            m=("locf", "msm", "msm_age", "gp", "gp_nb", "mm", "mm_nb"),
            i=range(1, config["mi_m"] + 1),
            j=range(1, config["folds"] + 1)
        )
include: "impute_baseline.rule"
# Create the train/test files used for validation by running
# scripts/generate_validation_data.R. Outputs cover every combination of
# split ("train"/"test"), imputation index (1..config["mi_m"]) and fold
# (1..config["folds"]).
rule generate_folds:
    input:
        rules.prepare_data.output,
        rules.impute_baseline.output
    output:
        expand(
            "output/{{version}}/data/validation/df_{s}_mi_{i}_fold_{j}.rds",
            s=("train", "test"),
            i=range(1, config["mi_m"] + 1),
            j=range(1, config["folds"] + 1)
        )
    shell:
        """
        mkdir -p output/{wildcards.version}/data/validation
        Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
        """
# Wildcard-free target: requests all train/test fold files for release v1.1.
rule generate_folds_v1_1:
    input:
        expand(
            "output/v1.1/data/validation/df_{s}_mi_{i}_fold_{j}.rds",
            s=("train", "test"),
            i=range(1, config["mi_m"] + 1),
            j=range(1, config["folds"] + 1)
        )
include: "impute_baseline.rule"
# rules for imputing on entire dataset
#
# Runs scripts/generate_imputation_data.R on the prepared and baseline-imputed
# data; one df_mi_<i>.rds file is declared per imputation index
# 1..config["mi_m"].
rule generate_input_data_for_population_wide_imputation:
    input:
        rules.prepare_data.output,
        rules.impute_baseline.output
    output:
        expand(
            "output/{{version}}/data/imputation/df_mi_{i}.rds",
            i=range(1, config["mi_m"] + 1)
        )
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation
        Rscript scripts/generate_imputation_data.R output/{wildcards.version}/data {config[mi_m]}
        """
# helper rule to just build the input data sets for population-wide
# imputation on release v1.1 (one file per imputation index 1..config["mi_m"])
rule generate_input_data_for_population_wide_imputation_v1_1:
    input:
        # The release directory is "v1.1", as in every other *_v1_1 target
        # rule; "v1_1" would resolve the {version} wildcard to a release name
        # that none of the other targets use.
        ["output/v1.1/data/imputation/df_mi_%i.rds" % i
            for i in range(1, config["mi_m"] + 1)
        ]
include: "prepare_data.rule"
# Impute the baseline table with scripts/impute_baseline.R. The script
# receives the baseline file, the target directory and the config values
# mi_m, mi_maxiter and seed; one output is declared per index 1..config["mi_m"].
rule impute_baseline:
    input:
        rules.prepare_data.output
    output:
        expand(
            "output/{{version}}/data/mi_baseline/df_baseline_mi_{i}.rds",
            i=range(1, config["mi_m"] + 1)
        )
    shell:
        """
        mkdir -p output/{wildcards.version}/data/mi_baseline
        Rscript scripts/impute_baseline.R output/{wildcards.version}/data/df_baseline.rds output/{wildcards.version}/data/mi_baseline {config[mi_m]} {config[mi_maxiter]} {config[seed]}
        """
# Wildcard-free target: requests all imputed baseline files for release v1.1.
rule impute_baseline_v1_1:
    input:
        expand(
            "output/v1.1/data/mi_baseline/df_baseline_mi_{i}.rds",
            i=range(1, config["mi_m"] + 1)
        )
include: "fit_model_population_wide.rule"
# Merge the per-imputation results of one model into a single CSV by running
# scripts/post_process_imputations.R.
rule post_process_final_imputations:
    input:
        "config.yml",
        ["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
            for i in range(1, config["mi_m"] + 1)
        ],
        "data/{version}/df_baseline.rds",
        # The script is given this file on its command line; declaring it
        # makes the dependency visible to Snakemake.
        "output/{version}/data/df_gose.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds data/{wildcards.version}/df_baseline.rds {output}
        """
include: "download.rule"
# Build the prepared data sets and the accompanying HTML report for one data
# release by rendering reports/prepare_data.Rmd.
# Config values max_lab_days, seed, age_min and age_max are forwarded as
# report parameters. After rendering, the .rds files and figures.zip found in
# reports/ are moved to the declared output paths.
rule prepare_data:
    input:
        rules.download_data.output,
        markdown = "reports/prepare_data.Rmd"
    output:
        "output/{version}/data/df_gose.rds",
        "output/{version}/data/df_baseline.rds",
        "output/{version}/prepare_data.html",
        figures = "output/{version}/prepare_data_figures.zip"
    shell:
        """
        mkdir -p output/{wildcards.version}/data
        Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/{wildcards.version}\\", params = list(datapath = \\"../data/{wildcards.version}\\", max_lab_days = {config[max_lab_days]}, seed = {config[seed]}, age_min = {config[age_min]}, age_max = {config[age_max]}))"
        mv reports/*.rds output/{wildcards.version}/data
        mv reports/figures.zip {output.figures}
        """
# Wildcard-free target: requests the data-preparation report for release v1.1.
rule prepare_data_v1_1:
    input:
        "output/v1.1/prepare_data.html"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment