Commit 725f4ca4 authored by Kevin's avatar Kevin

extended generation of imputed values to produce a file with one row per individual

parent 85ae8228
...@@ -190,13 +190,14 @@ rule post_process_imputations: ...@@ -190,13 +190,14 @@ rule post_process_imputations:
"config.yml", "config.yml",
["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i ["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1) for i in range(1, config["mi_m"] + 1)
] ],
"data/{version}/df_baseline.rds"
output: output:
"output/{version}/data/imputation/{model}/df_gose_imputed.csv" "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
shell: shell:
""" """
mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model} mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output} Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds data/{wildcards.version}/df_baseline.rds {output}
""" """
rule imputation_report: rule imputation_report:
......
...@@ -16,7 +16,7 @@ git-commit-hash: "`r system('git rev-parse --verify HEAD', intern=TRUE)`" ...@@ -16,7 +16,7 @@ git-commit-hash: "`r system('git rev-parse --verify HEAD', intern=TRUE)`"
git-wd-clean: "`r ifelse(system('git diff-index --quiet HEAD') == 0, 'clean', 'file changes, working directory not clean!')`" git-wd-clean: "`r ifelse(system('git diff-index --quiet HEAD') == 0, 'clean', 'file changes, working directory not clean!')`"
params: params:
data_dir: "../output/v1.1/data" data_dir: "../output/v1.1/data"
imputations: "../output/v1.1/data/imputation/msm/df_gose_imputed.csv" imputations: "../output/v1.1/data/imputation/msm/df_gose_imputed.csv"
config_file: "../config.yml" config_file: "../config.yml"
--- ---
...@@ -38,6 +38,7 @@ Since we do not use the raw imputed values but give preference to the per ...@@ -38,6 +38,7 @@ Since we do not use the raw imputed values but give preference to the per
protocol values (when a derived composite GOSE is available within 5-8 months protocol values (when a derived composite GOSE is available within 5-8 months
after injury), we start by comparing the final combined version with the raw after injury), we start by comparing the final combined version with the raw
imputations. imputations.
Note that this plot also includes the confirmed deaths before 6 months!
```{r, fig.height=7, fig.width=7} ```{r, fig.height=7, fig.width=7}
caret::confusionMatrix( caret::confusionMatrix(
...@@ -84,8 +85,8 @@ df_per_protocol_gose <- df_gose %>% ...@@ -84,8 +85,8 @@ df_per_protocol_gose <- df_gose %>%
``` ```
Overall, only `r nrow(df_per_protocol_gose)` six-months GOSE are observed, Overall, only `r nrow(df_per_protocol_gose)` six-months GOSE are observed,
i.e., `r nrow(df_imputations) - nrow(df_per_protocol_gose)` model-based values i.e., `r nrow(df_imputations %>% filter(complete.cases(.) & Subject.DerivedImputed180DaysGOSE > 1)) - nrow(df_per_protocol_gose)`
are used. model-based values are used.
```{r, fig.height=8} ```{r, fig.height=8}
df_posteriors <- df_imputations %>% df_posteriors <- df_imputations %>%
......
...@@ -4,7 +4,8 @@ args <- commandArgs(trailingOnly = TRUE) ...@@ -4,7 +4,8 @@ args <- commandArgs(trailingOnly = TRUE)
modelimputations_dir <- args[[1]] modelimputations_dir <- args[[1]]
gosefile <- args[[2]] gosefile <- args[[2]]
outputfile <- args[[3]] baseline_raw <- args[[3]]
outputfile <- args[[4]]
set.seed(42) set.seed(42)
...@@ -89,4 +90,37 @@ df_imputations <- df_imputations %>% ...@@ -89,4 +90,37 @@ df_imputations <- df_imputations %>%
-closest_per_protocol_GOSE -closest_per_protocol_GOSE
) )
write_csv(df_imputations, outputfile)
# Collect the ids (gupi) of everybody whose recorded death date lies at most
# 6 * 30 days after the reference date "1970-01-01".
# NOTE(review): presumably dates are shifted so that 1970-01-01 marks the
# time of injury - confirm against the data documentation.
# The right join keeps every subject of the baseline file, including those
# without any row in df_imputations.
death_gupis <- right_join(
  df_imputations,
  select(read_rds(baseline_raw), gupi, Subject.DeathDate),
  by = "gupi"
) %>%
  filter(
    # a missing death date yields NA here; filter() drops those rows
    as.numeric(
      difftime(Subject.DeathDate, "1970-01-01", units = "days")
    ) <= 6*30
  ) %>%
  pull(gupi)
# Build a one-row tibble for a single individual, using the column names of
# the global `df_imputations`.
#
# The values are assigned to `names(df_imputations)` by position: the id
# first, then 1L, 1L, 1 and seven zeros.
# NOTE(review): presumably this encodes "GOSE 1 with probability one" for a
# confirmed death - verify against the column layout of df_imputations, which
# must have exactly 11 columns in the matching order.
#
# gupi: id of the individual.
# Returns a tibble with one row; spread() orders the columns by name.
get_row <- function(gupi) {
  fixed_values <- list(gupi, 1L, 1L, 1, 0, 0, 0, 0, 0, 0, 0)
  long_row <- tibble(
    var = names(df_imputations),
    val = fixed_values
  )
  # reshape to one column per variable, then flatten the list columns
  wide_row <- spread(long_row, var, val)
  unnest(wide_row)
}
# Complete data set with missings: one row per individual in the baseline
# file. Rows of individuals listed in `death_gupis` are removed from the
# imputations and replaced by the fixed rows produced by get_row().
# Individuals who end up with no row on the right-hand side keep NA in all
# columns other than gupi (left join).
df_final <- read_rds(baseline_raw) %>%
  select(gupi) %>%
  left_join(
    df_imputations %>%
      filter(!(gupi %in% death_gupis)) %>%
      rbind(
        do.call(rbind, lapply(death_gupis, get_row))
      ),
    # state the key explicitly instead of relying on the implicit natural join
    by = "gupi"
  ) %>%
  arrange(gupi)

write_csv(df_final, outputfile)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment