Commit 725f4ca4 authored by Kevin's avatar Kevin

extended generation of imputed values to produce a file with one row per individual

parent 85ae8228
......@@ -190,13 +190,14 @@ rule post_process_imputations:
"config.yml",
["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1)
]
],
"data/{version}/df_baseline.rds"
output:
"output/{version}/data/imputation/{model}/df_gose_imputed.csv"
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output}
Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds data/{wildcards.version}/df_baseline.rds {output}
"""
rule imputation_report:
......
......@@ -38,6 +38,7 @@ Since we do not use the raw imputed values but give preference to the per
protocol values (when a derived composite GOSE is available within 5-8 months
after injury), we start by comparing the final combined version with the raw
imputations.
Note that this plot also includes the confirmed deaths before 6 months!
```{r, fig.height=7, fig.width=7}
caret::confusionMatrix(
......@@ -84,8 +85,8 @@ df_per_protocol_gose <- df_gose %>%
```
Overall, only `r nrow(df_per_protocol_gose)` six-months GOSE are observed,
i.e., `r nrow(df_imputations) - nrow(df_per_protocol_gose)` model-based values
are used.
i.e., `r nrow(df_imputations %>% filter(complete.cases(.) & Subject.DerivedImputed180DaysGOSE > 1)) - nrow(df_per_protocol_gose)`
model-based values are used.
```{r, fig.height=8}
df_posteriors <- df_imputations %>%
......
......@@ -4,7 +4,8 @@ args <- commandArgs(trailingOnly = TRUE)
# Positional command-line arguments, in the order the caller passes them:
#   1: directory containing the imputation outputs of one model
#   2: path to the GOSE data file (.rds)
#   3: path to the baseline data file (.rds); read via read_rds() later on
#   4: path of the CSV file this script writes as its result
# The former `outputfile <- args[[3]]` binding was dropped: it was overwritten
# immediately by the fourth argument and never used in between.
modelimputations_dir <- args[[1]]
gosefile <- args[[2]]
baseline_raw <- args[[3]]
outputfile <- args[[4]]
set.seed(42)
......@@ -89,4 +90,37 @@ df_imputations <- df_imputations %>%
-closest_per_protocol_GOSE
)
write_csv(df_imputations, outputfile)
# Combine with information about anybody who died before 6 months: collect
# the gupis of all patients whose death date lies within the first 6 * 30 days.
# NOTE(review): day counts are measured from "1970-01-01" - presumably the
# reference date of Subject.DeathDate in the baseline data; confirm.
# NOTE(review): difftime() converts the character reference date itself, so
# the cut-off on exactly day 180 may depend on the session time zone - verify.
death_gupis <- df_imputations %>%
  right_join(
    select(read_rds(baseline_raw), gupi, Subject.DeathDate),
    by = "gupi"
  ) %>%
  filter(
    # identify patients who are definitely dead; a missing death date gives
    # NA here and filter() drops that row
    as.numeric(
      difftime(Subject.DeathDate, "1970-01-01", units = "days")
    ) <= 6 * 30
  ) %>%
  pull(gupi)
# Build a single-row tibble for one patient, using the column names of
# df_imputations and a fixed set of values (the given gupi followed by
# 1L, 1L, 1 and seven zeros).
# NOTE(review): values are paired with names(df_imputations) purely by
# position, so this relies on df_imputations having exactly 11 columns with
# gupi first - confirm against the code that builds df_imputations.
# NOTE(review): presumably the 1s encode GOSE category 1 (dead) with
# probability one and the 0s the remaining category probabilities; verify.
get_row <- function(gupi) {tibble(
var = names(df_imputations),
val = list(gupi, 1L, 1L, 1, 0, 0, 0, 0, 0, 0, 0)
) %>%
# turn the (name, value) pairs into one wide row of list-columns ...
spread(var, val) %>%
# ... and flatten those list-columns into ordinary atomic columns
unnest
}
# Complete data set with missings: start from every gupi in the baseline file
# and attach the imputation columns. A gupi that has neither an imputation row
# nor an entry in death_gupis keeps NA in all attached columns.
df_final <- read_rds(baseline_raw) %>%
  select(gupi) %>%
  left_join(
    df_imputations %>%
      # discard whatever rows exist for patients in death_gupis ...
      filter(!(gupi %in% death_gupis)) %>%
      rbind(
        # ... and append the fixed rows that get_row() builds for them
        do.call(rbind, lapply(death_gupis, get_row))
      ),
    # Explicit join key: the left-hand side holds only `gupi`, so this is the
    # same match as the implicit natural join, without the "Joining, by"
    # message and without depending on which columns happen to be shared.
    by = "gupi"
  ) %>%
  arrange(gupi)
write_csv(df_final, outputfile)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment