extended generation of imputed values to produce a file with one row per individual

725f4ca4 · Kevin · 85ae8228 · 725f4ca4 · 725f4ca4 · 725f4ca4
Commit 725f4ca4 authored Mar 12, 2019 by Kevin
Hide whitespace changes
Inline Side-by-side

Showing with 43 additions and 7 deletions

Snakefile Snakefile +3 -2

reports/imputations.Rmd reports/imputations.Rmd +4 -3

scripts/post_process_imputations.R scripts/post_process_imputations.R +36 -2

No files found.
--- a/Snakefile
+++ b/Snakefile
@@ -190,13 +190,14 @@ rule post_process_imputations:
        "config.yml",
        ["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
            for i in range(1, config["mi_m"] + 1)
-        ]
+        ],
+        "data/{version}/df_baseline.rds"
    output:
        "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
    shell:
        """
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
-        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output}
+        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds data/{wildcards.version}/df_baseline.rds {output}
        """

 rule imputation_report:

--- a/reports/imputations.Rmd
+++ b/reports/imputations.Rmd
@@ -16,7 +16,7 @@ git-commit-hash: "`r system('git rev-parse --verify HEAD', intern=TRUE)`"
 git-wd-clean: "`r ifelse(system('git diff-index --quiet HEAD') == 0, 'clean', 'file changes, working directory not clean!')`"

 params:
-  data_dir: "../output/v1.1/data"
+  data_dir:    "../output/v1.1/data"
  imputations: "../output/v1.1/data/imputation/msm/df_gose_imputed.csv"
  config_file: "../config.yml"
 ---
@@ -38,6 +38,7 @@ Since we do not use the raw imputed values but give preference to the per
 protocol values (when a derived composite GOSE is available within 5-8 months
 after injury), we start by comparing the final combined version with the raw
 imputations.
+Note that this plot also includes the confirmed deaths before 6 months!

 ```{r, fig.height=7, fig.width=7}
 caret::confusionMatrix(
@@ -84,8 +85,8 @@ df_per_protocol_gose <- df_gose %>%
 ```

 Overall, only `r nrow(df_per_protocol_gose)` six-months GOSE are observed,
-i.e., `r nrow(df_imputations) - nrow(df_per_protocol_gose)` model-based values
-are used.
+i.e., `r nrow(df_imputations %>% filter(complete.cases(.) & Subject.DerivedImputed180DaysGOSE > 1)) - nrow(df_per_protocol_gose)` 
+model-based values are used.

 ```{r, fig.height=8}
 df_posteriors <- df_imputations %>% 

--- a/scripts/post_process_imputations.R
+++ b/scripts/post_process_imputations.R
@@ -4,7 +4,8 @@ args                 <- commandArgs(trailingOnly = TRUE)

 modelimputations_dir <- args[[1]]
 gosefile             <- args[[2]]
-outputfile           <- args[[3]]
+baseline_raw         <- args[[3]]
+outputfile           <- args[[4]]

 set.seed(42)

@@ -89,4 +90,37 @@ df_imputations <- df_imputations %>%
    -closest_per_protocol_GOSE
  )

-write_csv(df_imputations, outputfile)
+
+# combine with information about anybody who died before 6 months: collect the gupis of all patients whose death date lies at most 6*30 = 180 days after the reference date
+death_gupis <- df_imputations %>%
+  right_join( # right join keeps every patient of the baseline file, with or without imputations
+    read_rds(baseline_raw) %>% select(gupi, Subject.DeathDate),
+    by = "gupi"
+  ) %>%
+  filter( # identify patients who are definitely dead
+    difftime(Subject.DeathDate, "1970-01-01", units = "days") %>% as.numeric <= 6*30 # NOTE(review): presumably dates are shifted so that 1970-01-01 marks the injury date - confirm; a missing death date gives NA and filter() drops such rows
+  ) %>%
+  .[["gupi"]] # pull out the gupi column as a plain vector
+
+get_row <- function(gupi) {tibble( # build a one-row replacement record for the patient identified by `gupi`
+    var = names(df_imputations), # column names are read from the global df_imputations
+    val = list(gupi, 1L, 1L, 1, 0, 0, 0, 0, 0, 0, 0) # paired with `var` by position: assumes df_imputations has exactly 11 columns with gupi first; NOTE(review): the 1 entries presumably encode GOSE 1 (dead) with probability 1 - confirm against the column order
+  ) %>%
+  spread(var, val) %>% # turn the (var, val) pairs into a single wide row of list-columns
+  unnest # flatten the list-columns into ordinary columns
+}
+
+# complete data set with missings: every individual of the baseline file is kept; those without a matching imputation row get NA in the imputation columns
+df_final <- read_rds(baseline_raw) %>%
+  select(gupi) %>% # only the identifier is taken from the baseline file
+  left_join( # no `by` is given, so dplyr joins on the columns common to both tables
+    df_imputations %>%
+      filter(!(gupi %in% death_gupis)) %>% # remove the existing rows of patients listed in death_gupis ...
+      rbind( # ... and append their replacement rows instead
+        do.call(rbind, lapply(death_gupis, get_row)) # one get_row() record per patient in death_gupis, stacked into one table
+      )
+  ) %>%
+  arrange(gupi)
+
+
+write_csv(df_final, outputfile)