fixed msm fit, introduced target rules and added propr imputation report

aa0d7bad · Kevin · 210b1842 · aa0d7bad · aa0d7bad · aa0d7bad
Commit aa0d7bad authored Feb 03, 2019 by Kevin
Hide whitespace changes
Inline Side-by-side

Showing with 136 additions and 6 deletions

Snakefile Snakefile +37 -1

models/msm/fit.R models/msm/fit.R +1 -1

reports/imputations.Rmd reports/imputations.Rmd +98 -4

No files found.
--- a/Snakefile
+++ b/Snakefile
@@ -39,6 +39,12 @@ rule prepare_data:
        mv reports/figures.zip {output.figures}
        """
+# define corresponding target rule for ease of use
+rule data_report_v1_1:
+    input:
+        pdf     = "output/v1.1/prepare_data.pdf",
+        figures = "output/v1.1/prepare_data_figures.zip"
@@ -75,6 +81,9 @@ rule generate_validation_data:
        """
 # adjust threads by model type
 def get_rule_threads(wildcards):
    if wildcards.model in ("locf", "msm"):
@@ -123,10 +132,17 @@ rule model_assessment:
        mv reports/figures.zip {output.figures}
        """
+# define corresponding target rule for ease of use
+rule cv_model_comparison_report_v1_1:
+    input:
+        pdf     = "output/v1.1/model_assessment.pdf",
+        figures = "output/v1.1/model_assessment_figures.zip"
+# rules for imputing on entire dataset
 rule generate_imputation_data:
    input:
        rules.prepare_data.output,
@@ -145,7 +161,6 @@ rule generate_imputation_data:
-# rules for imputing on entire dataset
 rule model_impute:
    input:
        "config.yml",
@@ -175,3 +190,24 @@ rule post_process_imputations:
        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output}
        """
+rule imputation_report:
+    input:
+        "config.yml",
+        rules.post_process_imputations.output,
+        markdown = "reports/imputations.Rmd"
+    output:
+        pdf     = "output/{version}/gose_imputations_{model}.pdf",
+        figures = "output/{version}/gose_imputations_{model}_figures.zip"
+    shell:
+        """
+        mkdir -p output/{wildcards.version}
+        Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/{wildcards.version}\\", params = list(data_dir = \\"../output/{wildcards.version}/data\\", imputations = "../output/v1.1/data/imputation/{wildcards.model}/df_gose_imputed.csv"))"
+        mv reports/figures.zip {output.figures}
+        """
+# define corresponding target rule for ease of use
+rule impute_msm_v1_1:
+    input:
+        pdf     = "output/v1.1/gose_imputations_msm.pdf",
+        figures = "output/v1.1/gose_imputations_msm_figures.zip"
--- a/models/msm/fit.R
+++ b/models/msm/fit.R
@@ -93,7 +93,7 @@ df_posteriors <- tibble(
    fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
  ) %>%
  mutate("2" = 0) %>%
-  filter(t == config$t_out_msm) %>%
+  filter(t == config$t_out_msm + .5) %>%
  gather(GOSE, probability, as.character(1:8)) %>%
  arrange(gupi, t, GOSE)

--- a/reports/imputations.Rmd
+++ b/reports/imputations.Rmd
 ---
 title: "Imputing GOSE scores in CENTER-TBI"
+subtitle: "assessing final imputations"
 date: "`r Sys.time()`"
 statistician: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
@@ -15,6 +17,7 @@ git-wd-clean: "`r ifelse(system('git diff-index --quiet HEAD') == 0, 'clean', 'f
 params:
  data_dir: "../output/v1.1/data"
+  imputations: "../output/v1.1/data/imputation/msm/df_gose_imputed.csv"
  config_file: "../config.yml"
 ---
@@ -25,19 +28,110 @@ require(tidyverse, quietly = TRUE)
 config <- yaml::read_yaml(params$config_file)
 set.seed(config$seed)
+df_imputations <- read_csv(params$imputations)
 ```
-# Model descriptions
+Since we do not use the raw imputed values but give preference to the per
+protocol values (when a derived composite GOSE is availabel within 5-8 months
+after injury), we start by comparing the final combined version with the raw
+imputations.
+```{r, fig.height=7, fig.width=7}
+caret::confusionMatrix(
+    df_imputations$Subject.DerivedImputed180DaysGOSE %>% factor,
+    df_imputations$Subject.GOSE6monthEndpointDerived %>% factor
+  ) %>% 
+  as.matrix %>% as_tibble %>%
+  mutate(
+    Subject.DerivedImputed180DaysGOSE = row_number() %>% as.character
+  ) %>% 
+  gather(Subject.GOSE6monthEndpointDerived, n, `1`:`8`) %>%
+  ggplot(aes(Subject.GOSE6monthEndpointDerived, Subject.DerivedImputed180DaysGOSE, fill = n)) +
+    geom_raster() +
+    geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
+    geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
+    geom_text(aes(label = sprintf("%i", n)), vjust = 1) +
+    scale_fill_gradient(low = "white", high = "black") +
+    coord_fixed(expand = FALSE) + 
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    ) + 
+    ggtitle("Confusion matrix (absolute counts)")
+ggsave(filename = "confusion_matrix.pdf", width = 7, height = 7)
+ggsave(filename = "confusion_matrix.png", width = 7, height = 7)
+```
 ```{r}
-df_imputations <- read_csv(sprintf("%s/imputation/msm/df_gose_imputed.csv", params$data_dir))
 df_gose <- readRDS(sprintf("%s/df_gose.rds", params$data_dir))
-```
+df_per_protocol_gose <- df_gose %>%
+  filter(
+    Outcomes.DerivedCompositeGOSEDaysPostInjury >= 5*30,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury <= 8*30
+  ) %>%
+  # pick closest to 180
+  group_by(gupi) %>%
+  summarize(
+    closest_per_protocol_GOSE = Outcomes.DerivedCompositeGOSE[
+      which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))
+    ]
+  )
+```
+Overall, only `r nrow(df_per_protocol_gose)` six-months GOSE are observed,
+i.e., `r nrow(df_imputations) - nrow(df_per_protocol_gose)` model-based values
+are used.
+```{r, fig.height=8}
+df_posteriors <- df_imputations %>% 
+  select(-Subject.GOSE6monthEndpointDerived) %>% 
+  gather(GOSE, probability, 3:10) %>% 
+  arrange(gupi, GOSE) %>% 
+  mutate(
+    GOSE = factor(GOSE) %>% as.numeric,
+    t = 180
+  )
+idx <- unique(df_gose$gupi)[1:40]
+df_gose %>% 
+  filter(gupi %in% idx) %>% 
+  transmute(
+    gupi,
+    t = Outcomes.DerivedCompositeGOSEDaysPostInjury,
+    GOSE = Outcomes.DerivedCompositeGOSE %>% as.numeric
+  ) %>% 
+  ggplot(aes(t, GOSE)) + 
+    geom_rect(
+      aes(
+        xmin = t - 14, 
+        xmax = t + 14, 
+        ymin = GOSE -.5, 
+        ymax = GOSE + .5, 
+        fill = probability
+      ), 
+      data = df_posteriors %>% filter(gupi %in% idx)
+    ) +
+    facet_wrap(~gupi, ncol = 5) + 
+    geom_point(size = .5, color = "red") + 
+    theme_bw() + 
+    theme(
+      panel.grid = element_blank(),
+      strip.background = element_blank(),
+      strip.text.x = element_blank()
+    ) +
+    scale_fill_gradient2(low = "white", high = "black", limits = c(0, 1)) +
+    scale_y_continuous(breaks = 1:8, limits = c(.5, 8.5)) +
+    ggtitle("First 30 GOSE trajectories with 6-months predicted probabilities.")
+ggsave(filename = "sample_probs.pdf", width = 7, height = 8)
+ggsave(filename = "sample_probs.png", width = 7, height = 8)
+```