revision of manuscript

6b448f2e · Kevin Kunzmann · 0badf884 · 6b448f2e
Commit 6b448f2e authored Mar 22, 2019 by Kevin Kunzmann
Hide whitespace changes
Inline Side-by-side

Showing with 32 additions and 28 deletions

manuscript/manuscript.Rmd manuscript/manuscript.Rmd +32 -28

No files found.
--- a/manuscript/manuscript.Rmd
+++ b/manuscript/manuscript.Rmd
@@ -553,7 +553,7 @@ alternative measure of bias which does not require this tacit assumption.
 Note that the scale is not directly comparable to the one of the
 other three quantities!
 All measures are considered both conditional on the ground-truth
-(unobserved true GOSe) as well as averaged over the entire test set.
+(left-out observed GOSe) as well as averaged over the entire test set.


 LOCF, by design, cannot provide imputed values when there are no
@@ -671,7 +671,7 @@ baseline covariates.

 We first consider results for the set of test cases which allow LOCF imputation 
 (n = `r df_predictions %>% filter(model == "LOCF") %>% nrow - length(idx)`).
-Both the raw count as well as the relative (by left-out true GOSe) confusion matrices
+Both the raw count as well as the relative (by left-out observed GOSe) confusion matrices
 are presented in Figure ???.

 ```{r confusion-matrix-locf, warning=FALSE, message=FALSE, echo=FALSE, fig.cap="Confusion matrices on LOCF subset."}
@@ -687,16 +687,16 @@ plot_confusion_matrices <- function(df_predictions, models) {
        ) %>% 
        as.matrix %>% as_tibble %>% 
        mutate(`Predicted GOSE` = row_number() %>% as.character) %>% 
-        gather(`True GOSE`, n, 1:8)
+        gather(`Observed GOSE`, n, 1:8)
    ) %>% 
    unnest %>% 
-    group_by(model, `Predicted GOSE`, `True GOSE`) %>% 
+    group_by(model, `Predicted GOSE`, `Observed GOSE`) %>% 
    summarize(n = mean(n)) %>% 
    ungroup %>% 
    mutate(model = factor(model, models))
  
  p_cnf_mtrx_raw <- df_average_confusion_matrices %>%
-    ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = n)) +
+    ggplot(aes(`Observed GOSE`, `Predicted GOSE`, fill = n)) +
      geom_raster() +
      geom_text(aes(
          label = sprintf("%.1f", n) %>% 
@@ -708,7 +708,7 @@ plot_confusion_matrices <- function(df_predictions, models) {
      geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
      scale_fill_gradient(low = "white", high = "#555555") +
      coord_fixed(expand = FALSE) + 
-      labs(x = "true GOSe", y = "imputed GOSe", fill = "") +
+      labs(x = "observed GOSe", y = "imputed GOSe", fill = "") +
      theme_bw() +
      theme(
        panel.grid = element_blank()
@@ -717,18 +717,18 @@ plot_confusion_matrices <- function(df_predictions, models) {
      ggtitle("Average confusion matrix accross folds (absolute counts)")
  
  p_cnf_mtrx_colnrm <- df_average_confusion_matrices %>%
-    group_by(model, `True GOSE`) %>%
+    group_by(model, `Observed GOSE`) %>%
    mutate(
      `fraction (column)` = n / sum(n),
      `fraction (column)` = ifelse(is.nan(`fraction (column)`), 0, `fraction (column)`)
    ) %>% 
-    ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = `fraction (column)`)) +
+    ggplot(aes(`Observed GOSE`, `Predicted GOSE`, fill = `fraction (column)`)) +
      geom_raster() +
      geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
      geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
      scale_fill_gradient("", low = "white", high = "black", limits = c(0, 1)) +
      coord_fixed(expand = FALSE) + 
-      labs(x = "true GOSe", y = "imputed GOSe", fill = "") +
+      labs(x = "observed GOSe", y = "imputed GOSe", fill = "") +
      theme_bw() +
      theme(
        panel.grid = element_blank()
@@ -786,10 +786,10 @@ df_average_confusion_matrices <- df_predictions %>%
        ) %>% 
        as.matrix %>% as_tibble %>% 
        mutate(`Predicted GOSE` = row_number() %>% as.character) %>% 
-        gather(`True GOSE`, n, 1:8)
+        gather(`Observed GOSE`, n, 1:8)
    ) %>% 
    unnest %>% 
-    group_by(model, `Predicted GOSE`, `True GOSE`) %>% 
+    group_by(model, `Predicted GOSE`, `Observed GOSE`) %>% 
    summarize(n = mean(n)) %>% 
    ungroup %>% 
    mutate(model = factor(model, models))
@@ -797,7 +797,7 @@ rbind(
 df_average_confusion_matrices %>% 
  filter(model %in% c("LOCF", "MM", "GP + cov", "MSM")) %>% 
  group_by(model) %>% 
-  filter(`True GOSE` <= 3) %>% 
+  filter(`Observed GOSE` <= 3) %>% 
  mutate(n_total = sum(n)) %>% 
  filter(`Predicted GOSE` > 3) %>% 
  summarize(fraction = sum(n / n_total)) %>% 
@@ -806,7 +806,7 @@ df_average_confusion_matrices %>%
 df_average_confusion_matrices %>% 
  filter(model %in% c("LOCF", "MM", "GP + cov", "MSM")) %>% 
  group_by(model) %>% 
-  filter(`True GOSE` == 4) %>% 
+  filter(`Observed GOSE` == 4) %>% 
  mutate(n_total = sum(n)) %>% 
  filter(`Predicted GOSE` > 4) %>% 
  summarize(fraction = sum(n / n_total)) %>% 
@@ -815,7 +815,7 @@ df_average_confusion_matrices %>%
 df_average_confusion_matrices %>% 
  filter(model %in% c("LOCF", "MM", "GP + cov", "MSM")) %>% 
  group_by(model) %>% 
-  filter(`True GOSE` < 8) %>% 
+  filter(`Observed GOSE` < 8) %>% 
  mutate(n_total = sum(n)) %>% 
  filter(`Predicted GOSE` == 8) %>% 
  summarize(fraction = sum(n / n_total)) %>% 
@@ -895,13 +895,17 @@ ggsave(filename = "errors_stratified_locf.png", width = 9, height = 3)

 Just as with the overall performance, differences are most pronounced in terms
 of bias.
+Interestingly, the conditional perspective reveals differences between bias
+defined as the difference between mean imputed and mean observed values (tacitly
+assuming at least an interval scale) and bias defined as the difference in the
+probabilities of over- and undershooting the observed value.
 Again, the category imbalance in the GOSe distribution explains the fact that
 all model-based approaches tend to perform better for the most frequent
 categories 6, 7, and 8 while sacrificing performance for the less frequent
 categories 4 and 5 as compared to LOCF.
 Bias-wise all methods exhibit a certain regression to the mean effect since low
 categories tend to be confused with better (higher) GOSe on average while
-high true GOSe values are dampened (negative bias at 7, 8).
+high observed GOSe values are dampened (negative bias at 7, 8).
 Since LOCF does not take the category imbalance into account and since it exhibits
 a relatively large negative bias at the most frequent GOSe values, it is
 overall negatively biased.
@@ -909,14 +913,14 @@ Interestingly, the conditional assessment of the GP regressions bias profile
 reveals that the overall unbiasedness can be explained by the relatively high
 positive and negative biases conditional on low/high GOSe values canceling out
 in the overall population.
-Since the accuracy results mirror this effect, the GP regression model seems
-to suffer from an overly string regression to the mean effect. 
-
-The MSM and MM models are fairly similar with respect to accuracy with a slight
-advantage for MSM.
-Interestingly, though, the MSM approach is consistently less positively biased
-across the rarer low GOSe categories while being only insignificantly more 
-negatively biased for categories 7 and 8.
+The MSM and MM models are fairly similar with respect to accuracy but MSM
+clearly dominates with respect to bias.
+Note that irrespective of the exact definition of bias used, MSM dominates the other
+model-based approaches. 
+Comparing LOCF and MSM, there is a slight advantage of MSM in terms of accuracy for
+the majority classes 3, 7, 8 which explains the overall difference shown in Figure ???.
+With respect to bias, MSM also performs better than LOCF for the most frequently
+observed categories, but the extent of this improvement depends on the performance measure.



@@ -956,17 +960,17 @@ df_average_confusion_matrices <- df_predictions %>%
        ) %>% 
        as.matrix %>% as_tibble %>% 
        mutate(`Predicted GOSE` = row_number() %>% as.character) %>% 
-        gather(`True GOSE`, n, 1:8)
+        gather(`Observed GOSE`, n, 1:8)
    ) %>% 
    unnest %>% 
-    group_by(model, `Predicted GOSE`, `True GOSE`) %>% 
+    group_by(model, `Predicted GOSE`, `Observed GOSE`) %>% 
    summarize(n = mean(n)) %>% 
    ungroup %>% 
    mutate(model = factor(model, models))
 rbind(
 df_average_confusion_matrices %>% 
  group_by(model) %>% 
-  filter(`True GOSE` <= 3) %>% 
+  filter(`Observed GOSE` <= 3) %>% 
  mutate(n_total = sum(n)) %>% 
  filter(`Predicted GOSE` > 3) %>% 
  summarize(fraction = sum(n / n_total)) %>% 
@@ -974,7 +978,7 @@ df_average_confusion_matrices %>%

 df_average_confusion_matrices %>% 
  group_by(model) %>% 
-  filter(`True GOSE` == 4) %>% 
+  filter(`Observed GOSE` == 4) %>% 
  mutate(n_total = sum(n)) %>% 
  filter(`Predicted GOSE` > 4) %>% 
  summarize(fraction = sum(n / n_total)) %>% 
@@ -982,7 +986,7 @@ df_average_confusion_matrices %>%

 df_average_confusion_matrices %>% 
  group_by(model) %>% 
-  filter(`True GOSE` < 8) %>% 
+  filter(`Observed GOSE` < 8) %>% 
  mutate(n_total = sum(n)) %>% 
  filter(`Predicted GOSE` == 8) %>% 
  summarize(fraction = sum(n / n_total)) %>%