The markdown file requires the recorded GOSe values in a .csv file names tbl_gose.csv in the same folder. The .csv file must contain the Neurobot variables

subjectId
Outcomes.DerivedCompositeGOSE
Outcomes.DerivedCompositeGOSEDaysPostInjury
Subject.Age
Subject.PatientType
Subject.DeathDate

in exactly that order. At the time of writing this report the newest staging data set could be accessed via the saved link https://neurobot-stage.incf.org/_5f476d41eedb867816a10376.

The required R environment can be installed by

install.packages("renv")
renv::restore(clean = TRUE, prompt = FALSE)
suppressPackageStartupMessages({
    library(tidyverse)
    library(msm)
})

print(params)
## $seed
## [1] 42
## 
## $age_range
## [1]   0 Inf
## 
## $per_protocol_window
## [1] 150 240
knitr::opts_chunk$set(
    cache = TRUE,
     echo = TRUE
)

set.seed(params$seed)
# check data 
varnames <- c(
    "subjectId",
    "Outcomes.DerivedCompositeGOSE",
    "Outcomes.DerivedCompositeGOSEDaysPostInjury",
    "Subject.Age",
    "Subject.PatientType",
    "Subject.DeathDate"
)

colnames <- readr::read_csv(
        "tbl_gose.csv",
        n_max = 1,
        col_types = "cinncD"
    ) %>%
    names

if (!all(colnames == varnames))
    stop("colnames do not match varnames, check order!")

tbl_combined <- readr::read_csv("tbl_gose.csv", col_types = "cinncD") %>% 
    rename(gupi = subjectId) %>%
    filter(
        # throw out all rows with missing GOSE
        !is.na(Outcomes.DerivedCompositeGOSE),
        # throw out all observations without a precise timestamp
        !is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury)
    ) %>% 
    mutate(
        Subject.PatientType = case_when(
            Subject.PatientType == "1" ~ "ER",
            Subject.PatientType == "2" ~ "Admission",
            Subject.PatientType == "3" ~ "ICU"
        )
    ) 

# add death date as GOSE 1
tbl_combined <- tbl_combined %>%
    mutate(
        death_days = as.numeric(difftime(Subject.DeathDate, "1970-01-01", units = "days"))
    ) %>% 
    group_by(gupi) %>% 
    summarize_all(
        first
    ) %>% 
    ungroup() %>% 
    filter(
        !is.na(death_days),
        death_days > 180
    ) %>% 
    mutate(
        Outcomes.DerivedCompositeGOSEDaysPostInjury = death_days,
        Outcomes.DerivedCompositeGOSE               = 1L
    ) %>% 
    select(-death_days) %>% 
    rbind(tbl_combined) %>% 
    arrange(
       gupi, 
       Outcomes.DerivedCompositeGOSEDaysPostInjury
    )

# make sure GOSE is unique at each observation time
tbl_combined  <- tbl_combined %>% 
    group_by(
        gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
    ) %>% 
    summarize_all(
        first
    ) %>% 
    ungroup

# filter all individuals with GOSE = 1 before 180 days post injury
# (no imputation required!)
tbl_combined <- tbl_combined %>% 
    group_by(gupi) %>% 
    filter(
        # all GOSEs before 180 dayspost injury must be > 1 (dead)
        all(Outcomes.DerivedCompositeGOSE[Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180] > 1),
        # ensure age-bracket
        Subject.Age >= params$age_range[1],
        Subject.Age <= params$age_range[2],
        # censor at first GOSE == 1
        all(Outcomes.DerivedCompositeGOSE > 1) | (row_number() <= which(Outcomes.DerivedCompositeGOSE == 1)[1])
    ) %>% 
    ungroup
tbl_combined$Outcomes.DerivedCompositeGOSE %>% table
## .
##    1    3    4    5    6    7    8 
##   39  792  600  838 1236 1782 3159

Impute

tbl_gose_msm <- tbl_combined %>%
    select(
        gupi,
        days_post_injury = Outcomes.DerivedCompositeGOSEDaysPostInjury,
        gose = Outcomes.DerivedCompositeGOSE
    ) %>% 
    rbind(
      # add censored states at 180.1 days to make prediction for
      tibble(
          gupi             = unique(.$gupi),
          # needed to offset to get a prediction in all cases
          days_post_injury = 180 + .1, 
          gose             = 99
      )
    ) %>% 
    mutate(
        # all deaths are excactly observed time - tell MSM!
        observation_type = ifelse(gose == 1, 3, 1),
        # recode levels to 1:7, 99 (needed for MSM!)
        gose = factor(gose,
                levels = c(1, 3:8, 99), 
                labels = c(1:7, 99)
            ) %>% as.character %>% as.numeric
    ) %>% 
    arrange(
        gupi,
        days_post_injury
    )

# define transition matrix
Q <- matrix(0, nrow = 7, ncol = 7)
for (i in 1:7) {
  for (j in 1:7) {
    if (i == j - 1 & i != 1) {
      Q[i, j] <- 1
    }
    if (i == j + 1) {
      Q[i, j] <- 1
    }
  }
}
Q[2:7, 1] <- 1 # allow instantaneous deaths from all states

fit <- msm::msm(
  gose ~ days_post_injury,
  subject       = gupi,
  data          = tbl_gose_msm,
  obstype       = observation_type,
  gen.inits     = TRUE,
  qmatrix       = Q,
  censor        = 99,
  pci           = c(90, 270),
  censor.states = 2:7, # cannot be dead since these were filtered previously 
  control   = list(
      fnscale = 12000,
      maxit   = 10^4,
      trace   = 2
  )
)
## initial  value 1.449729 
## iter  10 value 1.109494
## iter  20 value 1.068045
## iter  30 value 1.055315
## iter  40 value 1.051368
## iter  50 value 1.049075
## iter  60 value 1.048322
## iter  70 value 1.047718
## iter  80 value 1.047442
## iter  90 value 1.047106
## iter 100 value 1.046910
## iter 110 value 1.046860
## iter 120 value 1.046790
## iter 130 value 1.046727
## iter 140 value 1.046659
## iter 150 value 1.046577
## iter 160 value 1.046480
## iter 170 value 1.046330
## iter 180 value 1.046167
## iter 190 value 1.046081
## iter 200 value 1.045961
## iter 210 value 1.045938
## iter 220 value 1.045912
## iter 230 value 1.045894
## iter 240 value 1.045881
## iter 250 value 1.045873
## iter 260 value 1.045852
## iter 270 value 1.045838
## iter 280 value 1.045826
## iter 290 value 1.045807
## iter 300 value 1.045795
## iter 310 value 1.045790
## iter 320 value 1.045787
## iter 330 value 1.045785
## iter 340 value 1.045782
## iter 350 value 1.045780
## iter 360 value 1.045777
## iter 370 value 1.045771
## iter 380 value 1.045766
## iter 390 value 1.045759
## iter 400 value 1.045754
## iter 410 value 1.045751
## iter 420 value 1.045749
## iter 430 value 1.045746
## iter 440 value 1.045744
## iter 450 value 1.045742
## iter 460 value 1.045740
## iter 470 value 1.045738
## iter 480 value 1.045736
## iter 490 value 1.045732
## iter 500 value 1.045730
## iter 510 value 1.045729
## iter 520 value 1.045728
## iter 530 value 1.045727
## iter 540 value 1.045725
## iter 550 value 1.045724
## iter 560 value 1.045721
## iter 570 value 1.045719
## iter 580 value 1.045716
## iter 590 value 1.045714
## iter 600 value 1.045711
## iter 610 value 1.045710
## iter 620 value 1.045708
## iter 630 value 1.045707
## final  value 1.045706 
## converged
## Used 638 function and 636 gradient evaluations
## Warning in msm::msm(gose ~ days_post_injury, subject = gupi, data =
## tbl_gose_msm, : Optimisation has probably not converged to the maximum
## likelihood - Hessian is not positive definite.
# get fitted values at censored (prediction) states
fitted <- msm::viterbi.msm(fit) # viterbi does not respect the fact that 
#states cannot be 1 at 180 days -> need to adjust 

# store posteriors
tbl_posteriors <- tibble(
        gupi = fitted$subject %>% as.character,
        t    = fitted$time
    ) %>%
    cbind(
        fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
    ) %>%
    rename(
        `2_or_3` = `3`
    ) %>%
    filter(
        t == 180.1
    ) %>%
    select(
        gupi, `1`, `2_or_3`, `4`, `5`, `6`, `7`, `8`
    ) %>% 
    pivot_longer(`1`:`8`, names_to  = "gose", values_to = "probability") %>%
    group_by(gupi ) %>%
    mutate(
        probability = ifelse(gose == "1", 0.0, probability),
        probability = probability / sum(probability),
        gose        = sprintf("gose_%s_probability", gose)
    ) %>% 
    ungroup %>% 
    pivot_wider(names_from = gose, values_from = probability)
# start by adding confirmed <= 180d deaths back in
tbl_final_imputations <- readr::read_csv(
        "tbl_gose.csv", 
        col_types = "cinncD"
    ) %>% 
    rename(gupi = subjectId) %>% 
    select(gupi, Subject.DeathDate) %>% 
    distinct %>% 
    transmute(
        gupi       = gupi,
        death_days = difftime(Subject.DeathDate, "1970-01-01", unit = "days") %>% as.numeric
    ) %>% 
    left_join(tbl_posteriors, by = "gupi") %>% 
    pivot_longer(
        names_to  = "gose",
        values_to = "probability",
        gose_1_probability:gose_8_probability
    ) %>% 
    mutate(
        probability = ifelse(
            is.na(death_days) | (death_days > 180),
            probability,
            ifelse(gose != "gose_1_probability", 0.0, 1.0)
        )
    ) %>%
    group_by(gupi) %>%
    mutate(
        gose_map_imputed = ifelse(all(is.na(probability)), 
                NA, 
                names(tbl_posteriors %>% select(-gupi))[which.max(probability)]
            )
    ) %>%
    ungroup() %>% 
    select(-death_days) %>% 
    pivot_wider(names_from = "gose", values_from = "probability") %>% 
    mutate(
        gose_map_imputed = stringr::str_extract(
          gose_map_imputed, 
          "[0-9]{1}(_or_3){0,1}"
        )
    ) %>% 
    left_join(
        # add new column giving priority to per-protocol gose
        readr::read_csv(
            "tbl_gose.csv", 
            col_types = "cinncD"
        ) %>% 
        rename(gupi = subjectId) %>%
        transmute(
          gupi = gupi, 
          gose = Outcomes.DerivedCompositeGOSE,
          days = Outcomes.DerivedCompositeGOSEDaysPostInjury
        ) %>% 
        filter(
            complete.cases(.),
            days >= params$per_protocol_window[1],
            days <= params$per_protocol_window[2]
        ) %>% 
        group_by(gupi) %>% 
        summarize(
            gose = gose[which.min(abs(days - 180))],
            days = days[which.min(abs(days - 180))],
            .groups = "drop"
        ) %>% 
        transmute(
            gupi,
            gose_per_protocol = ifelse(
                gose %in% c(2, 3), 
                "2_or_3", 
                as.character(gose)
            )
        ),
        by = "gupi"
    ) %>% 
    mutate(
        # gose_map_imputed is only 1 when we know that the individual died before
        # 6 months, i.e. in these cases the per-protocol gose should also be 1
        gose_per_protocol = ifelse(gose_map_imputed == "1", "1", gose_per_protocol),
        gose_map_per_protocol_combined = ifelse(
            is.na(gose_per_protocol), 
            gose_map_imputed, 
            gose_per_protocol
        )
    ) %>% 
    # reorder
    select(
        gupi, 
        gose_per_protocol, 
        gose_map_imputed, 
        gose_map_per_protocol_combined, 
        everything()
    )
tbl_final_imputations %>% 
    select(
        gose_map_imputed,
        gose_per_protocol
    ) %>% 
    filter(
        complete.cases(.)
    ) %>% 
    mutate_all(
        ~factor(.x, levels = c("1", "2_or_3", as.character(4:8)))
    ) %>% 
    {caret::confusionMatrix(
        data      = .$gose_map_imputed,
        reference = .$gose_per_protocol
    )$table} %>% 
    as_tibble() %>% 
    rename(
        `GOSe, MAP imputed`          = Prediction,
        `GOSe, closest per-protocol` = Reference
    ) %>% 
    ggplot(aes(y = `GOSe, MAP imputed`, x = `GOSe, closest per-protocol`)) +
      geom_raster(aes(fill = n)) + 
      geom_text(aes(label = n)) +
      coord_fixed(expand = FALSE) + 
      scale_fill_continuous(low = "#FFFFFF", high = "#555555") +
      theme_bw() +
      ggtitle("Confusion matrix: imputed vs. per-protocol")

Save

tbl_final_imputations %>% {
    tmp <- .
    names(tmp) <- gsub("gose", "Subject.DerivedImputed180DaysGOSE", x = names(.))
    tmp } %>% 
    rename(
      "Subject.GOSE6monthEndpointDerived" = "Subject.DerivedImputed180DaysGOSE_map_per_protocol_combined"
    ) %>% 
    write_csv("tbl_gose_imputed.csv")

Session Info

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin19.5.0 (64-bit)
## Running under: macOS Catalina 10.15.5
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.10_1/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] msm_1.6.8       forcats_0.5.0   stringr_1.4.0   dplyr_1.0.1    
##  [5] purrr_0.3.4     readr_1.3.1     tidyr_1.1.1     tibble_3.0.3   
##  [9] ggplot2_3.3.2   tidyverse_1.3.0
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.1           jsonlite_1.7.0       splines_4.0.2       
##  [4] foreach_1.5.0        prodlim_2019.11.13   modelr_0.1.8        
##  [7] assertthat_0.2.1     expm_0.999-5         stats4_4.0.2        
## [10] blob_1.2.1           renv_0.11.0          cellranger_1.1.0    
## [13] yaml_2.2.1           ipred_0.9-9          pillar_1.4.6        
## [16] backports_1.1.8      lattice_0.20-41      glue_1.4.1          
## [19] pROC_1.16.2          digest_0.6.25        rvest_0.3.5         
## [22] colorspace_1.4-1     recipes_0.1.13       htmltools_0.5.0     
## [25] Matrix_1.2-18        plyr_1.8.6           timeDate_3043.102   
## [28] pkgconfig_2.0.3      broom_0.7.0          haven_2.3.1         
## [31] caret_6.0-86         mvtnorm_1.1-1        scales_1.1.1        
## [34] gower_0.2.2          lava_1.6.7           farver_2.0.3        
## [37] generics_0.0.2       ellipsis_0.3.1       withr_2.2.0         
## [40] nnet_7.3-14          cli_2.0.2            survival_3.1-12     
## [43] magrittr_1.5         crayon_1.3.4         readxl_1.3.1        
## [46] evaluate_0.14        fs_1.4.2             fansi_0.4.1         
## [49] nlme_3.1-148         MASS_7.3-51.6        xml2_1.3.2          
## [52] class_7.3-17         data.table_1.12.8    tools_4.0.2         
## [55] hms_0.5.3            lifecycle_0.2.0      munsell_0.5.0       
## [58] reprex_0.3.0         e1071_1.7-3          compiler_4.0.2      
## [61] rlang_0.4.7          grid_4.0.2           iterators_1.0.12    
## [64] rstudioapi_0.11      labeling_0.3         rmarkdown_2.3       
## [67] ModelMetrics_1.2.2.2 gtable_0.3.0         codetools_0.2-16    
## [70] DBI_1.1.0            reshape2_1.4.4       R6_2.4.1            
## [73] lubridate_1.7.9      knitr_1.29           stringi_1.4.6       
## [76] Rcpp_1.0.5           vctrs_0.3.2          rpart_4.1-15        
## [79] dbplyr_1.4.4         tidyselect_1.1.0     xfun_0.15