The markdown file requires the recorded GOSe values in a .csv file names tbl_gose.csv
in the same folder. The .csv file must contain the Neurobot variables
subjectId
Outcomes.DerivedCompositeGOSE
Outcomes.DerivedCompositeGOSEDaysPostInjury
Subject.Age
Subject.PatientType
Subject.DeathDate
in exactly that order. At the time of writing this report the newest staging data set could be accessed via the saved link https://neurobot-stage.incf.org/_5f476d41eedb867816a10376
.
The required R environment can be installed by
install.packages("renv")
renv::restore(clean = TRUE, prompt = FALSE)
suppressPackageStartupMessages({
library(tidyverse)
library(msm)
})
print(params)
## $seed
## [1] 42
##
## $age_range
## [1] 0 Inf
##
## $per_protocol_window
## [1] 150 240
knitr::opts_chunk$set(
cache = TRUE,
echo = TRUE
)
set.seed(params$seed)
# check data
varnames <- c(
"subjectId",
"Outcomes.DerivedCompositeGOSE",
"Outcomes.DerivedCompositeGOSEDaysPostInjury",
"Subject.Age",
"Subject.PatientType",
"Subject.DeathDate"
)
colnames <- readr::read_csv(
"tbl_gose.csv",
n_max = 1,
col_types = "cinncD"
) %>%
names
if (!all(colnames == varnames))
stop("colnames do not match varnames, check order!")
tbl_combined <- readr::read_csv("tbl_gose.csv", col_types = "cinncD") %>%
rename(gupi = subjectId) %>%
filter(
# throw out all rows with missing GOSE
!is.na(Outcomes.DerivedCompositeGOSE),
# throw out all observations without a precise timestamp
!is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury)
) %>%
mutate(
Subject.PatientType = case_when(
Subject.PatientType == "1" ~ "ER",
Subject.PatientType == "2" ~ "Admission",
Subject.PatientType == "3" ~ "ICU"
)
)
# add death date as GOSE 1
tbl_combined <- tbl_combined %>%
mutate(
death_days = as.numeric(difftime(Subject.DeathDate, "1970-01-01", units = "days"))
) %>%
group_by(gupi) %>%
summarize_all(
first
) %>%
ungroup() %>%
filter(
!is.na(death_days),
death_days > 180
) %>%
mutate(
Outcomes.DerivedCompositeGOSEDaysPostInjury = death_days,
Outcomes.DerivedCompositeGOSE = 1L
) %>%
select(-death_days) %>%
rbind(tbl_combined) %>%
arrange(
gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury
)
# make sure GOSE is unique at each observation time
tbl_combined <- tbl_combined %>%
group_by(
gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
summarize_all(
first
) %>%
ungroup
# filter all individuals with GOSE = 1 before 180 days post injury
# (no imputation required!)
tbl_combined <- tbl_combined %>%
group_by(gupi) %>%
filter(
# all GOSEs before 180 dayspost injury must be > 1 (dead)
all(Outcomes.DerivedCompositeGOSE[Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180] > 1),
# ensure age-bracket
Subject.Age >= params$age_range[1],
Subject.Age <= params$age_range[2],
# censor at first GOSE == 1
all(Outcomes.DerivedCompositeGOSE > 1) | (row_number() <= which(Outcomes.DerivedCompositeGOSE == 1)[1])
) %>%
ungroup
tbl_combined$Outcomes.DerivedCompositeGOSE %>% table
## .
## 1 3 4 5 6 7 8
## 39 792 600 838 1236 1782 3159
tbl_gose_msm <- tbl_combined %>%
select(
gupi,
days_post_injury = Outcomes.DerivedCompositeGOSEDaysPostInjury,
gose = Outcomes.DerivedCompositeGOSE
) %>%
rbind(
# add censored states at 180.1 days to make prediction for
tibble(
gupi = unique(.$gupi),
# needed to offset to get a prediction in all cases
days_post_injury = 180 + .1,
gose = 99
)
) %>%
mutate(
# all deaths are excactly observed time - tell MSM!
observation_type = ifelse(gose == 1, 3, 1),
# recode levels to 1:7, 99 (needed for MSM!)
gose = factor(gose,
levels = c(1, 3:8, 99),
labels = c(1:7, 99)
) %>% as.character %>% as.numeric
) %>%
arrange(
gupi,
days_post_injury
)
# define transition matrix
Q <- matrix(0, nrow = 7, ncol = 7)
for (i in 1:7) {
for (j in 1:7) {
if (i == j - 1 & i != 1) {
Q[i, j] <- 1
}
if (i == j + 1) {
Q[i, j] <- 1
}
}
}
Q[2:7, 1] <- 1 # allow instantaneous deaths from all states
fit <- msm::msm(
gose ~ days_post_injury,
subject = gupi,
data = tbl_gose_msm,
obstype = observation_type,
gen.inits = TRUE,
qmatrix = Q,
censor = 99,
pci = c(90, 270),
censor.states = 2:7, # cannot be dead since these were filtered previously
control = list(
fnscale = 12000,
maxit = 10^4,
trace = 2
)
)
## initial value 1.449729
## iter 10 value 1.109494
## iter 20 value 1.068045
## iter 30 value 1.055315
## iter 40 value 1.051368
## iter 50 value 1.049075
## iter 60 value 1.048322
## iter 70 value 1.047718
## iter 80 value 1.047442
## iter 90 value 1.047106
## iter 100 value 1.046910
## iter 110 value 1.046860
## iter 120 value 1.046790
## iter 130 value 1.046727
## iter 140 value 1.046659
## iter 150 value 1.046577
## iter 160 value 1.046480
## iter 170 value 1.046330
## iter 180 value 1.046167
## iter 190 value 1.046081
## iter 200 value 1.045961
## iter 210 value 1.045938
## iter 220 value 1.045912
## iter 230 value 1.045894
## iter 240 value 1.045881
## iter 250 value 1.045873
## iter 260 value 1.045852
## iter 270 value 1.045838
## iter 280 value 1.045826
## iter 290 value 1.045807
## iter 300 value 1.045795
## iter 310 value 1.045790
## iter 320 value 1.045787
## iter 330 value 1.045785
## iter 340 value 1.045782
## iter 350 value 1.045780
## iter 360 value 1.045777
## iter 370 value 1.045771
## iter 380 value 1.045766
## iter 390 value 1.045759
## iter 400 value 1.045754
## iter 410 value 1.045751
## iter 420 value 1.045749
## iter 430 value 1.045746
## iter 440 value 1.045744
## iter 450 value 1.045742
## iter 460 value 1.045740
## iter 470 value 1.045738
## iter 480 value 1.045736
## iter 490 value 1.045732
## iter 500 value 1.045730
## iter 510 value 1.045729
## iter 520 value 1.045728
## iter 530 value 1.045727
## iter 540 value 1.045725
## iter 550 value 1.045724
## iter 560 value 1.045721
## iter 570 value 1.045719
## iter 580 value 1.045716
## iter 590 value 1.045714
## iter 600 value 1.045711
## iter 610 value 1.045710
## iter 620 value 1.045708
## iter 630 value 1.045707
## final value 1.045706
## converged
## Used 638 function and 636 gradient evaluations
## Warning in msm::msm(gose ~ days_post_injury, subject = gupi, data =
## tbl_gose_msm, : Optimisation has probably not converged to the maximum
## likelihood - Hessian is not positive definite.
# get fitted values at censored (prediction) states
fitted <- msm::viterbi.msm(fit) # viterbi does not respect the fact that
#states cannot be 1 at 180 days -> need to adjust
# store posteriors
tbl_posteriors <- tibble(
gupi = fitted$subject %>% as.character,
t = fitted$time
) %>%
cbind(
fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
) %>%
rename(
`2_or_3` = `3`
) %>%
filter(
t == 180.1
) %>%
select(
gupi, `1`, `2_or_3`, `4`, `5`, `6`, `7`, `8`
) %>%
pivot_longer(`1`:`8`, names_to = "gose", values_to = "probability") %>%
group_by(gupi ) %>%
mutate(
probability = ifelse(gose == "1", 0.0, probability),
probability = probability / sum(probability),
gose = sprintf("gose_%s_probability", gose)
) %>%
ungroup %>%
pivot_wider(names_from = gose, values_from = probability)
# start by adding confirmed <= 180d deaths back in
tbl_final_imputations <- readr::read_csv(
"tbl_gose.csv",
col_types = "cinncD"
) %>%
rename(gupi = subjectId) %>%
select(gupi, Subject.DeathDate) %>%
distinct %>%
transmute(
gupi = gupi,
death_days = difftime(Subject.DeathDate, "1970-01-01", unit = "days") %>% as.numeric
) %>%
left_join(tbl_posteriors, by = "gupi") %>%
pivot_longer(
names_to = "gose",
values_to = "probability",
gose_1_probability:gose_8_probability
) %>%
mutate(
probability = ifelse(
is.na(death_days) | (death_days > 180),
probability,
ifelse(gose != "gose_1_probability", 0.0, 1.0)
)
) %>%
group_by(gupi) %>%
mutate(
gose_map_imputed = ifelse(all(is.na(probability)),
NA,
names(tbl_posteriors %>% select(-gupi))[which.max(probability)]
)
) %>%
ungroup() %>%
select(-death_days) %>%
pivot_wider(names_from = "gose", values_from = "probability") %>%
mutate(
gose_map_imputed = stringr::str_extract(
gose_map_imputed,
"[0-9]{1}(_or_3){0,1}"
)
) %>%
left_join(
# add new column giving priority to per-protocol gose
readr::read_csv(
"tbl_gose.csv",
col_types = "cinncD"
) %>%
rename(gupi = subjectId) %>%
transmute(
gupi = gupi,
gose = Outcomes.DerivedCompositeGOSE,
days = Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
filter(
complete.cases(.),
days >= params$per_protocol_window[1],
days <= params$per_protocol_window[2]
) %>%
group_by(gupi) %>%
summarize(
gose = gose[which.min(abs(days - 180))],
days = days[which.min(abs(days - 180))],
.groups = "drop"
) %>%
transmute(
gupi,
gose_per_protocol = ifelse(
gose %in% c(2, 3),
"2_or_3",
as.character(gose)
)
),
by = "gupi"
) %>%
mutate(
# gose_map_imputed is only 1 when we know that the individual died before
# 6 months, i.e. in these cases the per-protocol gose should also be 1
gose_per_protocol = ifelse(gose_map_imputed == "1", "1", gose_per_protocol),
gose_map_per_protocol_combined = ifelse(
is.na(gose_per_protocol),
gose_map_imputed,
gose_per_protocol
)
) %>%
# reorder
select(
gupi,
gose_per_protocol,
gose_map_imputed,
gose_map_per_protocol_combined,
everything()
)
tbl_final_imputations %>%
select(
gose_map_imputed,
gose_per_protocol
) %>%
filter(
complete.cases(.)
) %>%
mutate_all(
~factor(.x, levels = c("1", "2_or_3", as.character(4:8)))
) %>%
{caret::confusionMatrix(
data = .$gose_map_imputed,
reference = .$gose_per_protocol
)$table} %>%
as_tibble() %>%
rename(
`GOSe, MAP imputed` = Prediction,
`GOSe, closest per-protocol` = Reference
) %>%
ggplot(aes(y = `GOSe, MAP imputed`, x = `GOSe, closest per-protocol`)) +
geom_raster(aes(fill = n)) +
geom_text(aes(label = n)) +
coord_fixed(expand = FALSE) +
scale_fill_continuous(low = "#FFFFFF", high = "#555555") +
theme_bw() +
ggtitle("Confusion matrix: imputed vs. per-protocol")
tbl_final_imputations %>% {
tmp <- .
names(tmp) <- gsub("gose", "Subject.DerivedImputed180DaysGOSE", x = names(.))
tmp } %>%
rename(
"Subject.GOSE6monthEndpointDerived" = "Subject.DerivedImputed180DaysGOSE_map_per_protocol_combined"
) %>%
write_csv("tbl_gose_imputed.csv")
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin19.5.0 (64-bit)
## Running under: macOS Catalina 10.15.5
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.10_1/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] msm_1.6.8 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.1
## [5] purrr_0.3.4 readr_1.3.1 tidyr_1.1.1 tibble_3.0.3
## [9] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.1 jsonlite_1.7.0 splines_4.0.2
## [4] foreach_1.5.0 prodlim_2019.11.13 modelr_0.1.8
## [7] assertthat_0.2.1 expm_0.999-5 stats4_4.0.2
## [10] blob_1.2.1 renv_0.11.0 cellranger_1.1.0
## [13] yaml_2.2.1 ipred_0.9-9 pillar_1.4.6
## [16] backports_1.1.8 lattice_0.20-41 glue_1.4.1
## [19] pROC_1.16.2 digest_0.6.25 rvest_0.3.5
## [22] colorspace_1.4-1 recipes_0.1.13 htmltools_0.5.0
## [25] Matrix_1.2-18 plyr_1.8.6 timeDate_3043.102
## [28] pkgconfig_2.0.3 broom_0.7.0 haven_2.3.1
## [31] caret_6.0-86 mvtnorm_1.1-1 scales_1.1.1
## [34] gower_0.2.2 lava_1.6.7 farver_2.0.3
## [37] generics_0.0.2 ellipsis_0.3.1 withr_2.2.0
## [40] nnet_7.3-14 cli_2.0.2 survival_3.1-12
## [43] magrittr_1.5 crayon_1.3.4 readxl_1.3.1
## [46] evaluate_0.14 fs_1.4.2 fansi_0.4.1
## [49] nlme_3.1-148 MASS_7.3-51.6 xml2_1.3.2
## [52] class_7.3-17 data.table_1.12.8 tools_4.0.2
## [55] hms_0.5.3 lifecycle_0.2.0 munsell_0.5.0
## [58] reprex_0.3.0 e1071_1.7-3 compiler_4.0.2
## [61] rlang_0.4.7 grid_4.0.2 iterators_1.0.12
## [64] rstudioapi_0.11 labeling_0.3 rmarkdown_2.3
## [67] ModelMetrics_1.2.2.2 gtable_0.3.0 codetools_0.2-16
## [70] DBI_1.1.0 reshape2_1.4.4 R6_2.4.1
## [73] lubridate_1.7.9 knitr_1.29 stringi_1.4.6
## [76] Rcpp_1.0.5 vctrs_0.3.2 rpart_4.1-15
## [79] dbplyr_1.4.4 tidyselect_1.1.0 xfun_0.15