initial commit

97737a15 · Kevin · 97737a15 · 97737a15 · 97737a15 · 97737a15
Commit 97737a15 authored Feb 02, 2019 by Kevin
19 changed files
--- a/.gitignore
+++ b/.gitignore
+output
+.snakemake
+.Rproj.user
+.cache
+*.Rproj
+*.rds
+*.Rhistory
\ No newline at end of file
--- a/Snakefile
+++ b/Snakefile
+singularity: "docker://kkmann/gose-6mo-imputation@sha256:02a8f55c53d6f4917193abda823938e1ad7e8891fd6c392bd94f509e058eb34b"
+
+configfile: "config.yml"
+
+
+
+
+
+rule import_neurobot_csv:
+    output:
+        "data/{version}/df_baseline.rds",
+        "data/{version}/df_ctmri.rds",
+        "data/{version}/df_imaging.rds",
+        "data/{version}/df_labs.rds",
+        "data/{version}/df_gose.rds"
+    shell:
+        """
+        Rscript scripts/import_neurobot_data.R data/{wildcards.version} data/{wildcards.version}
+        """
+
+
+
+
+
+rule prepare_data:
+    input:
+        rules.import_neurobot_csv.output,
+        markdown = "reports/prepare_data.Rmd"
+    output:
+        "output/{version}/data/df_gose.rds",
+        "output/{version}/data/df_baseline.rds",
+        "output/{version}/prepare_data.pdf",
+        figures = "output/{version}/prepare_data_figures.zip"
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data
+        Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(datapath = '../data/{wildcards.version}', max_lab_days = {config[max_lab_days]}, seed = {config[seed]}))"
+        mv reports/*.rds output/{wildcards.version}/data
+        mv reports/figures.zip {output.figures}
+        """
+
+
+
+
+
+rule impute_baseline:
+    input:
+        rules.prepare_data.output
+    output:
+        ["output/{version}/data/mi_baseline/df_baseline_mi_%i.rds" % i for i in range(1, config["mi_m"] + 1)]
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data/mi_baseline
+        Rscript scripts/impute_baseline.R output/{wildcards.version}/data/df_baseline.rds output/{wildcards.version}/data/mi_baseline {config[mi_m]} {config[mi_maxiter]} {config[seed]}
+        """
+
+
+
+
+
+rule generate_validation_data:
+    input:
+        rules.prepare_data.output,
+        rules.impute_baseline.output
+    output:
+        ["output/{version}/data/validation/df_%s_mi_%i_fold_%i.rds" % (s, i, j)
+          for s in ("train", "test")
+          for i in range(1, config["mi_m"] + 1)
+          for j in range(1, config["folds"] + 1)
+        ]
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data/validation
+        Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
+        """
+
+
+# adjust threads by model type
+def get_rule_threads(wildcards):
+    if wildcards.model in ("locf", "msm"):
+        return 1
+    else:
+        return config["stan"]["chains"]
+
+rule fit_model_validation_set:
+    input:
+        "config.yml",
+        "output/{version}/data/validation/df_train_mi_{i}_fold_{j}.rds"
+    output:
+        "output/{version}/data/validation/posteriors/{model}/df_posterior_mi_{i}_fold_{j}.rds"
+    threads:
+        get_rule_threads
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data/validation/posteriors/{wildcards.model}
+        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
+        """
+
+
+
+
+# helper rule to just build all posterior datasets
+rule model_posteriors:
+    input:
+        ["output/v1.1/data/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds" % (m, i, j)
+          for m in ("locf", "msm", "gp", "gp_nb", "mm", "mm_nb")
+          for i in range(1, config["mi_m"] + 1)
+          for j in range(1, config["folds"] + 1)
+        ]
+
+rule model_assessment:
+    input:
+        pop_report = rules.prepare_data.output,
+        posteriors = rules.model_posteriors.input,
+        markdown   = "reports/model_assessment.Rmd"
+    output:
+        pdf     = "output/{version}/model_assessment.pdf",
+        figures = "output/{version}/model_assessment_figures.zip"
+    shell:
+        """
+        mkdir -p output/{wildcards.version}
+        Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(data_dir = '../output/{wildcards.version}/data', config_file = '../config.yml'))"
+        mv reports/figures.zip {output.figures}
+        """
+
+
+
+
+
+rule generate_imputation_data:
+    input:
+        rules.prepare_data.output,
+        rules.impute_baseline.output
+    output:
+        ["output/{version}/data/imputation/df_mi_%i.rds" % i
+            for i in range(1, config["mi_m"] + 1)
+        ]
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data/imputation
+        Rscript scripts/generate_imputation_data.R output/{wildcards.version}/data {config[mi_m]}
+        """
+
+
+
+
+
+# rules for imputing on entire dataset
+rule model_impute:
+    input:
+        "config.yml",
+        "output/{version}/data/imputation/df_mi_{i}.rds"
+    output:
+        "output/{version}/data/imputation/{model}/df_gose_imputed_mi_{i}.rds"
+    threads:
+        get_rule_threads
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
+        Rscript models/{wildcards.model}/fit.R {input[1]} {output}
+        """
+
+# final reported values are a combination of the imputed and per-protocol
+# observed ones
+rule post_process_imputations:
+    input:
+        "config.yml",
+        ["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
+            for i in range(1, config["mi_m"] + 1)
+        ]
+    output:
+        "output/{version}/data/imputation/{model}/df_gose_imputed.csv"
+    shell:
+        """
+        mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
+        Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output}
+        """
--- a/config.yml
+++ b/config.yml
+seed:
+  42
+max_lab_days:
+  3
+mi_m:
+  2
+mi_maxiter:
+  5
+folds:
+  2
+t_out:
+  180
+t_buffer:
+  14
+t_out_msm:
+  180
+maxiter_msm:
+  20
+fnscale_msm:
+  12000
+stan:
+  iter:           10
+  warmup:         10
+  chains:          2
+  adapt_delta:      .95
+  max_treedepth:  10
+
+gp:
+  t_knots: [45, 90, 180, 270, 360]
--- a/models/gp/fit.R
+++ b/models/gp/fit.R
+#!/usr/bin/env Rscript
+library(tidyverse)
+library(rstan)
+
+args        <- commandArgs(trailingOnly = TRUE)
+
+inputfile   <- args[[1]]
+outputfile  <- args[[2]]
+modelfile   <- "models/gp/gp.stan"
+
+config      <- yaml::read_yaml("config.yml")
+
+df          <- readRDS(inputfile)
+
+model_matrix_baseline <- df %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  group_by(gupi) %>%
+  select(
+    -Outcomes.DerivedCompositeGOSEDaysPostInjury, -Outcomes.DerivedCompositeGOSE
+  ) %>%
+  summarize_all(first) %>%
+  model.matrix(~ . - gupi - 1, data = .)
+
+df_start_end_index <- df %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  mutate(
+    tmp = row_number()
+  ) %>%
+  group_by(gupi) %>%
+  summarize(
+    start_row = first(tmp),
+    end_row   = last(tmp)
+  )
+
+df_gose <- df %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  transmute(
+    gupi = gupi %>% factor %>% as.numeric,
+    Outcomes.DerivedCompositeGOSE = as.numeric(Outcomes.DerivedCompositeGOSE),
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = Outcomes.DerivedCompositeGOSEDaysPostInjury
+  )
+
+stan_data <- list(
+  N             = nrow(df),
+  n_pat         = df_gose$gupi %>% unique %>% length,
+  pat_start_ind = df_start_end_index$start_row,
+  pat_end_ind   = df_start_end_index$end_row,
+  t             = df_gose$Outcomes.DerivedCompositeGOSEDaysPostInjury,
+  y             = df_gose$Outcomes.DerivedCompositeGOSE,
+  id            = df_gose$gupi,
+  modelmatrix   = model_matrix_baseline,
+  p_cov         = ncol(model_matrix_baseline),
+  t_out_        = config$t_out %>% as.matrix,
+  n_t_out       = length(config$t_out),
+  knots         = config$gp$t_knots,
+  n_knots       = length(config$gp$t_knots)
+)
+
+seed <- digest::digest(stan_data, algo = "sha1") %>%
+  substr(1, 5) %>%
+  strtoi(base = 16)
+cat(sprintf("seed: %i\n\r", seed))
+
+rstan::rstan_options(auto_write = TRUE)
+
+mdl <- rstan::stan(
+    file     = modelfile,
+    data     = stan_data,
+    chains   = config$stan$chains,
+    cores    = config$stan$chains,
+    seed     = seed,
+    iter     = config$stan$warmup + config$stan$iter,
+    warmup   = config$stan$warmup,
+    refresh  = 1,
+    control  = list(
+      max_treedepth = config$stan$max_treedepth,
+      adapt_delta   = config$stan$adapt_delta
+    )
+  )
+
+df_posteriors <- rstan::extract(mdl)$gose_out %>% # (n_sample, n_pat, n_t)
+  # relative frequencies
+  apply(
+    MARGIN = c(2, 3),
+    function(x) table(factor(x, levels = 1:8)) / length(x) %>%  as.numeric()
+  ) %>% # (8, n_pat, n_t) -> permute
+  aperm(c(2, 1, 3)) %>%
+  as.data.frame %>%
+  as_tibble %>%
+  mutate(
+    gupi = row_number()
+  ) %>%
+  gather(GOSE.t, probability, -gupi) %>%
+  separate(col = GOSE.t, c("GOSE", "t"), sep = "\\.", convert = TRUE) %>%
+  mutate(
+    t    = stan_data$t_out_[t, 1],
+    gupi = (df %>%
+      arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+      .[["gupi"]] %>%
+      unique)[gupi]
+  ) %>%
+  select(gupi, t, GOSE, probability) %>%
+  arrange(gupi, t)
+
+saveRDS(df_posteriors, outputfile)
--- a/models/gp/gp.stan
+++ b/models/gp/gp.stan
+data {
+
+  // total number of observations
+  int<lower=1> N;
+  int<lower=1> n_pat;
+  // start / stop index per patient
+  int<lower=1, upper=N> pat_start_ind[n_pat];
+  int<lower=1, upper=N> pat_end_ind[n_pat];
+  // actual data:
+  real t[N]; // sampling times
+  real y[N]; // outcomes
+  int id[N]; // id as integers
+  int<lower=1> n_knots; // number of knots to model population mean
+  real<lower=0.0> knots[n_knots]; // knot positions
+  int n_t_out; // number of output points
+  real t_out_[n_t_out, 1]; // output times
+  int p_cov; // number covariates (model matrix)
+  matrix[n_pat, p_cov] modelmatrix;
+
+}
+
+
+transformed data {
+
+  real t_out[n_t_out] = t_out_[1:n_t_out, 1];
+
+}
+
+
+parameters {
+
+  // standard deviation of the population mean process
+  real<lower=0> s_pop_mean;
+  // lengthscale of the population mean process
+  // real<lower=0> l_pop_mean;
+  real<lower=.5, upper=8.5> knots_y[n_knots];
+
+  // standard deviation of the population mean process
+  real<lower=0> s;
+  // lengthscale of the population mean process
+  real<lower=30, upper=120> l;
+  // nugget term standard deviation
+  real<lower=.01> s_nugg;
+
+  vector[p_cov] beta;
+
+}
+
+transformed parameters {
+
+  matrix[n_knots, n_knots] K_knots_inv = inverse(cov_exp_quad(knots, s_pop_mean, l));
+
+  vector[n_pat] lin_pred = modelmatrix * beta;
+
+}
+
+model {
+
+  // priors
+  s          ~ normal(4, 3);
+  s_pop_mean ~ normal(1, 1);
+  l          ~ normal(60, 14);
+  s_nugg     ~ normal(0, .1);
+  beta       ~ normal(0, 1);
+
+  for (i in 1:n_pat) {
+
+    int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
+
+    // latent mean process
+    real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
+    matrix[n_sub, n_knots] K_latent_12 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
+    vector[n_sub] mu_sub = K_latent_12 * K_knots_inv * to_vector(knots_y);
+
+    vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
+
+    matrix[n_sub, n_sub] Sigma = cov_exp_quad(t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub));
+
+    gose ~ multi_normal(
+      mu_sub + rep_vector(lin_pred[i], n_sub),
+      Sigma
+    );
+
+  }
+
+}
+
+generated quantities {
+
+  real gose_out[n_pat, n_t_out];
+  row_vector[n_t_out] tmp;
+  matrix[n_t_out, n_t_out] Sigma;
+
+  for (i in 1:n_pat) {
+
+    int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
+
+    // latent mean process
+    real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
+    matrix[n_sub, n_sub] K_oo_inv = inverse(cov_exp_quad(t_sub, t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub)));
+    matrix[n_t_out, n_sub] K_no    = cov_exp_quad(t_out, t_sub, s, l);
+
+    matrix[n_t_out, n_knots] K_latent_12 = cov_exp_quad(t_out, knots, s_pop_mean, l);
+    vector[n_t_out] mu_out = rep_vector(lin_pred[i], n_t_out) + K_latent_12 * K_knots_inv * to_vector(knots_y);
+
+    matrix[n_sub, n_knots] K_latent_12_2 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
+    vector[n_sub] mu_sub = rep_vector(lin_pred[i], n_sub) + K_latent_12_2 * K_knots_inv * to_vector(knots_y);
+
+    // observed values
+    vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
+
+    // posterior covariance matrix
+    Sigma = cov_exp_quad(t_out, t_out, s, l) - K_no * K_oo_inv * K_no' + diag_matrix(rep_vector(.001, n_t_out));
+
+    tmp = multi_normal_rng(
+      mu_out + K_no * K_oo_inv * (gose - mu_sub),
+      Sigma
+    )';
+
+    // discretize
+    for (j in 1:n_t_out) {
+      gose_out[i, j] = round(tmp[j]);
+      if (gose_out[i, j] > 8) {
+        gose_out[i, j] = 8;
+      }
+      if (gose_out[i, j] < 1) {
+        gose_out[i, j] = 1;
+      }
+    }
+
+  }
+
+}
--- a/models/gp_nb/fit.R
+++ b/models/gp_nb/fit.R
+#!/usr/bin/env Rscript
+library(tidyverse)
+library(rstan)
+
+args        <- commandArgs(trailingOnly = TRUE)
+
+inputfile   <- args[[1]]
+outputfile  <- args[[2]]
+modelfile   <- "models/gp_nb/gp_nb.stan"
+
+config      <- yaml::read_yaml("config.yml")
+
+df          <- readRDS(inputfile)
+
+model_matrix_baseline <- df %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  group_by(gupi) %>%
+  select(
+    -Outcomes.DerivedCompositeGOSEDaysPostInjury, -Outcomes.DerivedCompositeGOSE
+  ) %>%
+  summarize_all(first) %>%
+  model.matrix(~ . - gupi - 1, data = .)
+
+df_start_end_index <- df %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  mutate(
+    tmp = row_number()
+  ) %>%
+  group_by(gupi) %>%
+  summarize(
+    start_row = first(tmp),
+    end_row   = last(tmp)
+  )
+
+df_gose <- df %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  transmute(
+    gupi = gupi %>% factor %>% as.numeric,
+    Outcomes.DerivedCompositeGOSE = as.numeric(Outcomes.DerivedCompositeGOSE),
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = Outcomes.DerivedCompositeGOSEDaysPostInjury
+  )
+
+stan_data <- list(
+  N             = nrow(df),
+  n_pat         = df_gose$gupi %>% unique %>% length,
+  pat_start_ind = df_start_end_index$start_row,
+  pat_end_ind   = df_start_end_index$end_row,
+  t             = df_gose$Outcomes.DerivedCompositeGOSEDaysPostInjury,
+  y             = df_gose$Outcomes.DerivedCompositeGOSE,
+  id            = df_gose$gupi,
+  modelmatrix   = model_matrix_baseline,
+  p_cov         = ncol(model_matrix_baseline),
+  t_out_        = config$t_out %>% as.matrix,
+  n_t_out       = length(config$t_out),
+  knots         = config$gp$t_knots,
+  n_knots       = length(config$gp$t_knots)
+)
+
+seed <- digest::digest(stan_data, algo = "sha1") %>%
+  substr(1, 5) %>%
+  strtoi(base = 16)
+cat(sprintf("seed: %i\n\r", seed))
+
+rstan::rstan_options(auto_write = TRUE)
+
+mdl <- rstan::stan(
+    file     = modelfile,
+    data     = stan_data,
+    chains   = config$stan$chains,
+    cores    = config$stan$chains,
+    seed     = seed,
+    iter     = config$stan$warmup + config$stan$iter,
+    warmup   = config$stan$warmup,
+    refresh  = 1,
+    control  = list(
+      max_treedepth = config$stan$max_treedepth,
+      adapt_delta   = config$stan$adapt_delta
+    )
+  )
+
+df_posteriors <- rstan::extract(mdl)$gose_out %>% # (n_sample, n_pat, n_t)
+  # relative frequencies
+  apply(
+    MARGIN = c(2, 3),
+    function(x) table(factor(x, levels = 1:8)) / length(x) %>%  as.numeric()
+  ) %>% # (8, n_pat, n_t) -> permute
+  aperm(c(2, 1, 3)) %>%
+  as.data.frame %>%
+  as_tibble %>%
+  mutate(
+    gupi = row_number()
+  ) %>%
+  gather(GOSE.t, probability, -gupi) %>%
+  separate(col = GOSE.t, c("GOSE", "t"), sep = "\\.", convert = TRUE) %>%
+  mutate(
+    t    = stan_data$t_out_[t, 1],
+    gupi = (df %>%
+      arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+      .[["gupi"]] %>%
+      unique)[gupi]
+  ) %>%
+  select(gupi, t, GOSE, probability) %>%
+  arrange(gupi, t)
+
+saveRDS(df_posteriors, outputfile)
--- a/models/gp_nb/gp_nb.stan
+++ b/models/gp_nb/gp_nb.stan
+data {
+
+  // total number of observations
+  int<lower=1> N;
+  int<lower=1> n_pat;
+  // start / stop index per patient
+  int<lower=1, upper=N> pat_start_ind[n_pat];
+  int<lower=1, upper=N> pat_end_ind[n_pat];
+  // actual data:
+  real t[N]; // sampling times
+  real y[N]; // outcomes
+  int id[N]; // id as integers
+  int<lower=1> n_knots; // number of knots to model population mean
+  real<lower=0.0> knots[n_knots]; // knot positions
+  int n_t_out; // number of output points
+  real t_out_[n_t_out, 1]; // output times
+  int p_cov; // number covariates (model matrix)
+  matrix[n_pat, p_cov] modelmatrix;
+
+}
+
+
+transformed data {
+
+  real t_out[n_t_out] = t_out_[1:n_t_out, 1];
+
+}
+
+
+parameters {
+
+  // standard deviation of the population mean process
+  real<lower=0> s_pop_mean;
+  // lengthscale of the population mean process
+  // real<lower=0> l_pop_mean;
+  real<lower=.5, upper=8.5> knots_y[n_knots];
+
+  // standard deviation of the population mean process
+  real<lower=0> s;
+  // lengthscale of the population mean process
+  real<lower=30, upper=120> l;
+  // nugget term standard deviation
+  real<lower=.01> s_nugg;
+
+}
+
+transformed parameters {
+
+  matrix[n_knots, n_knots] K_knots_inv = inverse(cov_exp_quad(knots, s_pop_mean, l));
+
+}
+
+model {
+
+  // priors
+  s          ~ normal(4, 3);
+  s_pop_mean ~ normal(1, 1);
+  l          ~ normal(60, 14);
+  s_nugg     ~ normal(0, .1);
+
+  for (i in 1:n_pat) {
+
+    int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
+
+    // latent mean process
+    real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
+    matrix[n_sub, n_knots] K_latent_12 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
+    vector[n_sub] mu_sub = K_latent_12 * K_knots_inv * to_vector(knots_y);
+
+    vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
+
+    matrix[n_sub, n_sub] Sigma = cov_exp_quad(t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub));
+
+    gose ~ multi_normal(
+      mu_sub,
+      Sigma
+    );
+
+  }
+
+}
+
+generated quantities {
+
+  real gose_out[n_pat, n_t_out];
+  row_vector[n_t_out] tmp;
+  matrix[n_t_out, n_t_out] Sigma;
+
+  for (i in 1:n_pat) {
+
+    int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
+
+    // latent mean process
+    real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
+    matrix[n_sub, n_sub] K_oo_inv = inverse(cov_exp_quad(t_sub, t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub)));
+    matrix[n_t_out, n_sub] K_no    = cov_exp_quad(t_out, t_sub, s, l);
+
+    matrix[n_t_out, n_knots] K_latent_12 = cov_exp_quad(t_out, knots, s_pop_mean, l);
+    vector[n_t_out] mu_out = K_latent_12 * K_knots_inv * to_vector(knots_y);
+
+    matrix[n_sub, n_knots] K_latent_12_2 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
+    vector[n_sub] mu_sub = K_latent_12_2 * K_knots_inv * to_vector(knots_y);
+
+    // observed values
+    vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
+
+    // posterior covariance matrix
+    Sigma = cov_exp_quad(t_out, t_out, s, l) - K_no * K_oo_inv * K_no' + diag_matrix(rep_vector(.001, n_t_out));
+
+    tmp = multi_normal_rng(
+      mu_out + K_no * K_oo_inv * (gose - mu_sub),
+      Sigma
+    )';
+
+    // discretize
+    for (j in 1:n_t_out) {
+      gose_out[i, j] = round(tmp[j]);
+      if (gose_out[i, j] > 8) {
+        gose_out[i, j] = 8;
+      }
+      if (gose_out[i, j] < 1) {
+        gose_out[i, j] = 1;
+      }
+    }
+
+  }
+
+}
--- a/models/locf/fit.R
+++ b/models/locf/fit.R
+#!/usr/bin/env Rscript
+library(tidyverse)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+inputfile  <- args[[1]]
+outputfile <- args[[2]]
+
+config <- yaml::read_yaml("config.yml")
+
+# read and process input data
+df <- readRDS(inputfile) %>%
+  mutate(
+    Outcomes.DerivedCompositeGOSE = factor(
+      Outcomes.DerivedCompositeGOSE,
+      levels  = c(1, 3:8), # 2 is too infrequent to be fitted
+      ordered = TRUE
+    )
+  )
+
+impute_locf <- function(t, GOSE, tolerance, t_pred) {
+  res  <- numeric(length(t_pred))
+  res[] <- NA_integer_
+  for (i in 1:length(t_pred)) {
+    res[i] <- GOSE[t < t_pred[i] + tolerance] %>%
+      tail(1) %>% # use last if multiple available
+      ifelse(length(.) == 0, NA, .)
+  }
+  return(res %>% as.integer)
+}
+
+df_posteriors <- df %>%
+  group_by(gupi) %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSE) %>%
+  do(
+    t = config$t_out,
+    GOSE_predicted = impute_locf(.$Outcomes.DerivedCompositeGOSEDaysPostInjury, .$Outcomes.DerivedCompositeGOSE %>% as.character %>% as.integer, config$t_buffer, config$t_out)
+  ) %>%
+  unnest %>%
+  mutate(
+    `1` = (GOSE_predicted == 1) %>% as.numeric,
+    `2` = (GOSE_predicted == 2) %>% as.numeric,
+    `3` = (GOSE_predicted == 3) %>% as.numeric,
+    `4` = (GOSE_predicted == 4) %>% as.numeric,
+    `5` = (GOSE_predicted == 5) %>% as.numeric,
+    `6` = (GOSE_predicted == 6) %>% as.numeric,
+    `7` = (GOSE_predicted == 7) %>% as.numeric,
+    `8` = (GOSE_predicted == 8) %>% as.numeric
+  ) %>%
+  select(-GOSE_predicted) %>%
+  gather(GOSE, probability, `1`:`8`) %>%
+  mutate(
+    gupi = as.character(gupi),
+    GOSE = as.integer(GOSE)
+  ) %>%
+  arrange(gupi, t, GOSE)
+
+saveRDS(df_posteriors, outputfile)
--- a/models/mm/fit.R
+++ b/models/mm/fit.R
+#!/usr/bin/env Rscript
+library(tidyverse)
+library(brms)
+
+args        <- commandArgs(trailingOnly = TRUE)
+
+inputfile   <- args[[1]]
+outputfile  <- args[[2]]
+
+config      <- yaml::read_yaml("config.yml")
+
+# read and process input data
+df <- readRDS(inputfile) %>%
+  mutate(
+    Outcomes.DerivedCompositeGOSE = factor(
+      Outcomes.DerivedCompositeGOSE,
+      levels = Outcomes.DerivedCompositeGOSE %>%
+        unique %>%
+        as.numeric %>%
+        sort %>%
+        as.character,
+      ordered = TRUE
+    )
+  ) %>%
+  mutate_if(is.character, factor)
+
+# generate random seed from first 5 digits of the sha1 hash value
+seed <- digest::digest(df, algo = "sha1") %>%
+  substr(1, 5) %>%
+  strtoi(base = 16)
+cat(sprintf("seed: %i\n\r", seed))
+
+formula <- Outcomes.DerivedCompositeGOSE ~
+  s(Outcomes.DerivedCompositeGOSEDaysPostInjury) +
+  (I(Outcomes.DerivedCompositeGOSEDaysPostInjury^2) + Outcomes.DerivedCompositeGOSEDaysPostInjury + 1 | gupi) +
+  . - gupi - Outcomes.DerivedCompositeGOSEDaysPostInjury - Outcomes.DerivedCompositeGOSE
+
+mdl <- brms::brm(
+  formula,
+  data     = df,
+  family   = brms::cumulative("logit", threshold = "flexible"),
+  chains   = config$stan$chains,
+  cores    = config$stan$chains,
+  seed     = seed,
+  iter     = config$stan$warmup + config$stan$iter,
+  warmup   = config$stan$warmup,
+  refresh  = 1,
+  control  = list(
+    max_treedepth = config$stan$max_treedepth,
+    adapt_delta   = config$stan$adapt_delta
+  ),
+  save_warmup = FALSE
+)
+
+df_new_data <- expand.grid(
+    gupi = df$gupi %>% unique,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out
+  ) %>%
+  as_tibble %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  left_join(
+    df %>%
+      select(-Outcomes.DerivedCompositeGOSE, -Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+      group_by(gupi) %>%
+      summarize_all(first) %>%
+      ungroup,
+    by = c("gupi")
+  )
+
+df_posteriors <- predict(mdl, df_new_data, summary = TRUE) %>%
+  as_tibble() %>%
+  mutate(
+    gupi = df_new_data$gupi,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = df_new_data$Outcomes.DerivedCompositeGOSEDaysPostInjury
+  ) %>% {
+    # since not all levels are observed/fitted, need to extend predictions
+    # by zero for those categories, dplyr black magic
+    cols         <- sprintf("P(Y = %s)", 1:8)
+    missing_cols <- cols[!(cols %in% names(as_tibble(.)))]
+    res <- .
+    for (newcol in missing_cols) {
+      # add missing factor levels
+      res <- mutate(res, !!newcol := 0.0)
+    }
+    res <- select(res, gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury, everything())
+    return(res)
+  } %>%
+  rename(t = Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  gather(GOSE, probability, starts_with("P(Y =")) %>%
+  mutate(
+    GOSE = factor(GOSE, labels = 1:8, levels = sprintf("P(Y = %s)", 1:8))
+  ) %>%
+  arrange(gupi, t, GOSE) %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(
+    GOSE = GOSE %>% as.integer,
+    gupi = as.character(gupi)
+  )
+
+saveRDS(df_posteriors, outputfile)
--- a/models/mm_nb/fit.R
+++ b/models/mm_nb/fit.R
+#!/usr/bin/env Rscript
+library(tidyverse)
+library(brms)
+
+args        <- commandArgs(trailingOnly = TRUE)
+
+inputfile   <- args[[1]]
+outputfile  <- args[[2]]
+
+config      <- yaml::read_yaml("config.yml")
+
+# read and process input data
+df <- readRDS(inputfile) %>%
+  mutate(
+    Outcomes.DerivedCompositeGOSE = factor(
+      Outcomes.DerivedCompositeGOSE,
+      levels = Outcomes.DerivedCompositeGOSE %>%
+        unique %>%
+        as.numeric %>%
+        sort %>%
+        as.character,
+      ordered = TRUE
+    )
+  ) %>%
+  mutate_if(is.character, factor)
+
+# generate random seed from first 5 digits of the sha1 hash value
+seed <- digest::digest(df, algo = "sha1") %>%
+  substr(1, 5) %>%
+  strtoi(base = 16)
+cat(sprintf("seed: %i\n\r", seed))
+
+formula <- Outcomes.DerivedCompositeGOSE ~
+  s(Outcomes.DerivedCompositeGOSEDaysPostInjury) +
+  (I(Outcomes.DerivedCompositeGOSEDaysPostInjury^2) + Outcomes.DerivedCompositeGOSEDaysPostInjury + 1 | gupi)
+
+mdl <- brms::brm(
+  formula,
+  data     = df,
+  family   = brms::cumulative("logit", threshold = "flexible"),
+  chains   = config$stan$chains,
+  cores    = config$stan$chains,
+  seed     = seed,
+  iter     = config$stan$warmup + config$stan$iter,
+  warmup   = config$stan$warmup,
+  refresh  = 1,
+  control  = list(
+    max_treedepth = config$stan$max_treedepth,
+    adapt_delta   = config$stan$adapt_delta
+  ),
+  save_warmup = FALSE
+)
+
+df_new_data <- expand.grid(
+    gupi = df$gupi %>% unique,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out
+  ) %>%
+  as_tibble %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  left_join(
+    df %>%
+      select(-Outcomes.DerivedCompositeGOSE, -Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+      group_by(gupi) %>%
+      summarize_all(first) %>%
+      ungroup,
+    by = c("gupi")
+  )
+
+df_posteriors <- predict(mdl, df_new_data, summary = TRUE) %>%
+  as_tibble() %>%
+  mutate(
+    gupi = df_new_data$gupi,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = df_new_data$Outcomes.DerivedCompositeGOSEDaysPostInjury
+  ) %>% {
+    # since not all levels are observed/fitted, need to extend predictions
+    # by zero for those categories, dplyr black magic
+    cols         <- sprintf("P(Y = %s)", 1:8)
+    missing_cols <- cols[!(cols %in% names(as_tibble(.)))]
+    res <- .
+    for (newcol in missing_cols) {
+      # add missing factor levels
+      res <- mutate(res, !!newcol := 0.0)
+    }
+    res <- select(res, gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury, everything())
+    return(res)
+  } %>%
+  rename(t = Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  gather(GOSE, probability, starts_with("P(Y =")) %>%
+  mutate(
+    GOSE = factor(GOSE, labels = 1:8, levels = sprintf("P(Y = %s)", 1:8))
+  ) %>%
+  arrange(gupi, t, GOSE) %>%
+  mutate_if(is.factor, as.character) %>%
+  mutate(
+    GOSE = GOSE %>% as.integer,
+    gupi = as.character(gupi)
+  )
+
+saveRDS(df_posteriors, outputfile)
--- a/models/msm/fit.R
+++ b/models/msm/fit.R
+#!/usr/bin/env Rscript
+library(tidyverse)
+library(msm)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+inputfile  <- args[[1]]
+outputfile <- args[[2]]
+
+config <- yaml::read_yaml("config.yml")
+
+# filter deaths - we use exact times
+# GOSE = 2 is too infreqeunt
+df_gose <- readRDS(inputfile) %>%
+  mutate(
+    Outcomes.DerivedCompositeGOSE = factor(
+      Outcomes.DerivedCompositeGOSE,
+      levels  = c(1, 3:8, 99), # 2 is too infrequent to be fitted
+      ordered = TRUE
+    ) %>% as.character %>% as.integer
+  ) %>% select(
+    gupi,
+    Outcomes.DerivedCompositeGOSE,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury
+  ) %>%
+  arrange(
+    gupi,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury
+  )
+
+# translate to msm formatting
+tmp <- df_gose %>%
+  rbind(tibble( # only predict GOSE at 180 days - takes too long otherwise
+    gupi   = df_gose$gupi %>% unique,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out_msm,
+    Outcomes.DerivedCompositeGOSE = 99
+  )) %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  mutate(
+    obstype = ifelse(Outcomes.DerivedCompositeGOSE == 1, 3, 1), # make death exactly observed transition
+    Outcomes.DerivedCompositeGOSE = factor(
+      Outcomes.DerivedCompositeGOSE,
+      levels = c(1, 3:8, 99),
+      labels = c(1:7, 99)
+    ) %>% as.character %>% as.numeric
+  ) %>%
+  group_by(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
+  # remove prediction states where we already have a true one
+  filter((n() == 1) | (Outcomes.DerivedCompositeGOSE != 99)) %>%
+  ungroup
+
+# define transition matrix
+Q <- matrix(0, nrow = 7, ncol = 7)
+for (i in 1:7) {
+  for (j in 1:7) {
+    if (i == j - 1 & i != 1) {
+      Q[i, j] <- 1
+    }
+    if (i == j + 1) {
+      Q[i, j] <- 1
+    }
+  }
+}
+Q[2:7, 1] <- 1 # allow instantaneous deaths
+
+# someone comes back from the dead - make sure death is last observation
+fit <- msm(
+  Outcomes.DerivedCompositeGOSE ~ Outcomes.DerivedCompositeGOSEDaysPostInjury,
+  subject   = tmp$gupi,
+  data      = tmp,
+  obstype   = tmp$obstype,
+  gen.inits = TRUE,
+  qmatrix   = Q,
+  censor    = 99,
+  pci       = c(90, 270),
+  censor.states = 1:7,
+  control   = list(
+    fnscale = config$fnscale_msm,
+    maxit   = config$maxiter_msm,
+    trace   = 2
+  )
+)
+
+# get fitted values at censored (prediction) states
+fitted <- viterbi.msm(fit)
+
+# store posteriors
+df_posteriors <- tibble(
+    gupi = fitted$subject,
+    t    = fitted$time
+  ) %>%
+  cbind(
+    fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
+  ) %>%
+  mutate("2" = 0) %>%
+  filter(t == config$t_out_msm) %>%
+  gather(GOSE, probability, as.character(1:8)) %>%
+  arrange(gupi, t, GOSE)
+
+saveRDS(df_posteriors, outputfile)
--- a/reports/model_assessment.Rmd
+++ b/reports/model_assessment.Rmd
+---
+title: "Imputing GOSE scores in CENTER-TBI"
+
+date: "`r Sys.time()`"
+      
+statistician: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
+
+collaborator: "David Menon (dkm13@cam.ac.uk)"
+
+output: reportr::report
+
+git-commit-hash: "`r system('git rev-parse --verify HEAD', intern=TRUE)`"
+
+git-wd-clean: "`r ifelse(system('git diff-index --quiet HEAD') == 0, 'clean', 'file changes, working directory not clean!')`"
+
+bibliography: "references.bib"
+
+params:
+  data_dir: "../output/v1.1/data"
+  config_file: "../config.yml"
+---
+
+```{r setup-chunk, include=FALSE}
+options(tidyverse.quiet = TRUE) # supresses filter/lag conflicts
+require(tidyverse, quietly = TRUE)
+
+config <- yaml::read_yaml(params$config_file)
+
+set.seed(config$seed)
+```
+
+
+
+# Model descriptions
+
+## Last-observation-carried-forward (LOCF)
+
+We use last-observation-carried-forward (LOCF) as baseline method.
+We define LOCF as the latest observation before the imputation time point and
+aditionally allow a 14 day tolerance period, i.e., if there is a GOSE 
+observation within 14 days of the imputation time point it will always be used.
+Since LOCF is not model-based there might be situations in which a GOSE is
+available only at a later time point and no GOSE can be imputed.
+LOCF imputations are therefore only available for a subset of the study 
+population.
+This implies that all comparisons with different methods must be conducted
+on the subset of individuals for which LOCF is defined (i.e., at least one GOSE 
+observation being available within the first 180 + 14 days).
+
+
+
+## Mixed-effects model
+
+A standard approach to longitudinal data analysis are mixed effects models
+which model individual-specific deviations from a population mean trajectory.
+Here, wer use a mixed-effects model with a spline function for the potential
+non-linear population-mean trajectory (fixed time effect). 
+Patient-individual deviations are modelled as quadratic polynomials to allow 
+sufficient flexibility [^1].
+
+[^1]: Expressed as R model formula the model is given by: $GOSE \sim s(t) + (1 + t + t^2 \,|\, ID)$.
+Here, $s(t)$ denotes the spline function for the population-mean trajectory.
+Quadratic and linear terms for individuals with only one observation are 
+simply shrunk towards the population mean zero but the addition of those effects
+allows for both upward as well as downward facing (quadratic) deviation from the overall
+population mean trajectory.
+
+Baseline covariates can easily be added by including additional fixed effects
+per covariate [^2].
+
+[^2]: I.e., for $k$ additional baseline covariates $X_i, i=1\ldots k$ this is
+represented by the model formula $GOSE \sim s(t) + X_1 + ... + X_k + (1 + t + t^2 \,|\, ID)$.
+
+We account for the discretenss of GOSE by fitting a cumulative link ordinal 
+regression model.
+Logit link and flexible intercepts were employed (no uniform spacing assumed).
+For further details on cumulative link models see @Agresti2003.
+We used the BRMS package @brms2017, @brms2018 for the R environment 
+for statistical computing @r2016 to fit the model using non-informative priors.
+
+
+
+## Gaussian process model
+
+A potential drawback of a mixed effects model to for the longitudinal
+trajectories of individual patients's GOSE is the fact that the individual
+deviations from the population mean GOSE are modelled globally
+via polynomials (e.g. quadratic).
+Since linear and quadratic terms are not identifyable for patients with only one
+observed GOSE value, this implies large uncertainty over the patient-specific
+effects for individuals with one or two observations only, and thus the overall 
+uncertainty of their fit is relatively large.
+Gaussian process regression @rasmussen2006 allows to flexibly model both the 
+individual GOSE trajectories as well as the population mean.
+The main advantage over a mixed effects model with subject specific quadratic
+polynomail deviations from a population mean function is the fact that
+model uncertainty at 6 months for individuals with a single observation close to
+6 months is substantially decreased due to the *local* nature of Gaussian 
+process regression.
+However, it is to be expected that the model will perform slightly worse in terms
+of extrapolation since less structure is learnt than in the proposed mixed
+effects model.
+To model the discrete GOSE outcome, GP regression is applied treating GOSE values
+as numerical and the final predictions are rounded to the closest integer in 
+1 to 8.
+A squared exponential covariance function with shared lengthscale $l$, 
+variance $\sigma^2$, and residual variance (nugget term) 
+$\sigma_{resid.}^2$ was used since the
+spacing of observation is roughly comparable between patients.
+We model the latent population mean trajectory $\mu(t)$ as
+the mean of a Gaussian process with pseudo observations at specific pivot points.
+This approach maintains flexibility of the population mean function but avoids
+the computational complexity of a hierarchical Gaussian process.
+We chose as pivot points for the population mean
+
+```{r print-knots}
+config$gp$t_knots %>% as.numeric()
+```
+
+days after injury.
+Again, baseline covariates may be added to the mean function of the Guassian 
+process in the same way as for the mixed model.
+All parameters are estimated in a fully Bayesian fashion using the stan modelling 
+language @stan2017 and non-informative priors except for the lengthscale of the
+squared exponential kernel.
+Due to the sparseness of the data, the estimated lengthscale will naturally tend
+towards extremely large value allowing for unrealistically long-range dependency 
+between observations.
+We therefore limit the lengthscale to a maximum of 120 days (4 months) an impose
+a normal prior $\mathcal{N}(60, 14)$.
+
+
+
+## Multi-state model
+
+Both Gaussian process regression as well as mixed effects models with spline
+effects are non-linear regression techniques for longitudinal data.
+Multi-state models @msm2011 offer an entirely different approach to modelling 
+the GOSE observations.
+Here, it is assumed that the transitions between adjacent GOSE states can be
+modelled as a Markov process and the transition intensities between adjacent
+states are fitted to the observed data.
+To account for the fact that transition intensities may vary with time,
+piecewise constant transition intensities were fitted to the intervals 
+$[0, 90), [90, 270)$, and $[270, \infty)$ days after injury.
+Due to the large number of transition intensities for an 8-level outcome scale,
+inculsion of baseline covariates turned out to be numerically unstable.
+We therefore only report the results for the model without baseline covariates.
+The model was fitted using the msm package for R @msm2011.
+
+
+
+
+
+# Model validation
+
+We conducted a 3-fold cross validation to assess the quality of the imputed
+GOSE values.
+The test cases are given by the individuals with a GOSE measurement within 180 +/-
+14 days after injury.
+
+```{r load-test-data}
+df_ground_truth <- c()
+
+for (i in 1:config$folds) {
+  
+  df_ground_truth <- rbind(
+    df_ground_truth,
+    # note that GOSE values in test data are not subject to mi - loading the 
+    # first data set is sufficient!
+    sprintf("%s/validation/df_test_mi_1_fold_%i.rds", params$data_dir, i) %>%
+      readRDS() %>% 
+      mutate(fold = i) %>% 
+      select(gupi, fold, Outcomes.DerivedCompositeGOSEDaysPostInjury, Outcomes.DerivedCompositeGOSE)
+  )
+  
+}
+```
+
+
+
+```{r}
+df_ground_truth %>% 
+    ggplot(aes(Outcomes.DerivedCompositeGOSE)) +
+        geom_bar() +
+        facet_wrap(~fold) +
+        theme_bw() +
+        theme(
+            panel.grid.minor = element_blank(), 
+            panel.grid.major.x = element_blank()
+        ) +
+        ggtitle("Distribution of 6-months GOSE values of test cases per fold.")
+```
+
+The distribution of GOSE values in the respective three test sets is well 
+balanced.
+
+
+
+```{r get-model-names}
+modelnames <- lapply(
+    list.dirs(
+      sprintf("%s/validation/posteriors", params$data_dir), 
+      recursive = FALSE
+    ),
+    basename
+  ) %>% 
+  as.character()
+
+cat("models detected:\n\r")
+cat(sprintf("    %s\n\r", modelnames))
+```
+
+```{r load-posteriors}
+df_model_posteriors <- c()
+
+for (modelname in modelnames) {
+  for (i in 1:config$mi_m) {
+    for (j in 1:config$folds) {
+      tryCatch(
+        df_model_posteriors <- rbind(
+          df_model_posteriors,
+          sprintf("%s/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds", 
+                  params$data_dir, modelname, i, j) %>%
+            readRDS() %>% 
+            mutate(imp = i, fold = j, model = modelname) %>% 
+            select(gupi, fold, imp, model, everything()) %>% 
+            filter(
+              gupi %in% (df_ground_truth %>% filter(fold == j) %>% .[["gupi"]])
+            )
+        ), 
+        error = function(e) print(e)
+      )
+    }
+  }
+}
+```
+
+```{r compute-map-predictions}
+df_predictions <- df_model_posteriors %>%
+  group_by(fold, model, gupi, t, GOSE) %>%
+  summarise(probability = mean(probability)) %>%
+  arrange(model, fold, gupi, t, GOSE) %>% 
+  ungroup() %>% 
+  filter(t == 180) %>% 
+  select(-t) %>% 
+  group_by(fold, model, gupi) %>% 
+  do(
+    GOSE = .$GOSE,
+    prediction = rep(which.max(.$probability) %>% 
+      ifelse(length(.) == 0, NA_integer_, .) %>% 
+      ifelse(length(.) > 1, round(mean(.)), .), 8),
+    probability = .$probability
+  ) %>% 
+  unnest %>% 
+  spread(GOSE, probability) %>% 
+  right_join(
+    df_ground_truth %>% 
+      mutate(
+        gupi = as.character(gupi),
+        GOSE = as.integer(Outcomes.DerivedCompositeGOSE)
+    ), 
+    by = c("gupi", "fold")
+  )
+```
+
+
+
+LOCF, by design, may not be able to provide imputed values whenever 
+no observation before 180 + 14 days after injury is observed.
+Since there is reason to assume that these individuals 6-months GOSE will be 
+harder to predict, 
+a comparison of LOCF with the other methods must be conditional on the 
+set of individuals which allow LOCF imputation.
+
+```{r non-locf-ids}
+idx <- df_predictions %>% 
+  filter(model == "locf", !complete.cases(.)) %>% 
+  .[["gupi"]]
+```
+
+Overall, `r length(idx)` out of `r df_predictions %>% filter(model == "locf") %>% nrow` test cases cannot 
+be imputed with the LOCF approach.
+
+
+
+## Comparison conditional on LOCF subset
+
+We first consider results for the set of test cases which allow LOCF imputation.
+
+
+### Confusion matrix
+
+The accuracy of the imputed values can be assessed graphically by inspection
+of the confusion matrix were imputed values are plotted against true GOSE values.
+Note that since we do not have access to true 180 days GOSE values we used a 
+time window of $\pm$ 14 days around 180 days after injury.
+
+```{r compute-confusion-matrices}
+df_average_confusion_matrices <- df_predictions %>% 
+  filter(!(gupi %in% idx)) %>% 
+  group_by(fold, model) %>% 
+  do(
+    confusion_matrix = caret::confusionMatrix(
+        data = factor(.$prediction, levels = 1:8), 
+        reference = factor(.$GOSE, levels = 1:8)
+      ) %>% 
+      as.matrix %>% as_tibble %>% 
+      mutate(`Predicted GOSE` = row_number() %>% as.character) %>% 
+      gather(`True GOSE`, n, 1:8)
+  ) %>% 
+  unnest %>% 
+  group_by(model, `Predicted GOSE`, `True GOSE`) %>% 
+  summarize(n = mean(n)) %>% 
+  ungroup
+```
+
+We first consider the raw, unnormalized counts of the respective confusion 
+matrices.
+These matrices do take into account the non-uniform distribution of the 
+respective true 6-months GOSE values in the test set.
+All values are averaged accross folds.
+
+```{r confusion-matrix-locf, warning=FALSE, message=FALSE, fig.height=4.5, out.width=".9\\textwidth", fig.align='center'}
+df_average_confusion_matrices %>%
+  group_by(model, `True GOSE`) %>%
+  ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = n)) +
+    geom_raster() +
+    geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
+    geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
+    scale_fill_gradient(low = "white", high = "black") +
+    coord_fixed(expand = FALSE) + 
+    labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    ) + 
+    facet_wrap(~model) +
+    ggtitle("Average confusion matrix accross folds (absolute counts)")
+
+ggsave(filename = "confusion_matrices_locf.pdf", width = 7, height = 7)
+ggsave(filename = "confusion_matrices_locf.png", width = 7, height = 7)
+```
+
+
+
+```{r confusion-matrix-col-perc-locf, warning=FALSE, message=FALSE, fig.height=4.5, out.width=".9\\textwidth", fig.align='center'}
+df_average_confusion_matrices %>%
+  group_by(model, `True GOSE`) %>%
+  mutate(
+    `fraction (column)` = n / sum(n),
+    `fraction (column)` = ifelse(is.nan(`fraction (column)`), 0, `fraction (column)`)
+  ) %>% 
+  ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = `fraction (column)`)) +
+    geom_raster() +
+    geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
+    geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
+    scale_fill_gradient("column fraction", low = "white", high = "black", limits = c(0, 1)) +
+    coord_fixed(expand = FALSE) + 
+    labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    ) + 
+    facet_wrap(~model) +
+    ggtitle("Average confusion matrix accross folds (column normalized)")
+
+ggsave(filename = "confusion_matrices_col_perc_locf.pdf", width = 7, height = 7)
+ggsave(filename = "confusion_matrices_col_perc_locf.png", width = 7, height = 7)
+```
+
+### Accuracy
+
+To summarize the overall performance of the respective approaches we consider
+bias, mean absolute error (MAE), and root mean square error (RMSE).
+
+```{r error-scores-stratified-locf, fig.align='center'}
+df_predictions %>%
+  filter(!(gupi %in% idx)) %>% 
+  group_by(model, fold, GOSE) %>% 
+  summarize(
+    RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
+    MAE  = mean(abs(GOSE - prediction), na.rm = TRUE),
+    bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE) 
+  ) %>% 
+  gather(error, value, -model, -fold, -GOSE) %>% 
+  group_by(GOSE, model, error) %>% 
+  summarize(
+      mean = mean(value),
+      sd   = sd(value)
+  ) %>% 
+  ungroup() %>% 
+  ggplot(aes(GOSE, color = model)) + 
+    geom_hline(yintercept = 0, color = "black") +
+    geom_line(aes(y = mean)) + 
+    facet_wrap(~error) +
+    scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
+    theme_bw() +
+    theme(
+      panel.grid.minor   = element_blank(),
+      panel.grid.major.x = element_blank(),
+      axis.text.x        = element_text(angle = 66, hjust = 1)
+    ) +
+    ggtitle("Summary measures by true GOSE value")
+
+ggsave(filename = "imputation_error_stratified_locf.pdf", width = 7, height = 7)
+ggsave(filename = "imputation_error_stratified_locf.png", width = 7, height = 7)
+```
+
+```{r error-scores-locf, fig.align='center'}
+df_predictions %>%
+  filter(!(gupi %in% idx)) %>% 
+  group_by(model, fold) %>% 
+  summarize(
+    RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
+    MAE  = mean(abs(GOSE - prediction), na.rm = TRUE),
+    bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE) 
+  ) %>% 
+  ungroup %>% 
+  gather(error, value, -model, -fold) %>% 
+  ggplot(aes(model, value)) + 
+    geom_hline(yintercept = 0, color = "black") +
+    geom_boxplot() + 
+    facet_wrap(~error) +
+    scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
+    scale_x_discrete("") + 
+    theme_bw() +
+    theme(
+      panel.grid.minor   = element_blank(),
+      panel.grid.major.x = element_blank(),
+      axis.text.x        = element_text(angle = 66, hjust = 1)
+    ) +
+    ggtitle("Overall summary measures")
+
+ggsave(filename = "imputation_error_locf.pdf", width = 7, height = 7)
+ggsave(filename = "imputation_error_locf.png", width = 7, height = 7)
+```
+
+
+Overall, LOCF exhibits a negative bias which is mainly driven by the 
+underestimation of individuals with true GOSE of 6,7, or 8. 
+This also translates to a slightly worse MAE and RMSE - again, this is
+mostly driven by a lack of fit for larger true GOSE values.
+For lower true GOSE values, the fit is very similar to MSM and the mixed effects
+models.
+Gaussian process regression exhibits a bad fit for low true GOSE value which
+are systematically overestimated.
+LOCF performs worst in terms of overall bias, MAE, and RMSE in addition to not
+being able to impute values for the entire set of test cases.
+We therefore drop it from the compariosn and compare the remaining 
+methods on the entire test sets.
+
+
+
+## Comparson on full test set
+
+```{r compute-confusion-matrices-no-locf}
+df_average_confusion_matrices <- df_predictions %>% 
+  filter(!(model == "locf")) %>% 
+  group_by(fold, model) %>% 
+  do(
+    confusion_matrix = caret::confusionMatrix(
+        data = factor(.$prediction, levels = 1:8), 
+        reference = factor(.$GOSE, levels = 1:8)
+      ) %>% 
+      as.matrix %>% as_tibble %>% 
+      mutate(`Predicted GOSE` = row_number() %>% as.character) %>% 
+      gather(`True GOSE`, n, 1:8)
+  ) %>% 
+  unnest %>% 
+  group_by(model, `Predicted GOSE`, `True GOSE`) %>% 
+  summarize(n = mean(n)) %>% 
+  ungroup
+```
+
+
+```{r confusion-matrix, warning=FALSE, message=FALSE, fig.height=5, out.width=".9\\textwidth", fig.align='center'}
+df_average_confusion_matrices %>%
+  group_by(model, `True GOSE`) %>%
+  ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = n)) +
+    geom_raster() +
+    geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
+    geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
+    scale_fill_gradient(low = "white", high = "black") +
+    coord_fixed(expand = FALSE) + 
+    labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    ) + 
+    facet_wrap(~model) +
+    ggtitle("Average confusion matrix accross folds (absolute counts)")
+
+ggsave(filename = "confusion_matrices.pdf", width = 7, height = 7)
+ggsave(filename = "confusion_matrices.png", width = 7, height = 7)
+```
+
+```{r confusion-matrix-col-perc, warning=FALSE, message=FALSE, fig.height=5, out.width=".9\\textwidth", fig.align='center'}
+df_average_confusion_matrices %>%
+  group_by(model, `True GOSE`) %>%
+  mutate(
+    `fraction (column)` = n / sum(n),
+    `fraction (column)` = ifelse(is.nan(`fraction (column)`), 0, `fraction (column)`)
+  ) %>% 
+  ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = `fraction (column)`)) +
+    geom_raster() +
+    geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
+    geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
+    scale_fill_gradient("column fraction", low = "white", high = "black", limits = c(0, 1)) +
+    coord_fixed(expand = FALSE) + 
+    labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    ) + 
+    facet_wrap(~model) +
+    ggtitle("Average confusion matrix accross folds (column normalized)")
+
+ggsave(filename = "confusion_matrices_col_perc.pdf", width = 7, height = 7)
+ggsave(filename = "confusion_matrices_col_perc.png", width = 7, height = 7)
+```
+
+
+
+### Accuracy
+
+```{r error-scores-stratified, fig.align='center'}
+df_predictions %>%
+  filter(model != "locf") %>% 
+  group_by(model, fold, GOSE) %>% 
+  summarize(
+    RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
+    MAE  = mean(abs(GOSE - prediction), na.rm = TRUE),
+    bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE) 
+  ) %>% 
+  gather(error, value, -model, -fold, -GOSE) %>% 
+  group_by(GOSE, model, error) %>% 
+  summarize(
+      mean = mean(value),
+      sd   = sd(value)
+  ) %>% 
+  ungroup() %>% 
+  ggplot(aes(GOSE, color = model)) + 
+    geom_hline(yintercept = 0, color = "black") +
+    geom_line(aes(y = mean)) + 
+    facet_wrap(~error) +
+    scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
+    theme_bw() +
+    theme(
+      panel.grid.minor   = element_blank(),
+      panel.grid.major.x = element_blank(),
+      axis.text.x        = element_text(angle = 66, hjust = 1)
+    ) +
+    ggtitle("Summary measures by true GOSE value")
+
+ggsave(filename = "imputation_error_stratified.pdf", width = 7, height = 7)
+ggsave(filename = "imputation_error_stratified.png", width = 7, height = 7)
+```
+
+```{r error-scores, fig.align='center'}
+df_predictions %>%
+  filter(model != "locf") %>% 
+  group_by(model, fold) %>% 
+  summarize(
+    RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
+    MAE  = mean(abs(GOSE - prediction), na.rm = TRUE),
+    bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE) 
+  ) %>% 
+  ungroup %>% 
+  gather(error, value, -model, -fold) %>% 
+  ggplot(aes(model, value)) + 
+    geom_hline(yintercept = 0, color = "black") +
+    geom_boxplot() + 
+    facet_wrap(~error) +
+    scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
+    scale_x_discrete("") + 
+    theme_bw() +
+    theme(
+      panel.grid.minor   = element_blank(),
+      panel.grid.major.x = element_blank(),
+      axis.text.x        = element_text(angle = 66, hjust = 1)
+    ) +
+    ggtitle("Overall summary measures")
+
+ggsave(filename = "imputation_error.pdf", width = 7, height = 7)
+ggsave(filename = "imputation_error.png", width = 7, height = 7)
+```
+
+
+
+```{r zip-figures}
+system("zip figures.zip *.png *.pdf")
+system("rm *.png *.pdf")
+```
+
+
+# Session Info
+
+```{r session-info}
+sessionInfo()
+``` 
+
+
--- a/reports/prepare_data.Rmd
+++ b/reports/prepare_data.Rmd
+---
+title: "Extract and prepare data"
+
+statistician: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
+
+collaborator: "David Menon (dkm13@cam.ac.uk)"
+
+output: reportr::report
+
+date: "`r Sys.time()`"
+
+params:
+  datapath: "../data/v1.1"
+  max_lab_days: 3
+  seed: 42
+---
+
+
+
+```{r setup, include=FALSE}
+library(tidyverse)
+library(centertbi)
+library(describr)
+library(ggalluvial)
+
+set.seed(params$seed)
+
+df_labs     <- readRDS(sprintf('%s/df_labs.rds', params$datapath))
+df_baseline <- readRDS(sprintf('%s/df_baseline.rds', params$datapath))
+df_ctmri    <- readRDS(sprintf('%s/df_ctmri.rds', params$datapath))
+df_imaging  <- readRDS(sprintf('%s/df_imaging.rds', params$datapath))
+df_gose     <- readRDS(sprintf('%s/df_gose.rds', params$datapath))
+```
+
+
+
+
+
+# Extract data
+
+```{r}
+df_deaths <- df_baseline %>% 
+  transmute(
+    gupi, 
+    days = difftime(
+        Subject.DeathDate %>% as.character %>% ifelse(. == "", NA, .) %>% as.Date,
+        "1970-01-01", 
+        units = "days"
+      ) %>% as.numeric
+  ) %>% 
+  filter(complete.cases(.))
+
+df_baseline <- df_baseline %>% 
+  select(-Subject.DeathDate) %>% 
+  mutate(
+    # combine suspect and definite to "yes"
+    InjuryHx.EDComplEventHypoxia = factor(InjuryHx.EDComplEventHypoxia, 
+      levels = 0:2,
+      labels = c("no", "yes", "yes")
+    ),
+    # combine suspect and definite to "yes"
+    InjuryHx.EDComplEventHypotension = factor(InjuryHx.EDComplEventHypotension, 
+      levels = 0:2,
+      labels = c("no", "yes", "yes")
+    )
+  ) %>%
+  # add lab values
+  left_join(
+    df_labs %>% 
+      distinct %>% # filter duplicates
+      mutate(Labs.DLDate = as.Date(Labs.DLDate)) %>% 
+      filter(
+        difftime(Labs.DLDate, "1970-01-01", units = "days") %>% as.numeric <= params$max_lab_days
+      ) %>% 
+      group_by(gupi) %>% 
+      arrange(gupi, Labs.DLDate) %>% 
+      summarise(
+        Labs.DLHemoglobingdL = na.omit(Labs.DLHemoglobingdL)[1],
+        Labs.DLGlucosemmolL  = na.omit(Labs.DLGlucosemmolL)[1]
+      ),
+    by = "gupi"
+  ) %>% 
+  # add imaging data
+  left_join(
+    df_imaging %>% 
+      distinct() %>% 
+      mutate(
+        Imaging.MarshallCTClassification = factor(
+          Imaging.MarshallCTClassification, levels = 1:6
+        ),
+        Imaging.ExperimentDate = Imaging.ExperimentDate %>% 
+          as.character %>% ifelse(. == "", NA, .) %>% as.Date
+      ) %>% 
+      arrange(gupi, Imaging.ExperimentDate) %>% 
+      group_by(gupi) %>% 
+      summarize(
+        Imaging.MarshallCTClassification = na.omit(Imaging.MarshallCTClassification)[1]
+      ),
+    by = "gupi"
+  ) %>% 
+  # add CTMRI data
+  left_join(
+    df_ctmri %>% 
+      distinct() %>% 
+      mutate(
+        CTMRI.CTMRIDate = CTMRI.CTMRIDate %>% 
+          as.character %>% ifelse(. == "", NA, .) %>% as.Date
+      ) %>% 
+      arrange(gupi, CTMRI.CTMRIDate) %>% 
+      group_by(gupi) %>% 
+      summarize(
+        CTMRI.CTSubarachnoidHem = na.omit(CTMRI.CTSubarachnoidHem)[1],
+        CTMRI.CTExtraduralHema = na.omit(CTMRI.CTExtraduralHema)[1],
+      ) %>% 
+      mutate(
+        CTMRI.CTSubarachnoidHem = factor(CTMRI.CTSubarachnoidHem, levels = 0:3, labels = c("no", "yes", "yes", "yes")),
+        CTMRI.CTExtraduralHema = factor(CTMRI.CTExtraduralHema, levels = 0:2, labels = c("no", "yes", "yes"))
+      ),
+    by = "gupi"
+  )
+```
+
+
+
+
+
+## GOSE data
+
+```{r gose-outcomes-ambiguity}
+df_gose <- df_gose %>% 
+  distinct %>% 
+  filter(complete.cases(.)) %>% 
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>% 
+  group_by(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>% 
+  mutate(Outcomes.DerivedCompositeGOSE = factor(Outcomes.DerivedCompositeGOSE, levels = 1:8))
+``` 
+
+This results in `r nrow(df_gose)` GOSE measurements of 
+`r df_gose %>% group_by(gupi) %>% n_groups()` individuals.
+
+
+
+# Compile final datasets
+
+* exclude all patient who do not survive first 6 months (no need to impute)
+* exclude all patients with no GOSE measurement (no imputation)
+
+```{r}
+early_deaths_gupi <- df_deaths %>% 
+  filter(days <= 180 - 14) %>% 
+  .[["gupi"]]
+
+df_gose <- df_gose %>% 
+  filter(!(gupi %in% early_deaths_gupi)) %>% 
+  ungroup %>% 
+  # add confirmed deaths
+  rbind(
+    df_deaths %>% 
+      filter(days > 180) %>% 
+      rename(
+        Outcomes.DerivedCompositeGOSEDaysPostInjury = days
+      ) %>% 
+      mutate(
+        Outcomes.DerivedCompositeGOSE = 1
+      )
+  ) %>% 
+  # remove observations after death
+  group_by(gupi) %>% 
+  arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>% 
+  filter(
+    {
+      first_death <- which(Outcomes.DerivedCompositeGOSE == 1)[1] 
+      (is.na(first_death) | n() <= first_death)
+    }
+  ) %>% 
+  ungroup()
+
+df_baseline <- df_baseline %>% 
+  filter(
+    gupi %in% df_gose$gupi
+  )
+```
+
+This results in `r nrow(df_gose)` GOSE measurements of 
+`r df_gose %>% group_by(gupi) %>% n_groups()` individuals.
+
+
+
+## Plausibility check
+
+The only genuinly numerical variables are Age, Glucose_mmolL, and Hb_dL.
+All other variables are factors and may therefore not contain outliers.
+
+```{r}
+df_baseline %>% 
+  select(Subject.Age, Labs.DLGlucosemmolL, Labs.DLHemoglobingdL) %>% 
+  gather(Variable, value) %>% 
+  ggplot(aes(value)) + 
+    geom_histogram() +
+    facet_wrap(~Variable, scales = "free") +
+    theme_bw() +
+    theme(panel.grid = element_blank())
+```
+
+Glucose is obviously left-skewed and a log transfrom might improve fits in linear
+models.
+
+```{r, echo=TRUE}
+df_baseline %>% 
+  select(gupi, Labs.DLHemoglobingdL) %>% 
+  filter(Labs.DLHemoglobingdL > 50)
+```
+
+All values above 50 are considered implausible and set to missing 
+(probably meant as missing).
+
+```{r}
+df_baseline <- df_baseline %>% 
+  mutate(
+    Labs.DLHemoglobingdL = ifelse(Labs.DLHemoglobingdL > 50, NA_real_, Labs.DLHemoglobingdL),
+    Labs.DLGlucosemmolL_log = log(Labs.DLGlucosemmolL)
+  ) %>% 
+  select(-Labs.DLGlucosemmolL)
+```
+
+
+```{r save}
+saveRDS(df_gose, "df_gose.rds")
+saveRDS(df_baseline, "df_baseline.rds")
+```
+
+
+
+
+# Describe data
+
+## Baseline covariates
+
+```{r, fig.height=8, fig.width=3.25, out.width=".45\\textwidth", warning=FALSE, message=FALSE}
+thm <- theme_default(6)
+thm$body$descriptor$style$plot_cell$plot_height <- unit(.1, "in")
+thm$colwidths$levels      <- unit(1, "in")
+thm$colwidths$variables   <- unit(1.15, "in")
+
+
+df_baseline %>% 
+  select(-gupi) %>% 
+  describr(
+    theme_new = thm, 
+    totals = TRUE
+  ) %>%
+  describe_if(
+    is.numeric,
+    with = list(
+      dscr_mean_sd(),
+      dscr_median_q1_q3(),
+      dscr_min_max(),
+      dscr_histogram()
+    )
+  ) %>%
+  describe_if(
+    is.factor,
+    with = list(dscr_freq(), dscr_factor_barchart())
+  ) %>% 
+  fit_to_height(height = 8)
+```
+
+
+
+## GOSE over time
+
+GOSE times are rounded to the nearest 3 months period (averaged/rounded if multiple
+values present).
+
+```{r alluvial-data}
+df_gose %>% 
+  # compute GOSE per quarter
+  transmute(
+    gupi,
+    quarter = round(Outcomes.DerivedCompositeGOSEDaysPostInjury/90),
+    Outcomes.DerivedCompositeGOSE = as.numeric(as.character(Outcomes.DerivedCompositeGOSE))
+  ) %>% 
+  group_by(gupi, quarter) %>% 
+  summarise(
+    GOSE = round(mean(Outcomes.DerivedCompositeGOSE)) %>% ifelse(. == 2, 1, .)
+  ) %>% 
+  ungroup() %>%
+  filter(
+    quarter %in% c(1, 2, 4)
+  ) %>% 
+  spread(quarter, GOSE) %>% 
+  gather(quarter, GOSE, `1`, `2`, `4`) %>% 
+  mutate(
+    GOSE = factor(GOSE, levels = 8:1) %>% forcats::fct_explicit_na(),
+    Timepoint = factor(quarter, levels = c("1", "2", "4"), labels = c("3 months", "6 months", "12 months"))
+  ) ->
+df_alluvial
+```
+
+```{r plot-alluvial-gradient, fig.height=4.5}
+df_alluvial %>% 
+  ggplot(aes(x = Timepoint, stratum = GOSE, alluvium = gupi, label = GOSE, fill = GOSE)) +
+    geom_flow(aes.flow = "backward") + 
+    geom_stratum() +
+    scale_fill_manual(
+      values = c(viridisLite::viridis(7, direction = -1), "grey"), 
+      breaks = c(8:3, 1, "(Missing)")
+    ) +
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    )
+
+ggsave("gose_alluvial_gradient_coloring.pdf", height = 5, width = 8)
+ggsave("gose_alluvial_gradient_coloring.png", height = 5, width = 8)
+```
+
+```{r plot-alluvial-differential, fig.height=4.5}
+df_alluvial %>% 
+  ggplot(aes(x = Timepoint, stratum = GOSE, alluvium = gupi, label = GOSE, fill = GOSE)) +
+    geom_flow(aes.flow = "backward") + 
+    geom_stratum() +
+    scale_fill_manual(
+      values = c(RColorBrewer::brewer.pal(7, "Set3"), "grey"), 
+      breaks = c(8:3, 1, "(Missing)")
+    ) +
+    theme_bw() +
+    theme(
+      panel.grid = element_blank()
+    )
+
+ggsave("gose_alluvial_differential_coloring.pdf", height = 5, width = 8)
+ggsave("gose_alluvial_differential_coloring.png", height = 5, width = 8)
+```
+
+
+
+```{r zip-figures}
+system("zip figures.zip *.png *.pdf")
+system("rm *.png *.pdf")
+```
+
+
+
+
+# Session Info
+
+```{r session-info}
+sessionInfo()
+``` 
--- a/reports/references.bib
+++ b/reports/references.bib
+@article{msm2011,
+author = {Jackson, Christopher H.},
+doi = {10.18637/jss.v038.i08},
+journal = {Journal of Statistical Software},
+month = {jan},
+number = {8},
+pages = {1--28},
+title = {{Multi-State Models for Panel Data: The msm Package for R}},
+volume = {38},
+year = {2011}
+}
+@article{r2016,
+archivePrefix = {arXiv},
+arxivId = {arXiv:1011.1669v3},
+author = {Team, R Development Core and {R Development Core Team}, R},
+doi = {10.1007/978-3-540-74686-7},
+eprint = {arXiv:1011.1669v3},
+isbn = {3{\_}900051{\_}00{\_}3},
+issn = {3-900051-07-0},
+journal = {R Foundation for Statistical Computing},
+pmid = {16106260},
+title = {{R: A Language and Environment for Statistical Computing}},
+year = {2016}
+}
+@book{rasmussen2006,
+abstract = {Gaussian processes (GPs) are natural generalisations of multivariate Gaussian random variables to infinite (countably or continuous) index sets. GPs have been applied in a large number of fields to a diverse range of ends, and very many deep theoretical analyses of various properties are available. This paper gives an introduction to Gaussian processes on a fairly elementary level with special emphasis on characteristics relevant in machine learning. It draws explicit connections to branches such as spline smoothing models and support vector machines in which similar ideas have been investigated. Gaussian process models are routinely used to solve hard machine learning problems. They are attractive because of their flexible non-parametric nature and computational simplicity. Treated within a Bayesian framework, very powerful statistical methods can be implemented which offer valid estimates of uncertainties in our predictions and generic model selection procedures cast as nonlinear optimization problems. Their main drawback of heavy computational scaling has recently been alleviated by the introduction of generic sparse approximations.13,78,31 The mathematical literature on GPs is large and often uses deep concepts which are not required to fully understand most machine learning applications. In this tutorial paper, we aim to present characteristics of GPs relevant to machine learning and to show up precise connections to other "kernel machines" popular in the community. Our focus is on a simple presentation, but references to more detailed sources are provided.},
+archivePrefix = {arXiv},
+arxivId = {026218253X},
+author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
+booktitle = {International Journal of Neural Systems},
+doi = {10.1142/S0129065704001899},
+eprint = {026218253X},
+isbn = {026218253X},
+issn = {0129-0657},
+pmid = {15112367},
+title = {{Gaussian Processes for Machine Learning}},
+year = {2006}
+}
+@article{stan2017,
+author = {Carpenter, B and Gelman, A and Hoffman, MD and Lee, D},
+journal = {Journal of statistical software},
+number = {1},
+title = {{Stan: A Probabilistic Programming Language}},
+volume = {76},
+year = {2017}
+}
+@book{Agresti2003,
+author = {Agresti, Alan},
+editor = {{John Wiley {\&} Sons}},
+title = {{Categorical data analysis}},
+year = {2003}
+}
+@article{brms2018,
+author = {B{\"{u}}rkner, Paul},
+journal = {The R Journal},
+number = {1},
+pages = {395--411},
+title = {{Advanced Bayesian Multilevel Modeling with the R Package brms}},
+volume = {10},
+year = {2018}
+}
+@article{brms2017,
+author = {B{\"{u}}rkner, Paul},
+journal = {Journal Of Statistical Software},
+number = {1},
+pages = {1--28},
+title = {{brms: An R Package for Bayesian Multilevel Models Using Stan}},
+volume = {80},
+year = {2017}
+}
+@article{Steyerberg2008,
+author = {Steyerberg, Ewout W and Mushkudiani, Nino and Perel, Pablo and Butcher, Isabella and Lu, Juan and McHugh, Gillian S and Murray, Gordon D and Marmarou, Anthony and Roberts, Ian and Habbema, J. Dik F and Maas, Andrew I. R},
+doi = {10.1371/journal.pmed.0050165},
+editor = {Singer, Mervyn},
+journal = {PLoS Medicine},
+month = {aug},
+number = {8},
+pages = {e165},
+title = {{Predicting Outcome after Traumatic Brain Injury: Development and International Validation of Prognostic Scores Based on Admission Characteristics}},
+url = {https://dx.plos.org/10.1371/journal.pmed.0050165},
+volume = {5},
+year = {2008}
+}
--- a/scripts/generate_imputation_data.R
+++ b/scripts/generate_imputation_data.R
+library(tidyverse)
+
+args     <- commandArgs(trailingOnly = TRUE)
+
+data_dir <- args[[1]]
+m        <- as.numeric(args[[2]])
+
+df_gose <- readRDS(sprintf("%s/df_gose.rds", data_dir))
+
+for (i in 1:m) {
+
+  df_baseline <- readRDS(sprintf("%s/mi_baseline/df_baseline_mi_%i.rds", data_dir, i))
+
+  df <- df_gose %>%
+    left_join(df_baseline, by = "gupi")
+
+  saveRDS(df, sprintf("%s/imputation/df_mi_%i.rds", data_dir, i))
+
+}
--- a/scripts/generate_validation_data.R
+++ b/scripts/generate_validation_data.R
+library(tidyverse)
+
+args     <- commandArgs(trailingOnly = TRUE)
+
+data_dir <- args[[1]]
+m        <- as.numeric(args[[2]])
+k        <- as.numeric(args[[3]])
+seed     <- as.integer(args[[4]])
+
+set.seed(seed)
+
+df_gose <- readRDS(sprintf("%s/df_gose.rds", data_dir))
+
+# get ids of all subjects with a valid 6mo GOSE
+test_candidate_ids <- df_gose %>%
+  group_by(gupi) %>%
+  filter(
+    # at least one non-death observation close to 6 months
+    any(
+      (Outcomes.DerivedCompositeGOSEDaysPostInjury >= 180 - 14) &
+      (Outcomes.DerivedCompositeGOSE != 1) &
+      (Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180 + 14)
+    ),
+    # at least one observation not close to 6 months
+    any(
+      (Outcomes.DerivedCompositeGOSEDaysPostInjury < 180 - 14) |
+      (Outcomes.DerivedCompositeGOSEDaysPostInjury > 180 + 14)
+    )
+  ) %>%
+  .[["gupi"]] %>%
+  unique %>%
+  as.character
+
+df_validation <- tibble(
+  gupi       = df_gose$gupi %>% unique,
+  valid_test = gupi %in% test_candidate_ids
+) %>%
+  group_by(
+    valid_test
+  ) %>%
+  mutate(
+    fold = sample(rep(1:k, ceiling(n()/k)))[1:n()]
+  ) %>%
+  ungroup
+
+# saveRDS(df_validation, "df_validation_splits.rds")
+
+for (i in 1:m) {
+
+  df_baseline <- readRDS(sprintf("%s/mi_baseline/df_baseline_mi_%i.rds", data_dir, i))
+
+  for (j in 1:k) {
+
+    df_test_rows <- df_gose %>%
+      left_join(df_baseline, by = "gupi") %>%
+      left_join(df_validation, by = "gupi") %>%
+      filter(fold == j & valid_test) %>%
+      group_by(gupi) %>%
+      filter(
+        Outcomes.DerivedCompositeGOSEDaysPostInjury ==
+          Outcomes.DerivedCompositeGOSEDaysPostInjury[which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))]) %>%
+      ungroup() %>%
+      select(
+        gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
+      ) %>%
+      mutate(test = TRUE) %>%
+      right_join(
+        df_gose %>% select(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury),
+        by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")
+      ) %>%
+      mutate(test = ifelse(is.na(test), FALSE, test))
+
+    df_test <- df_gose %>%
+      left_join(df_test_rows, by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")) %>%
+      filter(test) %>%
+      select(-test) %>%
+      left_join(df_baseline, by = "gupi")
+
+    df_train <- df_gose %>%
+      left_join(df_test_rows, by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")) %>%
+      filter(!test) %>%
+      select(-test) %>%
+      left_join(df_baseline, by = "gupi")
+
+    # plausi checks
+    if (nrow(df_test) + nrow(df_train) != nrow(df_gose)) stop("test/train does not add up")
+    if (any(df_test$Outcomes.DerivedCompositeGOSE == 1)) stop("test cases cannot be GOSE 1")
+
+    saveRDS(df_test, sprintf("%s/validation/df_test_mi_%i_fold_%i.rds", data_dir, i, j))
+    saveRDS(df_train, sprintf("%s/validation/df_train_mi_%i_fold_%i.rds", data_dir, i, j))
+
+  }
+}
--- a/scripts/import_neurobot_data.R
+++ b/scripts/import_neurobot_data.R
+library(tidyverse)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+in_folder  <- args[[1]]
+out_folder <- args[[2]]
+
+in_files <- sprintf("%s/%s", in_folder, list.files(path = in_folder, pattern = "*.csv"))
+
+if (length(in_files) != 5) stop("must have exactly 5 input files")
+
+for (f in in_files) {
+  tmp <- read.csv(f)
+  if ("Subject.Age" %in% names(tmp))
+    saveRDS(as_tibble(tmp), file = sprintf("%s/df_baseline.rds", out_folder))
+  if ("Outcomes.DerivedCompositeGOSE" %in% names(tmp))
+    saveRDS(as_tibble(tmp), file = sprintf("%s/df_gose.rds", out_folder))
+  if ("Labs.DLDate" %in% names(tmp))
+    saveRDS(as_tibble(tmp), file = sprintf("%s/df_labs.rds", out_folder))
+  if ("Imaging.MarshallCTClassification" %in% names(tmp))
+    saveRDS(as_tibble(tmp), file = sprintf("%s/df_imaging.rds", out_folder))
+  if ("CTMRI.CTSubarachnoidHem" %in% names(tmp))
+    saveRDS(as_tibble(tmp), file = sprintf("%s/df_ctmri.rds", out_folder))
+}
--- a/scripts/impute_baseline.R
+++ b/scripts/impute_baseline.R
+library(tidyverse)
+library(mice)
+
+args <- commandArgs(trailingOnly = TRUE)
+
+df_baseline <- readRDS(args[[1]])
+out_dir <- args[[2]]
+m <- as.numeric(args[[3]])
+maxiter <- as.numeric(args[[4]])
+seed <- as.numeric(args[[5]])
+
+mi_baseline <- df_baseline %>%
+  select(-gupi) %>%
+  mutate_if(is.character, factor) %>%
+  mice::mice(
+    m     = m,
+    seed  = seed,
+    maxit = maxiter
+  )
+
+# save the individual complete data sets
+for (i in 1:m) {
+  mice::complete(mi_baseline, action = i, include = FALSE) %>%
+    mutate_if(is.factor, as.character) %>%
+    as_tibble() %>%
+    mutate(
+      gupi = df_baseline$gupi
+    ) %>%
+    select(gupi, everything()) %>%
+    saveRDS(file = sprintf("%s/df_baseline_mi_%i.rds", out_dir, i))
+}
--- a/scripts/post_process_imputations.R
+++ b/scripts/post_process_imputations.R
+library(tidyverse)
+
+args                 <- commandArgs(trailingOnly = TRUE)
+
+modelimputations_dir <- args[[1]]
+gosefile             <- args[[2]]
+outputfile           <- args[[3]]
+
+set.seed(42)
+
+df_gose              <- readRDS(gosefile)
+
+posteriors <- list.files(modelimputations_dir, pattern = "*.rds")
+
+# load and combine all posteriros over different (baseline) imputations
+df_imputations <- c()
+for (i in 1:length(posteriors)) {
+  df_imputations <- rbind(
+    df_imputations,
+    readRDS(sprintf("%s/%s", modelimputations_dir, posteriors[i])) %>%
+      mutate(
+        imp = i
+      )
+  )
+}
+df_imputations <- df_imputations %>%
+  group_by(gupi, t, GOSE) %>%
+  summarize(probability = mean(probability, na.rm = TRUE)) %>%
+  ungroup
+
+
+
+
+df_imputations <- df_imputations %>%
+  select(-t) %>%
+  filter(complete.cases(.)) %>%
+  group_by(gupi) %>%
+  mutate(
+    Subject.DerivedImputed180DaysGOSE = which.max(probability) %>%
+      {
+        if (length(.) == 0) { # fix all probs. NA case
+          return(NA)
+        } else {
+          return(.)
+        }
+      } %>%
+      sample(1) %>% # need to resolve ambiguous gose predictions !
+      rep(n()),
+    GOSE = factor(
+      GOSE,
+      levels = 1:8,
+      labels = sprintf("Subject.DerivedImputed180DaysGOSE_%i_probability", 1:8)
+    )
+  ) %>%
+  ungroup %>%
+  spread(GOSE, probability)
+
+# extract per protocol gose observations
+df_per_protocol_gose <- df_gose %>%
+  filter(
+    Outcomes.DerivedCompositeGOSEDaysPostInjury >= 5*30,
+    Outcomes.DerivedCompositeGOSEDaysPostInjury <= 8*30
+  ) %>%
+  # pick closest to 180
+  group_by(gupi) %>%
+  summarize(
+    closest_per_protocol_GOSE = Outcomes.DerivedCompositeGOSE[
+      which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))
+    ]
+  )
+
+df_imputations <- df_imputations %>%
+  left_join(df_per_protocol_gose, by = "gupi") %>%
+  mutate(
+    Subject.GOSE6monthEndpointDerived = ifelse(
+      is.na(closest_per_protocol_GOSE),
+      Subject.DerivedImputed180DaysGOSE,
+      closest_per_protocol_GOSE
+    )
+  ) %>%
+  select(
+    gupi,
+    Subject.GOSE6monthEndpointDerived,
+    Subject.DerivedImputed180DaysGOSE,
+    everything()
+  ) %>%
+  select(
+    -closest_per_protocol_GOSE
+  )
+
+write_csv(df_imputations, outputfile)