Commit 97737a15 authored by Kevin's avatar Kevin

initial commit

parents
output
.snakemake
.Rproj.user
.cache
*.Rproj
*.rds
*.Rhistory
\ No newline at end of file
singularity: "docker://kkmann/gose-6mo-imputation@sha256:02a8f55c53d6f4917193abda823938e1ad7e8891fd6c392bd94f509e058eb34b"
configfile: "config.yml"
rule import_neurobot_csv:
output:
"data/{version}/df_baseline.rds",
"data/{version}/df_ctmri.rds",
"data/{version}/df_imaging.rds",
"data/{version}/df_labs.rds",
"data/{version}/df_gose.rds"
shell:
"""
Rscript scripts/import_neurobot_data.R data/{wildcards.version} data/{wildcards.version}
"""
rule prepare_data:
input:
rules.import_neurobot_csv.output,
markdown = "reports/prepare_data.Rmd"
output:
"output/{version}/data/df_gose.rds",
"output/{version}/data/df_baseline.rds",
"output/{version}/prepare_data.pdf",
figures = "output/{version}/prepare_data_figures.zip"
shell:
"""
mkdir -p output/{wildcards.version}/data
Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(datapath = '../data/{wildcards.version}', max_lab_days = {config[max_lab_days]}, seed = {config[seed]}))"
mv reports/*.rds output/{wildcards.version}/data
mv reports/figures.zip {output.figures}
"""
rule impute_baseline:
input:
rules.prepare_data.output
output:
["output/{version}/data/mi_baseline/df_baseline_mi_%i.rds" % i for i in range(1, config["mi_m"] + 1)]
shell:
"""
mkdir -p output/{wildcards.version}/data/mi_baseline
Rscript scripts/impute_baseline.R output/{wildcards.version}/data/df_baseline.rds output/{wildcards.version}/data/mi_baseline {config[mi_m]} {config[mi_maxiter]} {config[seed]}
"""
rule generate_validation_data:
input:
rules.prepare_data.output,
rules.impute_baseline.output
output:
["output/{version}/data/validation/df_%s_mi_%i_fold_%i.rds" % (s, i, j)
for s in ("train", "test")
for i in range(1, config["mi_m"] + 1)
for j in range(1, config["folds"] + 1)
]
shell:
"""
mkdir -p output/{wildcards.version}/data/validation
Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
"""
# adjust threads by model type
def get_rule_threads(wildcards):
if wildcards.model in ("locf", "msm"):
return 1
else:
return config["stan"]["chains"]
rule fit_model_validation_set:
input:
"config.yml",
"output/{version}/data/validation/df_train_mi_{i}_fold_{j}.rds"
output:
"output/{version}/data/validation/posteriors/{model}/df_posterior_mi_{i}_fold_{j}.rds"
threads:
get_rule_threads
shell:
"""
mkdir -p output/{wildcards.version}/data/validation/posteriors/{wildcards.model}
Rscript models/{wildcards.model}/fit.R {input[1]} {output}
"""
# helper rule to just build all posterior datasets
rule model_posteriors:
input:
["output/v1.1/data/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds" % (m, i, j)
for m in ("locf", "msm", "gp", "gp_nb", "mm", "mm_nb")
for i in range(1, config["mi_m"] + 1)
for j in range(1, config["folds"] + 1)
]
rule model_assessment:
input:
pop_report = rules.prepare_data.output,
posteriors = rules.model_posteriors.input,
markdown = "reports/model_assessment.Rmd"
output:
pdf = "output/{version}/model_assessment.pdf",
figures = "output/{version}/model_assessment_figures.zip"
shell:
"""
mkdir -p output/{wildcards.version}
Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(data_dir = '../output/{wildcards.version}/data', config_file = '../config.yml'))"
mv reports/figures.zip {output.figures}
"""
rule generate_imputation_data:
input:
rules.prepare_data.output,
rules.impute_baseline.output
output:
["output/{version}/data/imputation/df_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1)
]
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation
Rscript scripts/generate_imputation_data.R output/{wildcards.version}/data {config[mi_m]}
"""
# rules for imputing on entire dataset
rule model_impute:
input:
"config.yml",
"output/{version}/data/imputation/df_mi_{i}.rds"
output:
"output/{version}/data/imputation/{model}/df_gose_imputed_mi_{i}.rds"
threads:
get_rule_threads
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
Rscript models/{wildcards.model}/fit.R {input[1]} {output}
"""
# final reported values are a combination of the imputed and per-protocol
# observed ones
rule post_process_imputations:
input:
"config.yml",
["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1)
]
output:
"output/{version}/data/imputation/{model}/df_gose_imputed.csv"
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output}
"""
seed:
42
max_lab_days:
3
mi_m:
2
mi_maxiter:
5
folds:
2
t_out:
180
t_buffer:
14
t_out_msm:
180
maxiter_msm:
20
fnscale_msm:
12000
stan:
iter: 10
warmup: 10
chains: 2
adapt_delta: .95
max_treedepth: 10
gp:
t_knots: [45, 90, 180, 270, 360]
#!/usr/bin/env Rscript
library(tidyverse)
library(rstan)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
modelfile <- "models/gp/gp.stan"
config <- yaml::read_yaml("config.yml")
df <- readRDS(inputfile)
model_matrix_baseline <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
select(
-Outcomes.DerivedCompositeGOSEDaysPostInjury, -Outcomes.DerivedCompositeGOSE
) %>%
summarize_all(first) %>%
model.matrix(~ . - gupi - 1, data = .)
df_start_end_index <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(
tmp = row_number()
) %>%
group_by(gupi) %>%
summarize(
start_row = first(tmp),
end_row = last(tmp)
)
df_gose <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
transmute(
gupi = gupi %>% factor %>% as.numeric,
Outcomes.DerivedCompositeGOSE = as.numeric(Outcomes.DerivedCompositeGOSE),
Outcomes.DerivedCompositeGOSEDaysPostInjury = Outcomes.DerivedCompositeGOSEDaysPostInjury
)
stan_data <- list(
N = nrow(df),
n_pat = df_gose$gupi %>% unique %>% length,
pat_start_ind = df_start_end_index$start_row,
pat_end_ind = df_start_end_index$end_row,
t = df_gose$Outcomes.DerivedCompositeGOSEDaysPostInjury,
y = df_gose$Outcomes.DerivedCompositeGOSE,
id = df_gose$gupi,
modelmatrix = model_matrix_baseline,
p_cov = ncol(model_matrix_baseline),
t_out_ = config$t_out %>% as.matrix,
n_t_out = length(config$t_out),
knots = config$gp$t_knots,
n_knots = length(config$gp$t_knots)
)
seed <- digest::digest(stan_data, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
rstan::rstan_options(auto_write = TRUE)
mdl <- rstan::stan(
file = modelfile,
data = stan_data,
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
)
)
df_posteriors <- rstan::extract(mdl)$gose_out %>% # (n_sample, n_pat, n_t)
# relative frequencies
apply(
MARGIN = c(2, 3),
function(x) table(factor(x, levels = 1:8)) / length(x) %>% as.numeric()
) %>% # (8, n_pat, n_t) -> permute
aperm(c(2, 1, 3)) %>%
as.data.frame %>%
as_tibble %>%
mutate(
gupi = row_number()
) %>%
gather(GOSE.t, probability, -gupi) %>%
separate(col = GOSE.t, c("GOSE", "t"), sep = "\\.", convert = TRUE) %>%
mutate(
t = stan_data$t_out_[t, 1],
gupi = (df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
.[["gupi"]] %>%
unique)[gupi]
) %>%
select(gupi, t, GOSE, probability) %>%
arrange(gupi, t)
saveRDS(df_posteriors, outputfile)
data {
// total number of observations
int<lower=1> N;
int<lower=1> n_pat;
// start / stop index per patient
int<lower=1, upper=N> pat_start_ind[n_pat];
int<lower=1, upper=N> pat_end_ind[n_pat];
// actual data:
real t[N]; // sampling times
real y[N]; // outcomes
int id[N]; // id as integers
int<lower=1> n_knots; // number of knots to model population mean
real<lower=0.0> knots[n_knots]; // knot positions
int n_t_out; // number of output points
real t_out_[n_t_out, 1]; // output times
int p_cov; // number covariates (model matrix)
matrix[n_pat, p_cov] modelmatrix;
}
transformed data {
real t_out[n_t_out] = t_out_[1:n_t_out, 1];
}
parameters {
// standard deviation of the population mean process
real<lower=0> s_pop_mean;
// lengthscale of the population mean process
// real<lower=0> l_pop_mean;
real<lower=.5, upper=8.5> knots_y[n_knots];
// standard deviation of the population mean process
real<lower=0> s;
// lengthscale of the population mean process
real<lower=30, upper=120> l;
// nugget term standard deviation
real<lower=.01> s_nugg;
vector[p_cov] beta;
}
transformed parameters {
matrix[n_knots, n_knots] K_knots_inv = inverse(cov_exp_quad(knots, s_pop_mean, l));
vector[n_pat] lin_pred = modelmatrix * beta;
}
model {
// priors
s ~ normal(4, 3);
s_pop_mean ~ normal(1, 1);
l ~ normal(60, 14);
s_nugg ~ normal(0, .1);
beta ~ normal(0, 1);
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_knots] K_latent_12 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = K_latent_12 * K_knots_inv * to_vector(knots_y);
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
matrix[n_sub, n_sub] Sigma = cov_exp_quad(t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub));
gose ~ multi_normal(
mu_sub + rep_vector(lin_pred[i], n_sub),
Sigma
);
}
}
generated quantities {
real gose_out[n_pat, n_t_out];
row_vector[n_t_out] tmp;
matrix[n_t_out, n_t_out] Sigma;
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_sub] K_oo_inv = inverse(cov_exp_quad(t_sub, t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub)));
matrix[n_t_out, n_sub] K_no = cov_exp_quad(t_out, t_sub, s, l);
matrix[n_t_out, n_knots] K_latent_12 = cov_exp_quad(t_out, knots, s_pop_mean, l);
vector[n_t_out] mu_out = rep_vector(lin_pred[i], n_t_out) + K_latent_12 * K_knots_inv * to_vector(knots_y);
matrix[n_sub, n_knots] K_latent_12_2 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = rep_vector(lin_pred[i], n_sub) + K_latent_12_2 * K_knots_inv * to_vector(knots_y);
// observed values
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
// posterior covariance matrix
Sigma = cov_exp_quad(t_out, t_out, s, l) - K_no * K_oo_inv * K_no' + diag_matrix(rep_vector(.001, n_t_out));
tmp = multi_normal_rng(
mu_out + K_no * K_oo_inv * (gose - mu_sub),
Sigma
)';
// discretize
for (j in 1:n_t_out) {
gose_out[i, j] = round(tmp[j]);
if (gose_out[i, j] > 8) {
gose_out[i, j] = 8;
}
if (gose_out[i, j] < 1) {
gose_out[i, j] = 1;
}
}
}
}
#!/usr/bin/env Rscript
library(tidyverse)
library(rstan)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
modelfile <- "models/gp_nb/gp_nb.stan"
config <- yaml::read_yaml("config.yml")
df <- readRDS(inputfile)
model_matrix_baseline <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
select(
-Outcomes.DerivedCompositeGOSEDaysPostInjury, -Outcomes.DerivedCompositeGOSE
) %>%
summarize_all(first) %>%
model.matrix(~ . - gupi - 1, data = .)
df_start_end_index <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(
tmp = row_number()
) %>%
group_by(gupi) %>%
summarize(
start_row = first(tmp),
end_row = last(tmp)
)
df_gose <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
transmute(
gupi = gupi %>% factor %>% as.numeric,
Outcomes.DerivedCompositeGOSE = as.numeric(Outcomes.DerivedCompositeGOSE),
Outcomes.DerivedCompositeGOSEDaysPostInjury = Outcomes.DerivedCompositeGOSEDaysPostInjury
)
stan_data <- list(
N = nrow(df),
n_pat = df_gose$gupi %>% unique %>% length,
pat_start_ind = df_start_end_index$start_row,
pat_end_ind = df_start_end_index$end_row,
t = df_gose$Outcomes.DerivedCompositeGOSEDaysPostInjury,
y = df_gose$Outcomes.DerivedCompositeGOSE,
id = df_gose$gupi,
modelmatrix = model_matrix_baseline,
p_cov = ncol(model_matrix_baseline),
t_out_ = config$t_out %>% as.matrix,
n_t_out = length(config$t_out),
knots = config$gp$t_knots,
n_knots = length(config$gp$t_knots)
)
seed <- digest::digest(stan_data, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
rstan::rstan_options(auto_write = TRUE)
mdl <- rstan::stan(
file = modelfile,
data = stan_data,
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
)
)
df_posteriors <- rstan::extract(mdl)$gose_out %>% # (n_sample, n_pat, n_t)
# relative frequencies
apply(
MARGIN = c(2, 3),
function(x) table(factor(x, levels = 1:8)) / length(x) %>% as.numeric()
) %>% # (8, n_pat, n_t) -> permute
aperm(c(2, 1, 3)) %>%
as.data.frame %>%
as_tibble %>%
mutate(
gupi = row_number()
) %>%
gather(GOSE.t, probability, -gupi) %>%
separate(col = GOSE.t, c("GOSE", "t"), sep = "\\.", convert = TRUE) %>%
mutate(
t = stan_data$t_out_[t, 1],
gupi = (df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
.[["gupi"]] %>%
unique)[gupi]
) %>%
select(gupi, t, GOSE, probability) %>%
arrange(gupi, t)
saveRDS(df_posteriors, outputfile)
data {
// total number of observations
int<lower=1> N;
int<lower=1> n_pat;
// start / stop index per patient
int<lower=1, upper=N> pat_start_ind[n_pat];
int<lower=1, upper=N> pat_end_ind[n_pat];
// actual data:
real t[N]; // sampling times
real y[N]; // outcomes
int id[N]; // id as integers
int<lower=1> n_knots; // number of knots to model population mean
real<lower=0.0> knots[n_knots]; // knot positions
int n_t_out; // number of output points
real t_out_[n_t_out, 1]; // output times
int p_cov; // number covariates (model matrix)
matrix[n_pat, p_cov] modelmatrix;
}
transformed data {
real t_out[n_t_out] = t_out_[1:n_t_out, 1];
}
parameters {
// standard deviation of the population mean process
real<lower=0> s_pop_mean;
// lengthscale of the population mean process
// real<lower=0> l_pop_mean;
real<lower=.5, upper=8.5> knots_y[n_knots];
// standard deviation of the population mean process
real<lower=0> s;
// lengthscale of the population mean process
real<lower=30, upper=120> l;
// nugget term standard deviation
real<lower=.01> s_nugg;
}
transformed parameters {
matrix[n_knots, n_knots] K_knots_inv = inverse(cov_exp_quad(knots, s_pop_mean, l));
}
model {
// priors
s ~ normal(4, 3);
s_pop_mean ~ normal(1, 1);
l ~ normal(60, 14);
s_nugg ~ normal(0, .1);
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_knots] K_latent_12 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = K_latent_12 * K_knots_inv * to_vector(knots_y);
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
matrix[n_sub, n_sub] Sigma = cov_exp_quad(t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub));
gose ~ multi_normal(
mu_sub,
Sigma
);
}
}
generated quantities {
real gose_out[n_pat, n_t_out];
row_vector[n_t_out] tmp;
matrix[n_t_out, n_t_out] Sigma;
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_sub] K_oo_inv = inverse(cov_exp_quad(t_sub, t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub)));
matrix[n_t_out, n_sub] K_no = cov_exp_quad(t_out, t_sub, s, l);
matrix[n_t_out, n_knots] K_latent_12 = cov_exp_quad(t_out, knots, s_pop_mean, l);
vector[n_t_out] mu_out = K_latent_12 * K_knots_inv * to_vector(knots_y);
matrix[n_sub, n_knots] K_latent_12_2 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = K_latent_12_2 * K_knots_inv * to_vector(knots_y);
// observed values
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
// posterior covariance matrix
Sigma = cov_exp_quad(t_out, t_out, s, l) - K_no * K_oo_inv * K_no' + diag_matrix(rep_vector(.001, n_t_out));
tmp = multi_normal_rng(
mu_out + K_no * K_oo_inv * (gose - mu_sub),
Sigma
)';
// discretize
for (j in 1:n_t_out) {
gose_out[i, j] = round(tmp[j]);
if (gose_out[i, j] > 8) {
gose_out[i, j] = 8;
}
if (gose_out[i, j] < 1) {
gose_out[i, j] = 1;
}
}
}
}
#!/usr/bin/env Rscript
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# read and process input data
df <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = c(1, 3:8), # 2 is too infrequent to be fitted
ordered = TRUE
)
)
impute_locf <- function(t, GOSE, tolerance, t_pred) {
res <- numeric(length(t_pred))
res[] <- NA_integer_
for (i in 1:length(t_pred)) {
res[i] <- GOSE[t < t_pred[i] + tolerance] %>%
tail(1) %>% # use last if multiple available
ifelse(length(.) == 0, NA, .)
}
return(res %>% as.integer)
}
df_posteriors <- df %>%
group_by(gupi) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSE) %>%
do(
t = config$t_out,
GOSE_predicted = impute_locf(.$Outcomes.DerivedCompositeGOSEDaysPostInjury, .$Outcomes.DerivedCompositeGOSE %>% as.character %>% as.integer, config$t_buffer, config$t_out)
) %>%
unnest %>%
mutate(
`1` = (GOSE_predicted == 1) %>% as.numeric,
`2` = (GOSE_predicted == 2) %>% as.numeric,
`3` = (GOSE_predicted == 3) %>% as.numeric,
`4` = (GOSE_predicted == 4) %>% as.numeric,
`5` = (GOSE_predicted == 5) %>% as.numeric,
`6` = (GOSE_predicted == 6) %>% as.numeric,
`7` = (GOSE_predicted == 7) %>% as.numeric,
`8` = (GOSE_predicted == 8) %>% as.numeric
) %>%
select(-GOSE_predicted) %>%
gather(GOSE, probability, `1`:`8`) %>%
mutate(
gupi = as.character(gupi),
GOSE = as.integer(GOSE)
) %>%
arrange(gupi, t, GOSE)
saveRDS(df_posteriors, outputfile)
#!/usr/bin/env Rscript
library(tidyverse)
library(brms)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# read and process input data
df <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = Outcomes.DerivedCompositeGOSE %>%
unique %>%
as.numeric %>%
sort %>%
as.character,
ordered = TRUE
)
) %>%
mutate_if(is.character, factor)
# generate random seed from first 5 digits of the sha1 hash value
seed <- digest::digest(df, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
formula <- Outcomes.DerivedCompositeGOSE ~
s(Outcomes.DerivedCompositeGOSEDaysPostInjury) +
(I(Outcomes.DerivedCompositeGOSEDaysPostInjury^2) + Outcomes.DerivedCompositeGOSEDaysPostInjury + 1 | gupi) +
. - gupi - Outcomes.DerivedCompositeGOSEDaysPostInjury - Outcomes.DerivedCompositeGOSE
mdl <- brms::brm(
formula,
data = df,
family = brms::cumulative("logit", threshold = "flexible"),
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
),
save_warmup = FALSE
)
df_new_data <- expand.grid(
gupi = df$gupi %>% unique,
Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out
) %>%
as_tibble %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
left_join(
df %>%
select(-Outcomes.DerivedCompositeGOSE, -Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
summarize_all(first) %>%
ungroup,
by = c("gupi")
)
df_posteriors <- predict(mdl, df_new_data, summary = TRUE) %>%
as_tibble() %>%
mutate(
gupi = df_new_data$gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury = df_new_data$Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>% {
# since not all levels are observed/fitted, need to extend predictions
# by zero for those categories, dplyr black magic
cols <- sprintf("P(Y = %s)", 1:8)
missing_cols <- cols[!(cols %in% names(as_tibble(.)))]
res <- .
for (newcol in missing_cols) {
# add missing factor levels
res <- mutate(res, !!newcol := 0.0)
}
res <- select(res, gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury, everything())
return(res)
} %>%
rename(t = Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
gather(GOSE, probability, starts_with("P(Y =")) %>%
mutate(
GOSE = factor(GOSE, labels = 1:8, levels = sprintf("P(Y = %s)", 1:8))
) %>%
arrange(gupi, t, GOSE) %>%
mutate_if(is.factor, as.character) %>%
mutate(
GOSE = GOSE %>% as.integer,
gupi = as.character(gupi)
)
saveRDS(df_posteriors, outputfile)
#!/usr/bin/env Rscript
library(tidyverse)
library(brms)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# read and process input data
df <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = Outcomes.DerivedCompositeGOSE %>%
unique %>%
as.numeric %>%
sort %>%
as.character,
ordered = TRUE
)
) %>%
mutate_if(is.character, factor)
# generate random seed from first 5 digits of the sha1 hash value
seed <- digest::digest(df, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
formula <- Outcomes.DerivedCompositeGOSE ~
s(Outcomes.DerivedCompositeGOSEDaysPostInjury) +
(I(Outcomes.DerivedCompositeGOSEDaysPostInjury^2) + Outcomes.DerivedCompositeGOSEDaysPostInjury + 1 | gupi)
mdl <- brms::brm(
formula,
data = df,
family = brms::cumulative("logit", threshold = "flexible"),
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
),
save_warmup = FALSE
)
df_new_data <- expand.grid(
gupi = df$gupi %>% unique,
Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out
) %>%
as_tibble %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
left_join(
df %>%
select(-Outcomes.DerivedCompositeGOSE, -Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
summarize_all(first) %>%
ungroup,
by = c("gupi")
)
df_posteriors <- predict(mdl, df_new_data, summary = TRUE) %>%
as_tibble() %>%
mutate(
gupi = df_new_data$gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury = df_new_data$Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>% {
# since not all levels are observed/fitted, need to extend predictions
# by zero for those categories, dplyr black magic
cols <- sprintf("P(Y = %s)", 1:8)
missing_cols <- cols[!(cols %in% names(as_tibble(.)))]
res <- .
for (newcol in missing_cols) {
# add missing factor levels
res <- mutate(res, !!newcol := 0.0)
}
res <- select(res, gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury, everything())
return(res)
} %>%
rename(t = Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
gather(GOSE, probability, starts_with("P(Y =")) %>%
mutate(
GOSE = factor(GOSE, labels = 1:8, levels = sprintf("P(Y = %s)", 1:8))
) %>%
arrange(gupi, t, GOSE) %>%
mutate_if(is.factor, as.character) %>%
mutate(
GOSE = GOSE %>% as.integer,
gupi = as.character(gupi)
)
saveRDS(df_posteriors, outputfile)
#!/usr/bin/env Rscript
library(tidyverse)
library(msm)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# filter deaths - we use exact times
# GOSE = 2 is too infreqeunt
df_gose <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = c(1, 3:8, 99), # 2 is too infrequent to be fitted
ordered = TRUE
) %>% as.character %>% as.integer
) %>% select(
gupi,
Outcomes.DerivedCompositeGOSE,
Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
arrange(
gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury
)
# translate to msm formatting
tmp <- df_gose %>%
rbind(tibble( # only predict GOSE at 180 days - takes too long otherwise
gupi = df_gose$gupi %>% unique,
Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out_msm,
Outcomes.DerivedCompositeGOSE = 99
)) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(
obstype = ifelse(Outcomes.DerivedCompositeGOSE == 1, 3, 1), # make death exactly observed transition
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = c(1, 3:8, 99),
labels = c(1:7, 99)
) %>% as.character %>% as.numeric
) %>%
group_by(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
# remove prediction states where we already have a true one
filter((n() == 1) | (Outcomes.DerivedCompositeGOSE != 99)) %>%
ungroup
# define transition matrix
Q <- matrix(0, nrow = 7, ncol = 7)
for (i in 1:7) {
for (j in 1:7) {
if (i == j - 1 & i != 1) {
Q[i, j] <- 1
}
if (i == j + 1) {
Q[i, j] <- 1
}
}
}
Q[2:7, 1] <- 1 # allow instantaneous deaths
# someone comes back from the dead - make sure death is last observation
fit <- msm(
Outcomes.DerivedCompositeGOSE ~ Outcomes.DerivedCompositeGOSEDaysPostInjury,
subject = tmp$gupi,
data = tmp,
obstype = tmp$obstype,
gen.inits = TRUE,
qmatrix = Q,
censor = 99,
pci = c(90, 270),
censor.states = 1:7,
control = list(
fnscale = config$fnscale_msm,
maxit = config$maxiter_msm,
trace = 2
)
)
# get fitted values at censored (prediction) states
fitted <- viterbi.msm(fit)
# store posteriors
df_posteriors <- tibble(
gupi = fitted$subject,
t = fitted$time
) %>%
cbind(
fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
) %>%
mutate("2" = 0) %>%
filter(t == config$t_out_msm) %>%
gather(GOSE, probability, as.character(1:8)) %>%
arrange(gupi, t, GOSE)
saveRDS(df_posteriors, outputfile)
---
title: "Imputing GOSE scores in CENTER-TBI"
date: "`r Sys.time()`"
statistician: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
collaborator: "David Menon (dkm13@cam.ac.uk)"
output: reportr::report
git-commit-hash: "`r system('git rev-parse --verify HEAD', intern=TRUE)`"
git-wd-clean: "`r ifelse(system('git diff-index --quiet HEAD') == 0, 'clean', 'file changes, working directory not clean!')`"
bibliography: "references.bib"
params:
data_dir: "../output/v1.1/data"
config_file: "../config.yml"
---
```{r setup-chunk, include=FALSE}
options(tidyverse.quiet = TRUE) # supresses filter/lag conflicts
require(tidyverse, quietly = TRUE)
config <- yaml::read_yaml(params$config_file)
set.seed(config$seed)
```
# Model descriptions
## Last-observation-carried-forward (LOCF)
We use last-observation-carried-forward (LOCF) as baseline method.
We define LOCF as the latest observation before the imputation time point and
aditionally allow a 14 day tolerance period, i.e., if there is a GOSE
observation within 14 days of the imputation time point it will always be used.
Since LOCF is not model-based there might be situations in which a GOSE is
available only at a later time point and no GOSE can be imputed.
LOCF imputations are therefore only available for a subset of the study
population.
This implies that all comparisons with different methods must be conducted
on the subset of individuals for which LOCF is defined (i.e., at least one GOSE
observation being available within the first 180 + 14 days).
## Mixed-effects model
A standard approach to longitudinal data analysis are mixed effects models
which model individual-specific deviations from a population mean trajectory.
Here, wer use a mixed-effects model with a spline function for the potential
non-linear population-mean trajectory (fixed time effect).
Patient-individual deviations are modelled as quadratic polynomials to allow
sufficient flexibility [^1].
[^1]: Expressed as R model formula the model is given by: $GOSE \sim s(t) + (1 + t + t^2 \,|\, ID)$.
Here, $s(t)$ denotes the spline function for the population-mean trajectory.
Quadratic and linear terms for individuals with only one observation are
simply shrunk towards the population mean zero but the addition of those effects
allows for both upward as well as downward facing (quadratic) deviation from the overall
population mean trajectory.
Baseline covariates can easily be added by including additional fixed effects
per covariate [^2].
[^2]: I.e., for $k$ additional baseline covariates $X_i, i=1\ldots k$ this is
represented by the model formula $GOSE \sim s(t) + X_1 + ... + X_k + (1 + t + t^2 \,|\, ID)$.
We account for the discretenss of GOSE by fitting a cumulative link ordinal
regression model.
Logit link and flexible intercepts were employed (no uniform spacing assumed).
For further details on cumulative link models see @Agresti2003.
We used the BRMS package @brms2017, @brms2018 for the R environment
for statistical computing @r2016 to fit the model using non-informative priors.
## Gaussian process model
A potential drawback of a mixed effects model to for the longitudinal
trajectories of individual patients's GOSE is the fact that the individual
deviations from the population mean GOSE are modelled globally
via polynomials (e.g. quadratic).
Since linear and quadratic terms are not identifyable for patients with only one
observed GOSE value, this implies large uncertainty over the patient-specific
effects for individuals with one or two observations only, and thus the overall
uncertainty of their fit is relatively large.
Gaussian process regression @rasmussen2006 allows to flexibly model both the
individual GOSE trajectories as well as the population mean.
The main advantage over a mixed effects model with subject specific quadratic
polynomail deviations from a population mean function is the fact that
model uncertainty at 6 months for individuals with a single observation close to
6 months is substantially decreased due to the *local* nature of Gaussian
process regression.
However, it is to be expected that the model will perform slightly worse in terms
of extrapolation since less structure is learnt than in the proposed mixed
effects model.
To model the discrete GOSE outcome, GP regression is applied treating GOSE values
as numerical and the final predictions are rounded to the closest integer in
1 to 8.
A squared exponential covariance function with shared lengthscale $l$,
variance $\sigma^2$, and residual variance (nugget term)
$\sigma_{resid.}^2$ was used since the
spacing of observation is roughly comparable between patients.
We model the latent population mean trajectory $\mu(t)$ as
the mean of a Gaussian process with pseudo observations at specific pivot points.
This approach maintains flexibility of the population mean function but avoids
the computational complexity of a hierarchical Gaussian process.
We chose as pivot points for the population mean
```{r print-knots}
config$gp$t_knots %>% as.numeric()
```
days after injury.
Again, baseline covariates may be added to the mean function of the Guassian
process in the same way as for the mixed model.
All parameters are estimated in a fully Bayesian fashion using the stan modelling
language @stan2017 and non-informative priors except for the lengthscale of the
squared exponential kernel.
Due to the sparseness of the data, the estimated lengthscale will naturally tend
towards extremely large value allowing for unrealistically long-range dependency
between observations.
We therefore limit the lengthscale to a maximum of 120 days (4 months) an impose
a normal prior $\mathcal{N}(60, 14)$.
## Multi-state model
Both Gaussian process regression as well as mixed effects models with spline
effects are non-linear regression techniques for longitudinal data.
Multi-state models @msm2011 offer an entirely different approach to modelling
the GOSE observations.
Here, it is assumed that the transitions between adjacent GOSE states can be
modelled as a Markov process and the transition intensities between adjacent
states are fitted to the observed data.
To account for the fact that transition intensities may vary with time,
piecewise constant transition intensities were fitted to the intervals
$[0, 90), [90, 270)$, and $[270, \infty)$ days after injury.
Due to the large number of transition intensities for an 8-level outcome scale,
inculsion of baseline covariates turned out to be numerically unstable.
We therefore only report the results for the model without baseline covariates.
The model was fitted using the msm package for R @msm2011.
# Model validation
We conducted a 3-fold cross validation to assess the quality of the imputed
GOSE values.
The test cases are given by the individuals with a GOSE measurement within 180 +/-
14 days after injury.
```{r load-test-data}
df_ground_truth <- c()
for (i in 1:config$folds) {
df_ground_truth <- rbind(
df_ground_truth,
# note that GOSE values in test data are not subject to mi - loading the
# first data set is sufficient!
sprintf("%s/validation/df_test_mi_1_fold_%i.rds", params$data_dir, i) %>%
readRDS() %>%
mutate(fold = i) %>%
select(gupi, fold, Outcomes.DerivedCompositeGOSEDaysPostInjury, Outcomes.DerivedCompositeGOSE)
)
}
```
```{r}
df_ground_truth %>%
ggplot(aes(Outcomes.DerivedCompositeGOSE)) +
geom_bar() +
facet_wrap(~fold) +
theme_bw() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank()
) +
ggtitle("Distribution of 6-months GOSE values of test cases per fold.")
```
The distribution of GOSE values in the respective three test sets is well
balanced.
```{r get-model-names}
modelnames <- lapply(
list.dirs(
sprintf("%s/validation/posteriors", params$data_dir),
recursive = FALSE
),
basename
) %>%
as.character()
cat("models detected:\n\r")
cat(sprintf(" %s\n\r", modelnames))
```
```{r load-posteriors}
df_model_posteriors <- c()
for (modelname in modelnames) {
for (i in 1:config$mi_m) {
for (j in 1:config$folds) {
tryCatch(
df_model_posteriors <- rbind(
df_model_posteriors,
sprintf("%s/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds",
params$data_dir, modelname, i, j) %>%
readRDS() %>%
mutate(imp = i, fold = j, model = modelname) %>%
select(gupi, fold, imp, model, everything()) %>%
filter(
gupi %in% (df_ground_truth %>% filter(fold == j) %>% .[["gupi"]])
)
),
error = function(e) print(e)
)
}
}
}
```
```{r compute-map-predictions}
df_predictions <- df_model_posteriors %>%
group_by(fold, model, gupi, t, GOSE) %>%
summarise(probability = mean(probability)) %>%
arrange(model, fold, gupi, t, GOSE) %>%
ungroup() %>%
filter(t == 180) %>%
select(-t) %>%
group_by(fold, model, gupi) %>%
do(
GOSE = .$GOSE,
prediction = rep(which.max(.$probability) %>%
ifelse(length(.) == 0, NA_integer_, .) %>%
ifelse(length(.) > 1, round(mean(.)), .), 8),
probability = .$probability
) %>%
unnest %>%
spread(GOSE, probability) %>%
right_join(
df_ground_truth %>%
mutate(
gupi = as.character(gupi),
GOSE = as.integer(Outcomes.DerivedCompositeGOSE)
),
by = c("gupi", "fold")
)
```
LOCF, by design, may not be able to provide imputed values whenever
no observation before 180 + 14 days after injury is observed.
Since there is reason to assume that these individuals 6-months GOSE will be
harder to predict,
a comparison of LOCF with the other methods must be conditional on the
set of individuals which allow LOCF imputation.
```{r non-locf-ids}
idx <- df_predictions %>%
filter(model == "locf", !complete.cases(.)) %>%
.[["gupi"]]
```
Overall, `r length(idx)` out of `r df_predictions %>% filter(model == "locf") %>% nrow` test cases cannot
be imputed with the LOCF approach.
## Comparison conditional on LOCF subset
We first consider results for the set of test cases which allow LOCF imputation.
### Confusion matrix
The accuracy of the imputed values can be assessed graphically by inspection
of the confusion matrix were imputed values are plotted against true GOSE values.
Note that since we do not have access to true 180 days GOSE values we used a
time window of $\pm$ 14 days around 180 days after injury.
```{r compute-confusion-matrices}
df_average_confusion_matrices <- df_predictions %>%
filter(!(gupi %in% idx)) %>%
group_by(fold, model) %>%
do(
confusion_matrix = caret::confusionMatrix(
data = factor(.$prediction, levels = 1:8),
reference = factor(.$GOSE, levels = 1:8)
) %>%
as.matrix %>% as_tibble %>%
mutate(`Predicted GOSE` = row_number() %>% as.character) %>%
gather(`True GOSE`, n, 1:8)
) %>%
unnest %>%
group_by(model, `Predicted GOSE`, `True GOSE`) %>%
summarize(n = mean(n)) %>%
ungroup
```
We first consider the raw, unnormalized counts of the respective confusion
matrices.
These matrices do take into account the non-uniform distribution of the
respective true 6-months GOSE values in the test set.
All values are averaged accross folds.
```{r confusion-matrix-locf, warning=FALSE, message=FALSE, fig.height=4.5, out.width=".9\\textwidth", fig.align='center'}
df_average_confusion_matrices %>%
group_by(model, `True GOSE`) %>%
ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = n)) +
geom_raster() +
geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
scale_fill_gradient(low = "white", high = "black") +
coord_fixed(expand = FALSE) +
labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
theme_bw() +
theme(
panel.grid = element_blank()
) +
facet_wrap(~model) +
ggtitle("Average confusion matrix accross folds (absolute counts)")
ggsave(filename = "confusion_matrices_locf.pdf", width = 7, height = 7)
ggsave(filename = "confusion_matrices_locf.png", width = 7, height = 7)
```
```{r confusion-matrix-col-perc-locf, warning=FALSE, message=FALSE, fig.height=4.5, out.width=".9\\textwidth", fig.align='center'}
df_average_confusion_matrices %>%
group_by(model, `True GOSE`) %>%
mutate(
`fraction (column)` = n / sum(n),
`fraction (column)` = ifelse(is.nan(`fraction (column)`), 0, `fraction (column)`)
) %>%
ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = `fraction (column)`)) +
geom_raster() +
geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
scale_fill_gradient("column fraction", low = "white", high = "black", limits = c(0, 1)) +
coord_fixed(expand = FALSE) +
labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
theme_bw() +
theme(
panel.grid = element_blank()
) +
facet_wrap(~model) +
ggtitle("Average confusion matrix accross folds (column normalized)")
ggsave(filename = "confusion_matrices_col_perc_locf.pdf", width = 7, height = 7)
ggsave(filename = "confusion_matrices_col_perc_locf.png", width = 7, height = 7)
```
### Accuracy
To summarize the overall performance of the respective approaches we consider
bias, mean absolute error (MAE), and root mean square error (RMSE).
```{r error-scores-stratified-locf, fig.align='center'}
df_predictions %>%
filter(!(gupi %in% idx)) %>%
group_by(model, fold, GOSE) %>%
summarize(
RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
MAE = mean(abs(GOSE - prediction), na.rm = TRUE),
bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE)
) %>%
gather(error, value, -model, -fold, -GOSE) %>%
group_by(GOSE, model, error) %>%
summarize(
mean = mean(value),
sd = sd(value)
) %>%
ungroup() %>%
ggplot(aes(GOSE, color = model)) +
geom_hline(yintercept = 0, color = "black") +
geom_line(aes(y = mean)) +
facet_wrap(~error) +
scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
theme_bw() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.text.x = element_text(angle = 66, hjust = 1)
) +
ggtitle("Summary measures by true GOSE value")
ggsave(filename = "imputation_error_stratified_locf.pdf", width = 7, height = 7)
ggsave(filename = "imputation_error_stratified_locf.png", width = 7, height = 7)
```
```{r error-scores-locf, fig.align='center'}
df_predictions %>%
filter(!(gupi %in% idx)) %>%
group_by(model, fold) %>%
summarize(
RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
MAE = mean(abs(GOSE - prediction), na.rm = TRUE),
bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE)
) %>%
ungroup %>%
gather(error, value, -model, -fold) %>%
ggplot(aes(model, value)) +
geom_hline(yintercept = 0, color = "black") +
geom_boxplot() +
facet_wrap(~error) +
scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
scale_x_discrete("") +
theme_bw() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.text.x = element_text(angle = 66, hjust = 1)
) +
ggtitle("Overall summary measures")
ggsave(filename = "imputation_error_locf.pdf", width = 7, height = 7)
ggsave(filename = "imputation_error_locf.png", width = 7, height = 7)
```
Overall, LOCF exhibits a negative bias which is mainly driven by the
underestimation of individuals with true GOSE of 6,7, or 8.
This also translates to a slightly worse MAE and RMSE - again, this is
mostly driven by a lack of fit for larger true GOSE values.
For lower true GOSE values, the fit is very similar to MSM and the mixed effects
models.
Gaussian process regression exhibits a bad fit for low true GOSE value which
are systematically overestimated.
LOCF performs worst in terms of overall bias, MAE, and RMSE in addition to not
being able to impute values for the entire set of test cases.
We therefore drop it from the compariosn and compare the remaining
methods on the entire test sets.
## Comparson on full test set
```{r compute-confusion-matrices-no-locf}
df_average_confusion_matrices <- df_predictions %>%
filter(!(model == "locf")) %>%
group_by(fold, model) %>%
do(
confusion_matrix = caret::confusionMatrix(
data = factor(.$prediction, levels = 1:8),
reference = factor(.$GOSE, levels = 1:8)
) %>%
as.matrix %>% as_tibble %>%
mutate(`Predicted GOSE` = row_number() %>% as.character) %>%
gather(`True GOSE`, n, 1:8)
) %>%
unnest %>%
group_by(model, `Predicted GOSE`, `True GOSE`) %>%
summarize(n = mean(n)) %>%
ungroup
```
```{r confusion-matrix, warning=FALSE, message=FALSE, fig.height=5, out.width=".9\\textwidth", fig.align='center'}
df_average_confusion_matrices %>%
group_by(model, `True GOSE`) %>%
ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = n)) +
geom_raster() +
geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
scale_fill_gradient(low = "white", high = "black") +
coord_fixed(expand = FALSE) +
labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
theme_bw() +
theme(
panel.grid = element_blank()
) +
facet_wrap(~model) +
ggtitle("Average confusion matrix accross folds (absolute counts)")
ggsave(filename = "confusion_matrices.pdf", width = 7, height = 7)
ggsave(filename = "confusion_matrices.png", width = 7, height = 7)
```
```{r confusion-matrix-col-perc, warning=FALSE, message=FALSE, fig.height=5, out.width=".9\\textwidth", fig.align='center'}
df_average_confusion_matrices %>%
group_by(model, `True GOSE`) %>%
mutate(
`fraction (column)` = n / sum(n),
`fraction (column)` = ifelse(is.nan(`fraction (column)`), 0, `fraction (column)`)
) %>%
ggplot(aes(`True GOSE`, `Predicted GOSE`, fill = `fraction (column)`)) +
geom_raster() +
geom_hline(yintercept = c(2, 4, 6) + .5, color = "black") +
geom_vline(xintercept = c(2, 4, 6) + .5, color = "black") +
scale_fill_gradient("column fraction", low = "white", high = "black", limits = c(0, 1)) +
coord_fixed(expand = FALSE) +
labs(x = "true GOSE", y = "imputed GOSE", fill = "") +
theme_bw() +
theme(
panel.grid = element_blank()
) +
facet_wrap(~model) +
ggtitle("Average confusion matrix accross folds (column normalized)")
ggsave(filename = "confusion_matrices_col_perc.pdf", width = 7, height = 7)
ggsave(filename = "confusion_matrices_col_perc.png", width = 7, height = 7)
```
### Accuracy
```{r error-scores-stratified, fig.align='center'}
df_predictions %>%
filter(model != "locf") %>%
group_by(model, fold, GOSE) %>%
summarize(
RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
MAE = mean(abs(GOSE - prediction), na.rm = TRUE),
bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE)
) %>%
gather(error, value, -model, -fold, -GOSE) %>%
group_by(GOSE, model, error) %>%
summarize(
mean = mean(value),
sd = sd(value)
) %>%
ungroup() %>%
ggplot(aes(GOSE, color = model)) +
geom_hline(yintercept = 0, color = "black") +
geom_line(aes(y = mean)) +
facet_wrap(~error) +
scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
theme_bw() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.text.x = element_text(angle = 66, hjust = 1)
) +
ggtitle("Summary measures by true GOSE value")
ggsave(filename = "imputation_error_stratified.pdf", width = 7, height = 7)
ggsave(filename = "imputation_error_stratified.png", width = 7, height = 7)
```
```{r error-scores, fig.align='center'}
df_predictions %>%
filter(model != "locf") %>%
group_by(model, fold) %>%
summarize(
RMSE = mean((GOSE - prediction)^2, na.rm = TRUE) %>% sqrt,
MAE = mean(abs(GOSE - prediction), na.rm = TRUE),
bias = mean(prediction, na.rm = TRUE) - mean(GOSE, na.rm = TRUE)
) %>%
ungroup %>%
gather(error, value, -model, -fold) %>%
ggplot(aes(model, value)) +
geom_hline(yintercept = 0, color = "black") +
geom_boxplot() +
facet_wrap(~error) +
scale_y_continuous(name= "", breaks = seq(-2, 8, .25)) +
scale_x_discrete("") +
theme_bw() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.text.x = element_text(angle = 66, hjust = 1)
) +
ggtitle("Overall summary measures")
ggsave(filename = "imputation_error.pdf", width = 7, height = 7)
ggsave(filename = "imputation_error.png", width = 7, height = 7)
```
```{r zip-figures}
system("zip figures.zip *.png *.pdf")
system("rm *.png *.pdf")
```
# Session Info
```{r session-info}
sessionInfo()
```
---
title: "Extract and prepare data"
statistician: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
collaborator: "David Menon (dkm13@cam.ac.uk)"
output: reportr::report
date: "`r Sys.time()`"
params:
datapath: "../data/v1.1"
max_lab_days: 3
seed: 42
---
```{r setup, include=FALSE}
library(tidyverse)
library(centertbi)
library(describr)
library(ggalluvial)
set.seed(params$seed)
df_labs <- readRDS(sprintf('%s/df_labs.rds', params$datapath))
df_baseline <- readRDS(sprintf('%s/df_baseline.rds', params$datapath))
df_ctmri <- readRDS(sprintf('%s/df_ctmri.rds', params$datapath))
df_imaging <- readRDS(sprintf('%s/df_imaging.rds', params$datapath))
df_gose <- readRDS(sprintf('%s/df_gose.rds', params$datapath))
```
# Extract data
```{r}
df_deaths <- df_baseline %>%
transmute(
gupi,
days = difftime(
Subject.DeathDate %>% as.character %>% ifelse(. == "", NA, .) %>% as.Date,
"1970-01-01",
units = "days"
) %>% as.numeric
) %>%
filter(complete.cases(.))
df_baseline <- df_baseline %>%
select(-Subject.DeathDate) %>%
mutate(
# combine suspect and definite to "yes"
InjuryHx.EDComplEventHypoxia = factor(InjuryHx.EDComplEventHypoxia,
levels = 0:2,
labels = c("no", "yes", "yes")
),
# combine suspect and definite to "yes"
InjuryHx.EDComplEventHypotension = factor(InjuryHx.EDComplEventHypotension,
levels = 0:2,
labels = c("no", "yes", "yes")
)
) %>%
# add lab values
left_join(
df_labs %>%
distinct %>% # filter duplicates
mutate(Labs.DLDate = as.Date(Labs.DLDate)) %>%
filter(
difftime(Labs.DLDate, "1970-01-01", units = "days") %>% as.numeric <= params$max_lab_days
) %>%
group_by(gupi) %>%
arrange(gupi, Labs.DLDate) %>%
summarise(
Labs.DLHemoglobingdL = na.omit(Labs.DLHemoglobingdL)[1],
Labs.DLGlucosemmolL = na.omit(Labs.DLGlucosemmolL)[1]
),
by = "gupi"
) %>%
# add imaging data
left_join(
df_imaging %>%
distinct() %>%
mutate(
Imaging.MarshallCTClassification = factor(
Imaging.MarshallCTClassification, levels = 1:6
),
Imaging.ExperimentDate = Imaging.ExperimentDate %>%
as.character %>% ifelse(. == "", NA, .) %>% as.Date
) %>%
arrange(gupi, Imaging.ExperimentDate) %>%
group_by(gupi) %>%
summarize(
Imaging.MarshallCTClassification = na.omit(Imaging.MarshallCTClassification)[1]
),
by = "gupi"
) %>%
# add CTMRI data
left_join(
df_ctmri %>%
distinct() %>%
mutate(
CTMRI.CTMRIDate = CTMRI.CTMRIDate %>%
as.character %>% ifelse(. == "", NA, .) %>% as.Date
) %>%
arrange(gupi, CTMRI.CTMRIDate) %>%
group_by(gupi) %>%
summarize(
CTMRI.CTSubarachnoidHem = na.omit(CTMRI.CTSubarachnoidHem)[1],
CTMRI.CTExtraduralHema = na.omit(CTMRI.CTExtraduralHema)[1],
) %>%
mutate(
CTMRI.CTSubarachnoidHem = factor(CTMRI.CTSubarachnoidHem, levels = 0:3, labels = c("no", "yes", "yes", "yes")),
CTMRI.CTExtraduralHema = factor(CTMRI.CTExtraduralHema, levels = 0:2, labels = c("no", "yes", "yes"))
),
by = "gupi"
)
```
## GOSE data
```{r gose-outcomes-ambiguity}
df_gose <- df_gose %>%
distinct %>%
filter(complete.cases(.)) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(Outcomes.DerivedCompositeGOSE = factor(Outcomes.DerivedCompositeGOSE, levels = 1:8))
```
This results in `r nrow(df_gose)` GOSE measurements of
`r df_gose %>% group_by(gupi) %>% n_groups()` individuals.
# Compile final datasets
* exclude all patient who do not survive first 6 months (no need to impute)
* exclude all patients with no GOSE measurement (no imputation)
```{r}
early_deaths_gupi <- df_deaths %>%
filter(days <= 180 - 14) %>%
.[["gupi"]]
df_gose <- df_gose %>%
filter(!(gupi %in% early_deaths_gupi)) %>%
ungroup %>%
# add confirmed deaths
rbind(
df_deaths %>%
filter(days > 180) %>%
rename(
Outcomes.DerivedCompositeGOSEDaysPostInjury = days
) %>%
mutate(
Outcomes.DerivedCompositeGOSE = 1
)
) %>%
# remove observations after death
group_by(gupi) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
filter(
{
first_death <- which(Outcomes.DerivedCompositeGOSE == 1)[1]
(is.na(first_death) | n() <= first_death)
}
) %>%
ungroup()
df_baseline <- df_baseline %>%
filter(
gupi %in% df_gose$gupi
)
```
This results in `r nrow(df_gose)` GOSE measurements of
`r df_gose %>% group_by(gupi) %>% n_groups()` individuals.
## Plausibility check
The only genuinly numerical variables are Age, Glucose_mmolL, and Hb_dL.
All other variables are factors and may therefore not contain outliers.
```{r}
df_baseline %>%
select(Subject.Age, Labs.DLGlucosemmolL, Labs.DLHemoglobingdL) %>%
gather(Variable, value) %>%
ggplot(aes(value)) +
geom_histogram() +
facet_wrap(~Variable, scales = "free") +
theme_bw() +
theme(panel.grid = element_blank())
```
Glucose is obviously left-skewed and a log transfrom might improve fits in linear
models.
```{r, echo=TRUE}
df_baseline %>%
select(gupi, Labs.DLHemoglobingdL) %>%
filter(Labs.DLHemoglobingdL > 50)
```
All values above 50 are considered implausible and set to missing
(probably meant as missing).
```{r}
df_baseline <- df_baseline %>%
mutate(
Labs.DLHemoglobingdL = ifelse(Labs.DLHemoglobingdL > 50, NA_real_, Labs.DLHemoglobingdL),
Labs.DLGlucosemmolL_log = log(Labs.DLGlucosemmolL)
) %>%
select(-Labs.DLGlucosemmolL)
```
```{r save}
saveRDS(df_gose, "df_gose.rds")
saveRDS(df_baseline, "df_baseline.rds")
```
# Describe data
## Baseline covariates
```{r, fig.height=8, fig.width=3.25, out.width=".45\\textwidth", warning=FALSE, message=FALSE}
thm <- theme_default(6)
thm$body$descriptor$style$plot_cell$plot_height <- unit(.1, "in")
thm$colwidths$levels <- unit(1, "in")
thm$colwidths$variables <- unit(1.15, "in")
df_baseline %>%
select(-gupi) %>%
describr(
theme_new = thm,
totals = TRUE
) %>%
describe_if(
is.numeric,
with = list(
dscr_mean_sd(),
dscr_median_q1_q3(),
dscr_min_max(),
dscr_histogram()
)
) %>%
describe_if(
is.factor,
with = list(dscr_freq(), dscr_factor_barchart())
) %>%
fit_to_height(height = 8)
```
## GOSE over time
GOSE times are rounded to the nearest 3 months period (averaged/rounded if multiple
values present).
```{r alluvial-data}
df_gose %>%
# compute GOSE per quarter
transmute(
gupi,
quarter = round(Outcomes.DerivedCompositeGOSEDaysPostInjury/90),
Outcomes.DerivedCompositeGOSE = as.numeric(as.character(Outcomes.DerivedCompositeGOSE))
) %>%
group_by(gupi, quarter) %>%
summarise(
GOSE = round(mean(Outcomes.DerivedCompositeGOSE)) %>% ifelse(. == 2, 1, .)
) %>%
ungroup() %>%
filter(
quarter %in% c(1, 2, 4)
) %>%
spread(quarter, GOSE) %>%
gather(quarter, GOSE, `1`, `2`, `4`) %>%
mutate(
GOSE = factor(GOSE, levels = 8:1) %>% forcats::fct_explicit_na(),
Timepoint = factor(quarter, levels = c("1", "2", "4"), labels = c("3 months", "6 months", "12 months"))
) ->
df_alluvial
```
```{r plot-alluvial-gradient, fig.height=4.5}
df_alluvial %>%
ggplot(aes(x = Timepoint, stratum = GOSE, alluvium = gupi, label = GOSE, fill = GOSE)) +
geom_flow(aes.flow = "backward") +
geom_stratum() +
scale_fill_manual(
values = c(viridisLite::viridis(7, direction = -1), "grey"),
breaks = c(8:3, 1, "(Missing)")
) +
theme_bw() +
theme(
panel.grid = element_blank()
)
ggsave("gose_alluvial_gradient_coloring.pdf", height = 5, width = 8)
ggsave("gose_alluvial_gradient_coloring.png", height = 5, width = 8)
```
```{r plot-alluvial-differential, fig.height=4.5}
df_alluvial %>%
ggplot(aes(x = Timepoint, stratum = GOSE, alluvium = gupi, label = GOSE, fill = GOSE)) +
geom_flow(aes.flow = "backward") +
geom_stratum() +
scale_fill_manual(
values = c(RColorBrewer::brewer.pal(7, "Set3"), "grey"),
breaks = c(8:3, 1, "(Missing)")
) +
theme_bw() +
theme(
panel.grid = element_blank()
)
ggsave("gose_alluvial_differential_coloring.pdf", height = 5, width = 8)
ggsave("gose_alluvial_differential_coloring.png", height = 5, width = 8)
```
```{r zip-figures}
system("zip figures.zip *.png *.pdf")
system("rm *.png *.pdf")
```
# Session Info
```{r session-info}
sessionInfo()
```
@article{msm2011,
author = {Jackson, Christopher H.},
doi = {10.18637/jss.v038.i08},
journal = {Journal of Statistical Software},
month = {jan},
number = {8},
pages = {1--28},
title = {{Multi-State Models for Panel Data: The msm Package for R}},
volume = {38},
year = {2011}
}
@article{r2016,
archivePrefix = {arXiv},
arxivId = {arXiv:1011.1669v3},
author = {Team, R Development Core and {R Development Core Team}, R},
doi = {10.1007/978-3-540-74686-7},
eprint = {arXiv:1011.1669v3},
isbn = {3{\_}900051{\_}00{\_}3},
issn = {3-900051-07-0},
journal = {R Foundation for Statistical Computing},
pmid = {16106260},
title = {{R: A Language and Environment for Statistical Computing}},
year = {2016}
}
@book{rasmussen2006,
abstract = {Gaussian processes (GPs) are natural generalisations of multivariate Gaussian random variables to infinite (countably or continuous) index sets. GPs have been applied in a large number of fields to a diverse range of ends, and very many deep theoretical analyses of various properties are available. This paper gives an introduction to Gaussian processes on a fairly elementary level with special emphasis on characteristics relevant in machine learning. It draws explicit connections to branches such as spline smoothing models and support vector machines in which similar ideas have been investigated. Gaussian process models are routinely used to solve hard machine learning problems. They are attractive because of their flexible non-parametric nature and computational simplicity. Treated within a Bayesian framework, very powerful statistical methods can be implemented which offer valid estimates of uncertainties in our predictions and generic model selection procedures cast as nonlinear optimization problems. Their main drawback of heavy computational scaling has recently been alleviated by the introduction of generic sparse approximations.13,78,31 The mathematical literature on GPs is large and often uses deep concepts which are not required to fully understand most machine learning applications. In this tutorial paper, we aim to present characteristics of GPs relevant to machine learning and to show up precise connections to other "kernel machines" popular in the community. Our focus is on a simple presentation, but references to more detailed sources are provided.},
archivePrefix = {arXiv},
arxivId = {026218253X},
author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
booktitle = {International Journal of Neural Systems},
doi = {10.1142/S0129065704001899},
eprint = {026218253X},
isbn = {026218253X},
issn = {0129-0657},
pmid = {15112367},
title = {{Gaussian Processes for Machine Learning}},
year = {2006}
}
@article{stan2017,
author = {Carpenter, B and Gelman, A and Hoffman, MD and Lee, D},
journal = {Journal of statistical software},
number = {1},
title = {{Stan: A Probabilistic Programming Language}},
volume = {76},
year = {2017}
}
@book{Agresti2003,
author = {Agresti, Alan},
editor = {{John Wiley {\&} Sons}},
title = {{Categorical data analysis}},
year = {2003}
}
@article{brms2018,
author = {B{\"{u}}rkner, Paul},
journal = {The R Journal},
number = {1},
pages = {395--411},
title = {{Advanced Bayesian Multilevel Modeling with the R Package brms}},
volume = {10},
year = {2018}
}
@article{brms2017,
author = {B{\"{u}}rkner, Paul},
journal = {Journal Of Statistical Software},
number = {1},
pages = {1--28},
title = {{brms: An R Package for Bayesian Multilevel Models Using Stan}},
volume = {80},
year = {2017}
}
@article{Steyerberg2008,
author = {Steyerberg, Ewout W and Mushkudiani, Nino and Perel, Pablo and Butcher, Isabella and Lu, Juan and McHugh, Gillian S and Murray, Gordon D and Marmarou, Anthony and Roberts, Ian and Habbema, J. Dik F and Maas, Andrew I. R},
doi = {10.1371/journal.pmed.0050165},
editor = {Singer, Mervyn},
journal = {PLoS Medicine},
month = {aug},
number = {8},
pages = {e165},
title = {{Predicting Outcome after Traumatic Brain Injury: Development and International Validation of Prognostic Scores Based on Admission Characteristics}},
url = {https://dx.plos.org/10.1371/journal.pmed.0050165},
volume = {5},
year = {2008}
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
data_dir <- args[[1]]
m <- as.numeric(args[[2]])
df_gose <- readRDS(sprintf("%s/df_gose.rds", data_dir))
for (i in 1:m) {
df_baseline <- readRDS(sprintf("%s/mi_baseline/df_baseline_mi_%i.rds", data_dir, i))
df <- df_gose %>%
left_join(df_baseline, by = "gupi")
saveRDS(df, sprintf("%s/imputation/df_mi_%i.rds", data_dir, i))
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
data_dir <- args[[1]]
m <- as.numeric(args[[2]])
k <- as.numeric(args[[3]])
seed <- as.integer(args[[4]])
set.seed(seed)
df_gose <- readRDS(sprintf("%s/df_gose.rds", data_dir))
# get ids of all subjects with a valid 6mo GOSE
test_candidate_ids <- df_gose %>%
group_by(gupi) %>%
filter(
# at least one non-death observation close to 6 months
any(
(Outcomes.DerivedCompositeGOSEDaysPostInjury >= 180 - 14) &
(Outcomes.DerivedCompositeGOSE != 1) &
(Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180 + 14)
),
# at least one observation not close to 6 months
any(
(Outcomes.DerivedCompositeGOSEDaysPostInjury < 180 - 14) |
(Outcomes.DerivedCompositeGOSEDaysPostInjury > 180 + 14)
)
) %>%
.[["gupi"]] %>%
unique %>%
as.character
df_validation <- tibble(
gupi = df_gose$gupi %>% unique,
valid_test = gupi %in% test_candidate_ids
) %>%
group_by(
valid_test
) %>%
mutate(
fold = sample(rep(1:k, ceiling(n()/k)))[1:n()]
) %>%
ungroup
# saveRDS(df_validation, "df_validation_splits.rds")
for (i in 1:m) {
df_baseline <- readRDS(sprintf("%s/mi_baseline/df_baseline_mi_%i.rds", data_dir, i))
for (j in 1:k) {
df_test_rows <- df_gose %>%
left_join(df_baseline, by = "gupi") %>%
left_join(df_validation, by = "gupi") %>%
filter(fold == j & valid_test) %>%
group_by(gupi) %>%
filter(
Outcomes.DerivedCompositeGOSEDaysPostInjury ==
Outcomes.DerivedCompositeGOSEDaysPostInjury[which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))]) %>%
ungroup() %>%
select(
gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
mutate(test = TRUE) %>%
right_join(
df_gose %>% select(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury),
by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")
) %>%
mutate(test = ifelse(is.na(test), FALSE, test))
df_test <- df_gose %>%
left_join(df_test_rows, by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")) %>%
filter(test) %>%
select(-test) %>%
left_join(df_baseline, by = "gupi")
df_train <- df_gose %>%
left_join(df_test_rows, by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")) %>%
filter(!test) %>%
select(-test) %>%
left_join(df_baseline, by = "gupi")
# plausi checks
if (nrow(df_test) + nrow(df_train) != nrow(df_gose)) stop("test/train does not add up")
if (any(df_test$Outcomes.DerivedCompositeGOSE == 1)) stop("test cases cannot be GOSE 1")
saveRDS(df_test, sprintf("%s/validation/df_test_mi_%i_fold_%i.rds", data_dir, i, j))
saveRDS(df_train, sprintf("%s/validation/df_train_mi_%i_fold_%i.rds", data_dir, i, j))
}
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
in_folder <- args[[1]]
out_folder <- args[[2]]
in_files <- sprintf("%s/%s", in_folder, list.files(path = in_folder, pattern = "*.csv"))
if (length(in_files) != 5) stop("must have exactly 5 input files")
for (f in in_files) {
tmp <- read.csv(f)
if ("Subject.Age" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_baseline.rds", out_folder))
if ("Outcomes.DerivedCompositeGOSE" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_gose.rds", out_folder))
if ("Labs.DLDate" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_labs.rds", out_folder))
if ("Imaging.MarshallCTClassification" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_imaging.rds", out_folder))
if ("CTMRI.CTSubarachnoidHem" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_ctmri.rds", out_folder))
}
library(tidyverse)
library(mice)
args <- commandArgs(trailingOnly = TRUE)
df_baseline <- readRDS(args[[1]])
out_dir <- args[[2]]
m <- as.numeric(args[[3]])
maxiter <- as.numeric(args[[4]])
seed <- as.numeric(args[[5]])
mi_baseline <- df_baseline %>%
select(-gupi) %>%
mutate_if(is.character, factor) %>%
mice::mice(
m = m,
seed = seed,
maxit = maxiter
)
# save the individual complete data sets
for (i in 1:m) {
mice::complete(mi_baseline, action = i, include = FALSE) %>%
mutate_if(is.factor, as.character) %>%
as_tibble() %>%
mutate(
gupi = df_baseline$gupi
) %>%
select(gupi, everything()) %>%
saveRDS(file = sprintf("%s/df_baseline_mi_%i.rds", out_dir, i))
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
modelimputations_dir <- args[[1]]
gosefile <- args[[2]]
outputfile <- args[[3]]
set.seed(42)
df_gose <- readRDS(gosefile)
posteriors <- list.files(modelimputations_dir, pattern = "*.rds")
# load and combine all posteriros over different (baseline) imputations
df_imputations <- c()
for (i in 1:length(posteriors)) {
df_imputations <- rbind(
df_imputations,
readRDS(sprintf("%s/%s", modelimputations_dir, posteriors[i])) %>%
mutate(
imp = i
)
)
}
df_imputations <- df_imputations %>%
group_by(gupi, t, GOSE) %>%
summarize(probability = mean(probability, na.rm = TRUE)) %>%
ungroup
df_imputations <- df_imputations %>%
select(-t) %>%
filter(complete.cases(.)) %>%
group_by(gupi) %>%
mutate(
Subject.DerivedImputed180DaysGOSE = which.max(probability) %>%
{
if (length(.) == 0) { # fix all probs. NA case
return(NA)
} else {
return(.)
}
} %>%
sample(1) %>% # need to resolve ambiguous gose predictions !
rep(n()),
GOSE = factor(
GOSE,
levels = 1:8,
labels = sprintf("Subject.DerivedImputed180DaysGOSE_%i_probability", 1:8)
)
) %>%
ungroup %>%
spread(GOSE, probability)
# extract per protocol gose observations
df_per_protocol_gose <- df_gose %>%
filter(
Outcomes.DerivedCompositeGOSEDaysPostInjury >= 5*30,
Outcomes.DerivedCompositeGOSEDaysPostInjury <= 8*30
) %>%
# pick closest to 180
group_by(gupi) %>%
summarize(
closest_per_protocol_GOSE = Outcomes.DerivedCompositeGOSE[
which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))
]
)
df_imputations <- df_imputations %>%
left_join(df_per_protocol_gose, by = "gupi") %>%
mutate(
Subject.GOSE6monthEndpointDerived = ifelse(
is.na(closest_per_protocol_GOSE),
Subject.DerivedImputed180DaysGOSE,
closest_per_protocol_GOSE
)
) %>%
select(
gupi,
Subject.GOSE6monthEndpointDerived,
Subject.DerivedImputed180DaysGOSE,
everything()
) %>%
select(
-closest_per_protocol_GOSE
)
write_csv(df_imputations, outputfile)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment