Commit 97737a15 authored by Kevin's avatar Kevin

initial commit

parents
output
.snakemake
.Rproj.user
.cache
*.Rproj
*.rds
*.Rhistory
\ No newline at end of file
singularity: "docker://kkmann/gose-6mo-imputation@sha256:02a8f55c53d6f4917193abda823938e1ad7e8891fd6c392bd94f509e058eb34b"
configfile: "config.yml"
rule import_neurobot_csv:
output:
"data/{version}/df_baseline.rds",
"data/{version}/df_ctmri.rds",
"data/{version}/df_imaging.rds",
"data/{version}/df_labs.rds",
"data/{version}/df_gose.rds"
shell:
"""
Rscript scripts/import_neurobot_data.R data/{wildcards.version} data/{wildcards.version}
"""
rule prepare_data:
input:
rules.import_neurobot_csv.output,
markdown = "reports/prepare_data.Rmd"
output:
"output/{version}/data/df_gose.rds",
"output/{version}/data/df_baseline.rds",
"output/{version}/prepare_data.pdf",
figures = "output/{version}/prepare_data_figures.zip"
shell:
"""
mkdir -p output/{wildcards.version}/data
Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(datapath = '../data/{wildcards.version}', max_lab_days = {config[max_lab_days]}, seed = {config[seed]}))"
mv reports/*.rds output/{wildcards.version}/data
mv reports/figures.zip {output.figures}
"""
rule impute_baseline:
input:
rules.prepare_data.output
output:
["output/{version}/data/mi_baseline/df_baseline_mi_%i.rds" % i for i in range(1, config["mi_m"] + 1)]
shell:
"""
mkdir -p output/{wildcards.version}/data/mi_baseline
Rscript scripts/impute_baseline.R output/{wildcards.version}/data/df_baseline.rds output/{wildcards.version}/data/mi_baseline {config[mi_m]} {config[mi_maxiter]} {config[seed]}
"""
rule generate_validation_data:
input:
rules.prepare_data.output,
rules.impute_baseline.output
output:
["output/{version}/data/validation/df_%s_mi_%i_fold_%i.rds" % (s, i, j)
for s in ("train", "test")
for i in range(1, config["mi_m"] + 1)
for j in range(1, config["folds"] + 1)
]
shell:
"""
mkdir -p output/{wildcards.version}/data/validation
Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
"""
# adjust threads by model type
def get_rule_threads(wildcards):
if wildcards.model in ("locf", "msm"):
return 1
else:
return config["stan"]["chains"]
rule fit_model_validation_set:
input:
"config.yml",
"output/{version}/data/validation/df_train_mi_{i}_fold_{j}.rds"
output:
"output/{version}/data/validation/posteriors/{model}/df_posterior_mi_{i}_fold_{j}.rds"
threads:
get_rule_threads
shell:
"""
mkdir -p output/{wildcards.version}/data/validation/posteriors/{wildcards.model}
Rscript models/{wildcards.model}/fit.R {input[1]} {output}
"""
# helper rule to just build all posterior datasets
rule model_posteriors:
input:
["output/v1.1/data/validation/posteriors/%s/df_posterior_mi_%i_fold_%i.rds" % (m, i, j)
for m in ("locf", "msm", "gp", "gp_nb", "mm", "mm_nb")
for i in range(1, config["mi_m"] + 1)
for j in range(1, config["folds"] + 1)
]
rule model_assessment:
input:
pop_report = rules.prepare_data.output,
posteriors = rules.model_posteriors.input,
markdown = "reports/model_assessment.Rmd"
output:
pdf = "output/{version}/model_assessment.pdf",
figures = "output/{version}/model_assessment_figures.zip"
shell:
"""
mkdir -p output/{wildcards.version}
Rscript -e "rmarkdown::render('{input.markdown}', output_dir = 'output/{wildcards.version}', params = list(data_dir = '../output/{wildcards.version}/data', config_file = '../config.yml'))"
mv reports/figures.zip {output.figures}
"""
rule generate_imputation_data:
input:
rules.prepare_data.output,
rules.impute_baseline.output
output:
["output/{version}/data/imputation/df_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1)
]
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation
Rscript scripts/generate_imputation_data.R output/{wildcards.version}/data {config[mi_m]}
"""
# rules for imputing on entire dataset
rule model_impute:
input:
"config.yml",
"output/{version}/data/imputation/df_mi_{i}.rds"
output:
"output/{version}/data/imputation/{model}/df_gose_imputed_mi_{i}.rds"
threads:
get_rule_threads
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
Rscript models/{wildcards.model}/fit.R {input[1]} {output}
"""
# final reported values are a combination of the imputed and per-protocol
# observed ones
rule post_process_imputations:
input:
"config.yml",
["output/{version}/data/imputation/{model}/df_gose_imputed_mi_%i.rds" % i
for i in range(1, config["mi_m"] + 1)
]
output:
"output/{version}/data/imputation/{model}/df_gose_imputed.csv"
shell:
"""
mkdir -p output/{wildcards.version}/data/imputation/{wildcards.model}
Rscript scripts/post_process_imputations.R output/{wildcards.version}/data/imputation/{wildcards.model} output/{wildcards.version}/data/df_gose.rds {output}
"""
seed:
42
max_lab_days:
3
mi_m:
2
mi_maxiter:
5
folds:
2
t_out:
180
t_buffer:
14
t_out_msm:
180
maxiter_msm:
20
fnscale_msm:
12000
stan:
iter: 10
warmup: 10
chains: 2
adapt_delta: .95
max_treedepth: 10
gp:
t_knots: [45, 90, 180, 270, 360]
#!/usr/bin/env Rscript
library(tidyverse)
library(rstan)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
modelfile <- "models/gp/gp.stan"
config <- yaml::read_yaml("config.yml")
df <- readRDS(inputfile)
model_matrix_baseline <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
select(
-Outcomes.DerivedCompositeGOSEDaysPostInjury, -Outcomes.DerivedCompositeGOSE
) %>%
summarize_all(first) %>%
model.matrix(~ . - gupi - 1, data = .)
df_start_end_index <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(
tmp = row_number()
) %>%
group_by(gupi) %>%
summarize(
start_row = first(tmp),
end_row = last(tmp)
)
df_gose <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
transmute(
gupi = gupi %>% factor %>% as.numeric,
Outcomes.DerivedCompositeGOSE = as.numeric(Outcomes.DerivedCompositeGOSE),
Outcomes.DerivedCompositeGOSEDaysPostInjury = Outcomes.DerivedCompositeGOSEDaysPostInjury
)
stan_data <- list(
N = nrow(df),
n_pat = df_gose$gupi %>% unique %>% length,
pat_start_ind = df_start_end_index$start_row,
pat_end_ind = df_start_end_index$end_row,
t = df_gose$Outcomes.DerivedCompositeGOSEDaysPostInjury,
y = df_gose$Outcomes.DerivedCompositeGOSE,
id = df_gose$gupi,
modelmatrix = model_matrix_baseline,
p_cov = ncol(model_matrix_baseline),
t_out_ = config$t_out %>% as.matrix,
n_t_out = length(config$t_out),
knots = config$gp$t_knots,
n_knots = length(config$gp$t_knots)
)
seed <- digest::digest(stan_data, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
rstan::rstan_options(auto_write = TRUE)
mdl <- rstan::stan(
file = modelfile,
data = stan_data,
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
)
)
df_posteriors <- rstan::extract(mdl)$gose_out %>% # (n_sample, n_pat, n_t)
# relative frequencies
apply(
MARGIN = c(2, 3),
function(x) table(factor(x, levels = 1:8)) / length(x) %>% as.numeric()
) %>% # (8, n_pat, n_t) -> permute
aperm(c(2, 1, 3)) %>%
as.data.frame %>%
as_tibble %>%
mutate(
gupi = row_number()
) %>%
gather(GOSE.t, probability, -gupi) %>%
separate(col = GOSE.t, c("GOSE", "t"), sep = "\\.", convert = TRUE) %>%
mutate(
t = stan_data$t_out_[t, 1],
gupi = (df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
.[["gupi"]] %>%
unique)[gupi]
) %>%
select(gupi, t, GOSE, probability) %>%
arrange(gupi, t)
saveRDS(df_posteriors, outputfile)
data {
// total number of observations
int<lower=1> N;
int<lower=1> n_pat;
// start / stop index per patient
int<lower=1, upper=N> pat_start_ind[n_pat];
int<lower=1, upper=N> pat_end_ind[n_pat];
// actual data:
real t[N]; // sampling times
real y[N]; // outcomes
int id[N]; // id as integers
int<lower=1> n_knots; // number of knots to model population mean
real<lower=0.0> knots[n_knots]; // knot positions
int n_t_out; // number of output points
real t_out_[n_t_out, 1]; // output times
int p_cov; // number covariates (model matrix)
matrix[n_pat, p_cov] modelmatrix;
}
transformed data {
real t_out[n_t_out] = t_out_[1:n_t_out, 1];
}
parameters {
// standard deviation of the population mean process
real<lower=0> s_pop_mean;
// lengthscale of the population mean process
// real<lower=0> l_pop_mean;
real<lower=.5, upper=8.5> knots_y[n_knots];
// standard deviation of the population mean process
real<lower=0> s;
// lengthscale of the population mean process
real<lower=30, upper=120> l;
// nugget term standard deviation
real<lower=.01> s_nugg;
vector[p_cov] beta;
}
transformed parameters {
matrix[n_knots, n_knots] K_knots_inv = inverse(cov_exp_quad(knots, s_pop_mean, l));
vector[n_pat] lin_pred = modelmatrix * beta;
}
model {
// priors
s ~ normal(4, 3);
s_pop_mean ~ normal(1, 1);
l ~ normal(60, 14);
s_nugg ~ normal(0, .1);
beta ~ normal(0, 1);
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_knots] K_latent_12 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = K_latent_12 * K_knots_inv * to_vector(knots_y);
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
matrix[n_sub, n_sub] Sigma = cov_exp_quad(t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub));
gose ~ multi_normal(
mu_sub + rep_vector(lin_pred[i], n_sub),
Sigma
);
}
}
generated quantities {
real gose_out[n_pat, n_t_out];
row_vector[n_t_out] tmp;
matrix[n_t_out, n_t_out] Sigma;
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_sub] K_oo_inv = inverse(cov_exp_quad(t_sub, t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub)));
matrix[n_t_out, n_sub] K_no = cov_exp_quad(t_out, t_sub, s, l);
matrix[n_t_out, n_knots] K_latent_12 = cov_exp_quad(t_out, knots, s_pop_mean, l);
vector[n_t_out] mu_out = rep_vector(lin_pred[i], n_t_out) + K_latent_12 * K_knots_inv * to_vector(knots_y);
matrix[n_sub, n_knots] K_latent_12_2 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = rep_vector(lin_pred[i], n_sub) + K_latent_12_2 * K_knots_inv * to_vector(knots_y);
// observed values
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
// posterior covariance matrix
Sigma = cov_exp_quad(t_out, t_out, s, l) - K_no * K_oo_inv * K_no' + diag_matrix(rep_vector(.001, n_t_out));
tmp = multi_normal_rng(
mu_out + K_no * K_oo_inv * (gose - mu_sub),
Sigma
)';
// discretize
for (j in 1:n_t_out) {
gose_out[i, j] = round(tmp[j]);
if (gose_out[i, j] > 8) {
gose_out[i, j] = 8;
}
if (gose_out[i, j] < 1) {
gose_out[i, j] = 1;
}
}
}
}
#!/usr/bin/env Rscript
library(tidyverse)
library(rstan)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
modelfile <- "models/gp_nb/gp_nb.stan"
config <- yaml::read_yaml("config.yml")
df <- readRDS(inputfile)
model_matrix_baseline <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
select(
-Outcomes.DerivedCompositeGOSEDaysPostInjury, -Outcomes.DerivedCompositeGOSE
) %>%
summarize_all(first) %>%
model.matrix(~ . - gupi - 1, data = .)
df_start_end_index <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(
tmp = row_number()
) %>%
group_by(gupi) %>%
summarize(
start_row = first(tmp),
end_row = last(tmp)
)
df_gose <- df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
transmute(
gupi = gupi %>% factor %>% as.numeric,
Outcomes.DerivedCompositeGOSE = as.numeric(Outcomes.DerivedCompositeGOSE),
Outcomes.DerivedCompositeGOSEDaysPostInjury = Outcomes.DerivedCompositeGOSEDaysPostInjury
)
stan_data <- list(
N = nrow(df),
n_pat = df_gose$gupi %>% unique %>% length,
pat_start_ind = df_start_end_index$start_row,
pat_end_ind = df_start_end_index$end_row,
t = df_gose$Outcomes.DerivedCompositeGOSEDaysPostInjury,
y = df_gose$Outcomes.DerivedCompositeGOSE,
id = df_gose$gupi,
modelmatrix = model_matrix_baseline,
p_cov = ncol(model_matrix_baseline),
t_out_ = config$t_out %>% as.matrix,
n_t_out = length(config$t_out),
knots = config$gp$t_knots,
n_knots = length(config$gp$t_knots)
)
seed <- digest::digest(stan_data, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
rstan::rstan_options(auto_write = TRUE)
mdl <- rstan::stan(
file = modelfile,
data = stan_data,
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
)
)
df_posteriors <- rstan::extract(mdl)$gose_out %>% # (n_sample, n_pat, n_t)
# relative frequencies
apply(
MARGIN = c(2, 3),
function(x) table(factor(x, levels = 1:8)) / length(x) %>% as.numeric()
) %>% # (8, n_pat, n_t) -> permute
aperm(c(2, 1, 3)) %>%
as.data.frame %>%
as_tibble %>%
mutate(
gupi = row_number()
) %>%
gather(GOSE.t, probability, -gupi) %>%
separate(col = GOSE.t, c("GOSE", "t"), sep = "\\.", convert = TRUE) %>%
mutate(
t = stan_data$t_out_[t, 1],
gupi = (df %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
.[["gupi"]] %>%
unique)[gupi]
) %>%
select(gupi, t, GOSE, probability) %>%
arrange(gupi, t)
saveRDS(df_posteriors, outputfile)
data {
// total number of observations
int<lower=1> N;
int<lower=1> n_pat;
// start / stop index per patient
int<lower=1, upper=N> pat_start_ind[n_pat];
int<lower=1, upper=N> pat_end_ind[n_pat];
// actual data:
real t[N]; // sampling times
real y[N]; // outcomes
int id[N]; // id as integers
int<lower=1> n_knots; // number of knots to model population mean
real<lower=0.0> knots[n_knots]; // knot positions
int n_t_out; // number of output points
real t_out_[n_t_out, 1]; // output times
int p_cov; // number covariates (model matrix)
matrix[n_pat, p_cov] modelmatrix;
}
transformed data {
real t_out[n_t_out] = t_out_[1:n_t_out, 1];
}
parameters {
// standard deviation of the population mean process
real<lower=0> s_pop_mean;
// lengthscale of the population mean process
// real<lower=0> l_pop_mean;
real<lower=.5, upper=8.5> knots_y[n_knots];
// standard deviation of the population mean process
real<lower=0> s;
// lengthscale of the population mean process
real<lower=30, upper=120> l;
// nugget term standard deviation
real<lower=.01> s_nugg;
}
transformed parameters {
matrix[n_knots, n_knots] K_knots_inv = inverse(cov_exp_quad(knots, s_pop_mean, l));
}
model {
// priors
s ~ normal(4, 3);
s_pop_mean ~ normal(1, 1);
l ~ normal(60, 14);
s_nugg ~ normal(0, .1);
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_knots] K_latent_12 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = K_latent_12 * K_knots_inv * to_vector(knots_y);
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
matrix[n_sub, n_sub] Sigma = cov_exp_quad(t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub));
gose ~ multi_normal(
mu_sub,
Sigma
);
}
}
generated quantities {
real gose_out[n_pat, n_t_out];
row_vector[n_t_out] tmp;
matrix[n_t_out, n_t_out] Sigma;
for (i in 1:n_pat) {
int n_sub = pat_end_ind[i] - pat_start_ind[i] + 1;
// latent mean process
real t_sub[n_sub] = t[pat_start_ind[i]:pat_end_ind[i]];
matrix[n_sub, n_sub] K_oo_inv = inverse(cov_exp_quad(t_sub, t_sub, s, l) + diag_matrix(rep_vector(pow(s_nugg, 2.0), n_sub)));
matrix[n_t_out, n_sub] K_no = cov_exp_quad(t_out, t_sub, s, l);
matrix[n_t_out, n_knots] K_latent_12 = cov_exp_quad(t_out, knots, s_pop_mean, l);
vector[n_t_out] mu_out = K_latent_12 * K_knots_inv * to_vector(knots_y);
matrix[n_sub, n_knots] K_latent_12_2 = cov_exp_quad(t_sub, knots, s_pop_mean, l);
vector[n_sub] mu_sub = K_latent_12_2 * K_knots_inv * to_vector(knots_y);
// observed values
vector[n_sub] gose = to_vector(y[pat_start_ind[i]:pat_end_ind[i]]);
// posterior covariance matrix
Sigma = cov_exp_quad(t_out, t_out, s, l) - K_no * K_oo_inv * K_no' + diag_matrix(rep_vector(.001, n_t_out));
tmp = multi_normal_rng(
mu_out + K_no * K_oo_inv * (gose - mu_sub),
Sigma
)';
// discretize
for (j in 1:n_t_out) {
gose_out[i, j] = round(tmp[j]);
if (gose_out[i, j] > 8) {
gose_out[i, j] = 8;
}
if (gose_out[i, j] < 1) {
gose_out[i, j] = 1;
}
}
}
}
#!/usr/bin/env Rscript
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# read and process input data
df <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = c(1, 3:8), # 2 is too infrequent to be fitted
ordered = TRUE
)
)
impute_locf <- function(t, GOSE, tolerance, t_pred) {
res <- numeric(length(t_pred))
res[] <- NA_integer_
for (i in 1:length(t_pred)) {
res[i] <- GOSE[t < t_pred[i] + tolerance] %>%
tail(1) %>% # use last if multiple available
ifelse(length(.) == 0, NA, .)
}
return(res %>% as.integer)
}
df_posteriors <- df %>%
group_by(gupi) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSE) %>%
do(
t = config$t_out,
GOSE_predicted = impute_locf(.$Outcomes.DerivedCompositeGOSEDaysPostInjury, .$Outcomes.DerivedCompositeGOSE %>% as.character %>% as.integer, config$t_buffer, config$t_out)
) %>%
unnest %>%
mutate(
`1` = (GOSE_predicted == 1) %>% as.numeric,
`2` = (GOSE_predicted == 2) %>% as.numeric,
`3` = (GOSE_predicted == 3) %>% as.numeric,
`4` = (GOSE_predicted == 4) %>% as.numeric,
`5` = (GOSE_predicted == 5) %>% as.numeric,
`6` = (GOSE_predicted == 6) %>% as.numeric,
`7` = (GOSE_predicted == 7) %>% as.numeric,
`8` = (GOSE_predicted == 8) %>% as.numeric
) %>%
select(-GOSE_predicted) %>%
gather(GOSE, probability, `1`:`8`) %>%
mutate(
gupi = as.character(gupi),
GOSE = as.integer(GOSE)
) %>%
arrange(gupi, t, GOSE)
saveRDS(df_posteriors, outputfile)
#!/usr/bin/env Rscript
library(tidyverse)
library(brms)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# read and process input data
df <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = Outcomes.DerivedCompositeGOSE %>%
unique %>%
as.numeric %>%
sort %>%
as.character,
ordered = TRUE
)
) %>%
mutate_if(is.character, factor)
# generate random seed from first 5 digits of the sha1 hash value
seed <- digest::digest(df, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
formula <- Outcomes.DerivedCompositeGOSE ~
s(Outcomes.DerivedCompositeGOSEDaysPostInjury) +
(I(Outcomes.DerivedCompositeGOSEDaysPostInjury^2) + Outcomes.DerivedCompositeGOSEDaysPostInjury + 1 | gupi) +
. - gupi - Outcomes.DerivedCompositeGOSEDaysPostInjury - Outcomes.DerivedCompositeGOSE
mdl <- brms::brm(
formula,
data = df,
family = brms::cumulative("logit", threshold = "flexible"),
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
),
save_warmup = FALSE
)
df_new_data <- expand.grid(
gupi = df$gupi %>% unique,
Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out
) %>%
as_tibble %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
left_join(
df %>%
select(-Outcomes.DerivedCompositeGOSE, -Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
summarize_all(first) %>%
ungroup,
by = c("gupi")
)
df_posteriors <- predict(mdl, df_new_data, summary = TRUE) %>%
as_tibble() %>%
mutate(
gupi = df_new_data$gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury = df_new_data$Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>% {
# since not all levels are observed/fitted, need to extend predictions
# by zero for those categories, dplyr black magic
cols <- sprintf("P(Y = %s)", 1:8)
missing_cols <- cols[!(cols %in% names(as_tibble(.)))]
res <- .
for (newcol in missing_cols) {
# add missing factor levels
res <- mutate(res, !!newcol := 0.0)
}
res <- select(res, gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury, everything())
return(res)
} %>%
rename(t = Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
gather(GOSE, probability, starts_with("P(Y =")) %>%
mutate(
GOSE = factor(GOSE, labels = 1:8, levels = sprintf("P(Y = %s)", 1:8))
) %>%
arrange(gupi, t, GOSE) %>%
mutate_if(is.factor, as.character) %>%
mutate(
GOSE = GOSE %>% as.integer,
gupi = as.character(gupi)
)
saveRDS(df_posteriors, outputfile)
#!/usr/bin/env Rscript
library(tidyverse)
library(brms)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# read and process input data
df <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = Outcomes.DerivedCompositeGOSE %>%
unique %>%
as.numeric %>%
sort %>%
as.character,
ordered = TRUE
)
) %>%
mutate_if(is.character, factor)
# generate random seed from first 5 digits of the sha1 hash value
seed <- digest::digest(df, algo = "sha1") %>%
substr(1, 5) %>%
strtoi(base = 16)
cat(sprintf("seed: %i\n\r", seed))
formula <- Outcomes.DerivedCompositeGOSE ~
s(Outcomes.DerivedCompositeGOSEDaysPostInjury) +
(I(Outcomes.DerivedCompositeGOSEDaysPostInjury^2) + Outcomes.DerivedCompositeGOSEDaysPostInjury + 1 | gupi)
mdl <- brms::brm(
formula,
data = df,
family = brms::cumulative("logit", threshold = "flexible"),
chains = config$stan$chains,
cores = config$stan$chains,
seed = seed,
iter = config$stan$warmup + config$stan$iter,
warmup = config$stan$warmup,
refresh = 1,
control = list(
max_treedepth = config$stan$max_treedepth,
adapt_delta = config$stan$adapt_delta
),
save_warmup = FALSE
)
df_new_data <- expand.grid(
gupi = df$gupi %>% unique,
Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out
) %>%
as_tibble %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
left_join(
df %>%
select(-Outcomes.DerivedCompositeGOSE, -Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi) %>%
summarize_all(first) %>%
ungroup,
by = c("gupi")
)
df_posteriors <- predict(mdl, df_new_data, summary = TRUE) %>%
as_tibble() %>%
mutate(
gupi = df_new_data$gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury = df_new_data$Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>% {
# since not all levels are observed/fitted, need to extend predictions
# by zero for those categories, dplyr black magic
cols <- sprintf("P(Y = %s)", 1:8)
missing_cols <- cols[!(cols %in% names(as_tibble(.)))]
res <- .
for (newcol in missing_cols) {
# add missing factor levels
res <- mutate(res, !!newcol := 0.0)
}
res <- select(res, gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury, everything())
return(res)
} %>%
rename(t = Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
gather(GOSE, probability, starts_with("P(Y =")) %>%
mutate(
GOSE = factor(GOSE, labels = 1:8, levels = sprintf("P(Y = %s)", 1:8))
) %>%
arrange(gupi, t, GOSE) %>%
mutate_if(is.factor, as.character) %>%
mutate(
GOSE = GOSE %>% as.integer,
gupi = as.character(gupi)
)
saveRDS(df_posteriors, outputfile)
#!/usr/bin/env Rscript
library(tidyverse)
library(msm)
args <- commandArgs(trailingOnly = TRUE)
inputfile <- args[[1]]
outputfile <- args[[2]]
config <- yaml::read_yaml("config.yml")
# filter deaths - we use exact times
# GOSE = 2 is too infreqeunt
df_gose <- readRDS(inputfile) %>%
mutate(
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = c(1, 3:8, 99), # 2 is too infrequent to be fitted
ordered = TRUE
) %>% as.character %>% as.integer
) %>% select(
gupi,
Outcomes.DerivedCompositeGOSE,
Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
arrange(
gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury
)
# translate to msm formatting
tmp <- df_gose %>%
rbind(tibble( # only predict GOSE at 180 days - takes too long otherwise
gupi = df_gose$gupi %>% unique,
Outcomes.DerivedCompositeGOSEDaysPostInjury = config$t_out_msm,
Outcomes.DerivedCompositeGOSE = 99
)) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(
obstype = ifelse(Outcomes.DerivedCompositeGOSE == 1, 3, 1), # make death exactly observed transition
Outcomes.DerivedCompositeGOSE = factor(
Outcomes.DerivedCompositeGOSE,
levels = c(1, 3:8, 99),
labels = c(1:7, 99)
) %>% as.character %>% as.numeric
) %>%
group_by(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
# remove prediction states where we already have a true one
filter((n() == 1) | (Outcomes.DerivedCompositeGOSE != 99)) %>%
ungroup
# define transition matrix
Q <- matrix(0, nrow = 7, ncol = 7)
for (i in 1:7) {
for (j in 1:7) {
if (i == j - 1 & i != 1) {
Q[i, j] <- 1
}
if (i == j + 1) {
Q[i, j] <- 1
}
}
}
Q[2:7, 1] <- 1 # allow instantaneous deaths
# someone comes back from the dead - make sure death is last observation
fit <- msm(
Outcomes.DerivedCompositeGOSE ~ Outcomes.DerivedCompositeGOSEDaysPostInjury,
subject = tmp$gupi,
data = tmp,
obstype = tmp$obstype,
gen.inits = TRUE,
qmatrix = Q,
censor = 99,
pci = c(90, 270),
censor.states = 1:7,
control = list(
fnscale = config$fnscale_msm,
maxit = config$maxiter_msm,
trace = 2
)
)
# get fitted values at censored (prediction) states
fitted <- viterbi.msm(fit)
# store posteriors
df_posteriors <- tibble(
gupi = fitted$subject,
t = fitted$time
) %>%
cbind(
fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
) %>%
mutate("2" = 0) %>%
filter(t == config$t_out_msm) %>%
gather(GOSE, probability, as.character(1:8)) %>%
arrange(gupi, t, GOSE)
saveRDS(df_posteriors, outputfile)
This diff is collapsed.
---
title: "Extract and prepare data"
statistician: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
collaborator: "David Menon (dkm13@cam.ac.uk)"
output: reportr::report
date: "`r Sys.time()`"
params:
datapath: "../data/v1.1"
max_lab_days: 3
seed: 42
---
```{r setup, include=FALSE}
library(tidyverse)
library(centertbi)
library(describr)
library(ggalluvial)
set.seed(params$seed)
df_labs <- readRDS(sprintf('%s/df_labs.rds', params$datapath))
df_baseline <- readRDS(sprintf('%s/df_baseline.rds', params$datapath))
df_ctmri <- readRDS(sprintf('%s/df_ctmri.rds', params$datapath))
df_imaging <- readRDS(sprintf('%s/df_imaging.rds', params$datapath))
df_gose <- readRDS(sprintf('%s/df_gose.rds', params$datapath))
```
# Extract data
```{r}
df_deaths <- df_baseline %>%
transmute(
gupi,
days = difftime(
Subject.DeathDate %>% as.character %>% ifelse(. == "", NA, .) %>% as.Date,
"1970-01-01",
units = "days"
) %>% as.numeric
) %>%
filter(complete.cases(.))
df_baseline <- df_baseline %>%
select(-Subject.DeathDate) %>%
mutate(
# combine suspect and definite to "yes"
InjuryHx.EDComplEventHypoxia = factor(InjuryHx.EDComplEventHypoxia,
levels = 0:2,
labels = c("no", "yes", "yes")
),
# combine suspect and definite to "yes"
InjuryHx.EDComplEventHypotension = factor(InjuryHx.EDComplEventHypotension,
levels = 0:2,
labels = c("no", "yes", "yes")
)
) %>%
# add lab values
left_join(
df_labs %>%
distinct %>% # filter duplicates
mutate(Labs.DLDate = as.Date(Labs.DLDate)) %>%
filter(
difftime(Labs.DLDate, "1970-01-01", units = "days") %>% as.numeric <= params$max_lab_days
) %>%
group_by(gupi) %>%
arrange(gupi, Labs.DLDate) %>%
summarise(
Labs.DLHemoglobingdL = na.omit(Labs.DLHemoglobingdL)[1],
Labs.DLGlucosemmolL = na.omit(Labs.DLGlucosemmolL)[1]
),
by = "gupi"
) %>%
# add imaging data
left_join(
df_imaging %>%
distinct() %>%
mutate(
Imaging.MarshallCTClassification = factor(
Imaging.MarshallCTClassification, levels = 1:6
),
Imaging.ExperimentDate = Imaging.ExperimentDate %>%
as.character %>% ifelse(. == "", NA, .) %>% as.Date
) %>%
arrange(gupi, Imaging.ExperimentDate) %>%
group_by(gupi) %>%
summarize(
Imaging.MarshallCTClassification = na.omit(Imaging.MarshallCTClassification)[1]
),
by = "gupi"
) %>%
# add CTMRI data
left_join(
df_ctmri %>%
distinct() %>%
mutate(
CTMRI.CTMRIDate = CTMRI.CTMRIDate %>%
as.character %>% ifelse(. == "", NA, .) %>% as.Date
) %>%
arrange(gupi, CTMRI.CTMRIDate) %>%
group_by(gupi) %>%
summarize(
CTMRI.CTSubarachnoidHem = na.omit(CTMRI.CTSubarachnoidHem)[1],
CTMRI.CTExtraduralHema = na.omit(CTMRI.CTExtraduralHema)[1],
) %>%
mutate(
CTMRI.CTSubarachnoidHem = factor(CTMRI.CTSubarachnoidHem, levels = 0:3, labels = c("no", "yes", "yes", "yes")),
CTMRI.CTExtraduralHema = factor(CTMRI.CTExtraduralHema, levels = 0:2, labels = c("no", "yes", "yes"))
),
by = "gupi"
)
```
## GOSE data
```{r gose-outcomes-ambiguity}
df_gose <- df_gose %>%
distinct %>%
filter(complete.cases(.)) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
group_by(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
mutate(Outcomes.DerivedCompositeGOSE = factor(Outcomes.DerivedCompositeGOSE, levels = 1:8))
```
This results in `r nrow(df_gose)` GOSE measurements of
`r df_gose %>% group_by(gupi) %>% n_groups()` individuals.
# Compile final datasets
* exclude all patient who do not survive first 6 months (no need to impute)
* exclude all patients with no GOSE measurement (no imputation)
```{r}
early_deaths_gupi <- df_deaths %>%
filter(days <= 180 - 14) %>%
.[["gupi"]]
df_gose <- df_gose %>%
filter(!(gupi %in% early_deaths_gupi)) %>%
ungroup %>%
# add confirmed deaths
rbind(
df_deaths %>%
filter(days > 180) %>%
rename(
Outcomes.DerivedCompositeGOSEDaysPostInjury = days
) %>%
mutate(
Outcomes.DerivedCompositeGOSE = 1
)
) %>%
# remove observations after death
group_by(gupi) %>%
arrange(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury) %>%
filter(
{
first_death <- which(Outcomes.DerivedCompositeGOSE == 1)[1]
(is.na(first_death) | n() <= first_death)
}
) %>%
ungroup()
df_baseline <- df_baseline %>%
filter(
gupi %in% df_gose$gupi
)
```
This results in `r nrow(df_gose)` GOSE measurements of
`r df_gose %>% group_by(gupi) %>% n_groups()` individuals.
## Plausibility check
The only genuinly numerical variables are Age, Glucose_mmolL, and Hb_dL.
All other variables are factors and may therefore not contain outliers.
```{r}
df_baseline %>%
select(Subject.Age, Labs.DLGlucosemmolL, Labs.DLHemoglobingdL) %>%
gather(Variable, value) %>%
ggplot(aes(value)) +
geom_histogram() +
facet_wrap(~Variable, scales = "free") +
theme_bw() +
theme(panel.grid = element_blank())
```
Glucose is obviously left-skewed and a log transfrom might improve fits in linear
models.
```{r, echo=TRUE}
df_baseline %>%
select(gupi, Labs.DLHemoglobingdL) %>%
filter(Labs.DLHemoglobingdL > 50)
```
All values above 50 are considered implausible and set to missing
(probably meant as missing).
```{r}
df_baseline <- df_baseline %>%
mutate(
Labs.DLHemoglobingdL = ifelse(Labs.DLHemoglobingdL > 50, NA_real_, Labs.DLHemoglobingdL),
Labs.DLGlucosemmolL_log = log(Labs.DLGlucosemmolL)
) %>%
select(-Labs.DLGlucosemmolL)
```
```{r save}
saveRDS(df_gose, "df_gose.rds")
saveRDS(df_baseline, "df_baseline.rds")
```
# Describe data
## Baseline covariates
```{r, fig.height=8, fig.width=3.25, out.width=".45\\textwidth", warning=FALSE, message=FALSE}
thm <- theme_default(6)
thm$body$descriptor$style$plot_cell$plot_height <- unit(.1, "in")
thm$colwidths$levels <- unit(1, "in")
thm$colwidths$variables <- unit(1.15, "in")
df_baseline %>%
select(-gupi) %>%
describr(
theme_new = thm,
totals = TRUE
) %>%
describe_if(
is.numeric,
with = list(
dscr_mean_sd(),
dscr_median_q1_q3(),
dscr_min_max(),
dscr_histogram()
)
) %>%
describe_if(
is.factor,
with = list(dscr_freq(), dscr_factor_barchart())
) %>%
fit_to_height(height = 8)
```
## GOSE over time
GOSE times are rounded to the nearest 3 months period (averaged/rounded if multiple
values present).
```{r alluvial-data}
df_gose %>%
# compute GOSE per quarter
transmute(
gupi,
quarter = round(Outcomes.DerivedCompositeGOSEDaysPostInjury/90),
Outcomes.DerivedCompositeGOSE = as.numeric(as.character(Outcomes.DerivedCompositeGOSE))
) %>%
group_by(gupi, quarter) %>%
summarise(
GOSE = round(mean(Outcomes.DerivedCompositeGOSE)) %>% ifelse(. == 2, 1, .)
) %>%
ungroup() %>%
filter(
quarter %in% c(1, 2, 4)
) %>%
spread(quarter, GOSE) %>%
gather(quarter, GOSE, `1`, `2`, `4`) %>%
mutate(
GOSE = factor(GOSE, levels = 8:1) %>% forcats::fct_explicit_na(),
Timepoint = factor(quarter, levels = c("1", "2", "4"), labels = c("3 months", "6 months", "12 months"))
) ->
df_alluvial
```
```{r plot-alluvial-gradient, fig.height=4.5}
df_alluvial %>%
ggplot(aes(x = Timepoint, stratum = GOSE, alluvium = gupi, label = GOSE, fill = GOSE)) +
geom_flow(aes.flow = "backward") +
geom_stratum() +
scale_fill_manual(
values = c(viridisLite::viridis(7, direction = -1), "grey"),
breaks = c(8:3, 1, "(Missing)")
) +
theme_bw() +
theme(
panel.grid = element_blank()
)
ggsave("gose_alluvial_gradient_coloring.pdf", height = 5, width = 8)
ggsave("gose_alluvial_gradient_coloring.png", height = 5, width = 8)
```
```{r plot-alluvial-differential, fig.height=4.5}
df_alluvial %>%
ggplot(aes(x = Timepoint, stratum = GOSE, alluvium = gupi, label = GOSE, fill = GOSE)) +
geom_flow(aes.flow = "backward") +
geom_stratum() +
scale_fill_manual(
values = c(RColorBrewer::brewer.pal(7, "Set3"), "grey"),
breaks = c(8:3, 1, "(Missing)")
) +
theme_bw() +
theme(
panel.grid = element_blank()
)
ggsave("gose_alluvial_differential_coloring.pdf", height = 5, width = 8)
ggsave("gose_alluvial_differential_coloring.png", height = 5, width = 8)
```
```{r zip-figures}
system("zip figures.zip *.png *.pdf")
system("rm *.png *.pdf")
```
# Session Info
```{r session-info}
sessionInfo()
```
@article{msm2011,
author = {Jackson, Christopher H.},
doi = {10.18637/jss.v038.i08},
journal = {Journal of Statistical Software},
month = {jan},
number = {8},
pages = {1--28},
title = {{Multi-State Models for Panel Data: The msm Package for R}},
volume = {38},
year = {2011}
}
@article{r2016,
archivePrefix = {arXiv},
arxivId = {arXiv:1011.1669v3},
author = {Team, R Development Core and {R Development Core Team}, R},
doi = {10.1007/978-3-540-74686-7},
eprint = {arXiv:1011.1669v3},
isbn = {3{\_}900051{\_}00{\_}3},
issn = {3-900051-07-0},
journal = {R Foundation for Statistical Computing},
pmid = {16106260},
title = {{R: A Language and Environment for Statistical Computing}},
year = {2016}
}
@book{rasmussen2006,
abstract = {Gaussian processes (GPs) are natural generalisations of multivariate Gaussian random variables to infinite (countably or continuous) index sets. GPs have been applied in a large number of fields to a diverse range of ends, and very many deep theoretical analyses of various properties are available. This paper gives an introduction to Gaussian processes on a fairly elementary level with special emphasis on characteristics relevant in machine learning. It draws explicit connections to branches such as spline smoothing models and support vector machines in which similar ideas have been investigated. Gaussian process models are routinely used to solve hard machine learning problems. They are attractive because of their flexible non-parametric nature and computational simplicity. Treated within a Bayesian framework, very powerful statistical methods can be implemented which offer valid estimates of uncertainties in our predictions and generic model selection procedures cast as nonlinear optimization problems. Their main drawback of heavy computational scaling has recently been alleviated by the introduction of generic sparse approximations.13,78,31 The mathematical literature on GPs is large and often uses deep concepts which are not required to fully understand most machine learning applications. In this tutorial paper, we aim to present characteristics of GPs relevant to machine learning and to show up precise connections to other "kernel machines" popular in the community. Our focus is on a simple presentation, but references to more detailed sources are provided.},
archivePrefix = {arXiv},
arxivId = {026218253X},
author = {Rasmussen, Carl Edward and Williams, Christopher K. I.},
booktitle = {International Journal of Neural Systems},
doi = {10.1142/S0129065704001899},
eprint = {026218253X},
isbn = {026218253X},
issn = {0129-0657},
pmid = {15112367},
title = {{Gaussian Processes for Machine Learning}},
year = {2006}
}
@article{stan2017,
author = {Carpenter, B and Gelman, A and Hoffman, MD and Lee, D},
journal = {Journal of statistical software},
number = {1},
title = {{Stan: A Probabilistic Programming Language}},
volume = {76},
year = {2017}
}
@book{Agresti2003,
author = {Agresti, Alan},
editor = {{John Wiley {\&} Sons}},
title = {{Categorical data analysis}},
year = {2003}
}
@article{brms2018,
author = {B{\"{u}}rkner, Paul},
journal = {The R Journal},
number = {1},
pages = {395--411},
title = {{Advanced Bayesian Multilevel Modeling with the R Package brms}},
volume = {10},
year = {2018}
}
@article{brms2017,
author = {B{\"{u}}rkner, Paul},
journal = {Journal Of Statistical Software},
number = {1},
pages = {1--28},
title = {{brms: An R Package for Bayesian Multilevel Models Using Stan}},
volume = {80},
year = {2017}
}
@article{Steyerberg2008,
author = {Steyerberg, Ewout W and Mushkudiani, Nino and Perel, Pablo and Butcher, Isabella and Lu, Juan and McHugh, Gillian S and Murray, Gordon D and Marmarou, Anthony and Roberts, Ian and Habbema, J. Dik F and Maas, Andrew I. R},
doi = {10.1371/journal.pmed.0050165},
editor = {Singer, Mervyn},
journal = {PLoS Medicine},
month = {aug},
number = {8},
pages = {e165},
title = {{Predicting Outcome after Traumatic Brain Injury: Development and International Validation of Prognostic Scores Based on Admission Characteristics}},
url = {https://dx.plos.org/10.1371/journal.pmed.0050165},
volume = {5},
year = {2008}
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
data_dir <- args[[1]]
m <- as.numeric(args[[2]])
df_gose <- readRDS(sprintf("%s/df_gose.rds", data_dir))
for (i in 1:m) {
df_baseline <- readRDS(sprintf("%s/mi_baseline/df_baseline_mi_%i.rds", data_dir, i))
df <- df_gose %>%
left_join(df_baseline, by = "gupi")
saveRDS(df, sprintf("%s/imputation/df_mi_%i.rds", data_dir, i))
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
data_dir <- args[[1]]
m <- as.numeric(args[[2]])
k <- as.numeric(args[[3]])
seed <- as.integer(args[[4]])
set.seed(seed)
df_gose <- readRDS(sprintf("%s/df_gose.rds", data_dir))
# get ids of all subjects with a valid 6mo GOSE
test_candidate_ids <- df_gose %>%
group_by(gupi) %>%
filter(
# at least one non-death observation close to 6 months
any(
(Outcomes.DerivedCompositeGOSEDaysPostInjury >= 180 - 14) &
(Outcomes.DerivedCompositeGOSE != 1) &
(Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180 + 14)
),
# at least one observation not close to 6 months
any(
(Outcomes.DerivedCompositeGOSEDaysPostInjury < 180 - 14) |
(Outcomes.DerivedCompositeGOSEDaysPostInjury > 180 + 14)
)
) %>%
.[["gupi"]] %>%
unique %>%
as.character
df_validation <- tibble(
gupi = df_gose$gupi %>% unique,
valid_test = gupi %in% test_candidate_ids
) %>%
group_by(
valid_test
) %>%
mutate(
fold = sample(rep(1:k, ceiling(n()/k)))[1:n()]
) %>%
ungroup
# saveRDS(df_validation, "df_validation_splits.rds")
for (i in 1:m) {
df_baseline <- readRDS(sprintf("%s/mi_baseline/df_baseline_mi_%i.rds", data_dir, i))
for (j in 1:k) {
df_test_rows <- df_gose %>%
left_join(df_baseline, by = "gupi") %>%
left_join(df_validation, by = "gupi") %>%
filter(fold == j & valid_test) %>%
group_by(gupi) %>%
filter(
Outcomes.DerivedCompositeGOSEDaysPostInjury ==
Outcomes.DerivedCompositeGOSEDaysPostInjury[which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))]) %>%
ungroup() %>%
select(
gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
mutate(test = TRUE) %>%
right_join(
df_gose %>% select(gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury),
by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")
) %>%
mutate(test = ifelse(is.na(test), FALSE, test))
df_test <- df_gose %>%
left_join(df_test_rows, by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")) %>%
filter(test) %>%
select(-test) %>%
left_join(df_baseline, by = "gupi")
df_train <- df_gose %>%
left_join(df_test_rows, by = c("gupi", "Outcomes.DerivedCompositeGOSEDaysPostInjury")) %>%
filter(!test) %>%
select(-test) %>%
left_join(df_baseline, by = "gupi")
# plausi checks
if (nrow(df_test) + nrow(df_train) != nrow(df_gose)) stop("test/train does not add up")
if (any(df_test$Outcomes.DerivedCompositeGOSE == 1)) stop("test cases cannot be GOSE 1")
saveRDS(df_test, sprintf("%s/validation/df_test_mi_%i_fold_%i.rds", data_dir, i, j))
saveRDS(df_train, sprintf("%s/validation/df_train_mi_%i_fold_%i.rds", data_dir, i, j))
}
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
in_folder <- args[[1]]
out_folder <- args[[2]]
in_files <- sprintf("%s/%s", in_folder, list.files(path = in_folder, pattern = "*.csv"))
if (length(in_files) != 5) stop("must have exactly 5 input files")
for (f in in_files) {
tmp <- read.csv(f)
if ("Subject.Age" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_baseline.rds", out_folder))
if ("Outcomes.DerivedCompositeGOSE" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_gose.rds", out_folder))
if ("Labs.DLDate" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_labs.rds", out_folder))
if ("Imaging.MarshallCTClassification" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_imaging.rds", out_folder))
if ("CTMRI.CTSubarachnoidHem" %in% names(tmp))
saveRDS(as_tibble(tmp), file = sprintf("%s/df_ctmri.rds", out_folder))
}
library(tidyverse)
library(mice)
args <- commandArgs(trailingOnly = TRUE)
df_baseline <- readRDS(args[[1]])
out_dir <- args[[2]]
m <- as.numeric(args[[3]])
maxiter <- as.numeric(args[[4]])
seed <- as.numeric(args[[5]])
mi_baseline <- df_baseline %>%
select(-gupi) %>%
mutate_if(is.character, factor) %>%
mice::mice(
m = m,
seed = seed,
maxit = maxiter
)
# save the individual complete data sets
for (i in 1:m) {
mice::complete(mi_baseline, action = i, include = FALSE) %>%
mutate_if(is.factor, as.character) %>%
as_tibble() %>%
mutate(
gupi = df_baseline$gupi
) %>%
select(gupi, everything()) %>%
saveRDS(file = sprintf("%s/df_baseline_mi_%i.rds", out_dir, i))
}
library(tidyverse)
args <- commandArgs(trailingOnly = TRUE)
modelimputations_dir <- args[[1]]
gosefile <- args[[2]]
outputfile <- args[[3]]
set.seed(42)
df_gose <- readRDS(gosefile)
posteriors <- list.files(modelimputations_dir, pattern = "*.rds")
# load and combine all posteriros over different (baseline) imputations
df_imputations <- c()
for (i in 1:length(posteriors)) {
df_imputations <- rbind(
df_imputations,
readRDS(sprintf("%s/%s", modelimputations_dir, posteriors[i])) %>%
mutate(
imp = i
)
)
}
df_imputations <- df_imputations %>%
group_by(gupi, t, GOSE) %>%
summarize(probability = mean(probability, na.rm = TRUE)) %>%
ungroup
df_imputations <- df_imputations %>%
select(-t) %>%
filter(complete.cases(.)) %>%
group_by(gupi) %>%
mutate(
Subject.DerivedImputed180DaysGOSE = which.max(probability) %>%
{
if (length(.) == 0) { # fix all probs. NA case
return(NA)
} else {
return(.)
}
} %>%
sample(1) %>% # need to resolve ambiguous gose predictions !
rep(n()),
GOSE = factor(
GOSE,
levels = 1:8,
labels = sprintf("Subject.DerivedImputed180DaysGOSE_%i_probability", 1:8)
)
) %>%
ungroup %>%
spread(GOSE, probability)
# extract per protocol gose observations
df_per_protocol_gose <- df_gose %>%
filter(
Outcomes.DerivedCompositeGOSEDaysPostInjury >= 5*30,
Outcomes.DerivedCompositeGOSEDaysPostInjury <= 8*30
) %>%
# pick closest to 180
group_by(gupi) %>%
summarize(
closest_per_protocol_GOSE = Outcomes.DerivedCompositeGOSE[
which.min(abs(Outcomes.DerivedCompositeGOSEDaysPostInjury - 180))
]
)
df_imputations <- df_imputations %>%
left_join(df_per_protocol_gose, by = "gupi") %>%
mutate(
Subject.GOSE6monthEndpointDerived = ifelse(
is.na(closest_per_protocol_GOSE),
Subject.DerivedImputed180DaysGOSE,
closest_per_protocol_GOSE
)
) %>%
select(
gupi,
Subject.GOSE6monthEndpointDerived,
Subject.DerivedImputed180DaysGOSE,
everything()
) %>%
select(
-closest_per_protocol_GOSE
)
write_csv(df_imputations, outputfile)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment