Commit ac60926f authored by Kevin's avatar Kevin

initial commit

parents
data
output
.snakemake
.Rproj.user
container.sif
*.Rproj
*.Rhistory
MIT License
Copyright (c) 2019 Kevin Kunzmann
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
singularity: "container.sif"
configfile: "config/snakemake.yml"
rule download:
input:
config = "config/snakemake.yml",
links = "config/tbl_download_links.csv",
script = "scripts/download-data-table.R"
output:
csv = expand("{datapath}/tbl_gose_imputation.csv",
datapath = config["datapath"])
shell:
"""
mkdir -p {config[datapath]}
Rscript scripts/download-data-table.R $NEUROBOT_USR $NEUROBOT_API {config[neurobot_staging_version]} gose_imputation
mv tbl_gose_imputation.csv {output.csv}
"""
rule check_inputs:
input:
csv = expand("{datapath}/tbl_gose_imputation.csv",
datapath = config["datapath"]
),
script = "scripts/check-inputs.R"
shell:
"""
Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
"""
rule impute:
input:
config = "config/snakemake.yml",
csv = expand("{datapath}/tbl_gose_imputation.csv",
datapath = config["datapath"]
),
rmd = "imputation-report.Rmd"
output:
html = "output/imputation-report.html",
csv = "output/tbl_imputed_gose.csv"
shell:
"""
Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
mkdir -p output
Rscript -e "rmarkdown::render(\\"{input.rmd}\\")"
mv tbl_imputed_gose.csv {output.csv}
mv imputation-report.html {output.html}
"""
neurobot_staging_version: "1.2"
datapath: "data"
seed: 42
age_range: [0.0, 999.0] # years
per_protocol_window: [150.0, 240] # 5-8 months post injury, in days
version,tbl_name,download_link
1.2,gose_imputation,_5cb46e9a52dc3879e3b7cc40
Bootstrap: docker
From: rocker/verse:latest
%labels
Maintainer Kevin Kunzmann kevin.kunzmann@mrc-bsu.cam.ac.uk
%help
CENTER-TBI 6 months GOSe outcome imputation,
cf. https://git.center-tbi.eu/kunzmann/gose-6mo-imputation for details.
%post
apt-get update
apt-get -y install curl python3-pip
pip3 install snakemake
R -e "install.packages('msm')"
---
title: "Extract and prepare data"
date: "`r Sys.time()`"
author: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
output: html_document
params:
config: "config/snakemake.yml"
---
```{r setup}
options(tidyverse.quiet = TRUE)
library(tidyverse)
config <- yaml::read_yaml(params$config)
print(config)
knitr::opts_chunk$set(
echo = TRUE
)
set.seed(config$seed)
```
```{r load-data}
tbl_combined <- readr::read_csv(
sprintf('%s/tbl_gose_imputation.csv', config$datapath),
col_types = "cinncD"
) %>%
filter(
# throw out all rows with missing GOSE
!is.na(Outcomes.DerivedCompositeGOSE),
# throw out all observations without a precise timestamp
!is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury)
) %>%
mutate(
Subject.PatientType = case_when(
Subject.PatientType == "1" ~ "ER",
Subject.PatientType == "2" ~ "Admission",
Subject.PatientType == "3" ~ "ICU"
)
)
# add death date as GOSE 1
tbl_combined <- tbl_combined %>%
mutate(
death_days = as.numeric(difftime(Subject.DeathDate, "1970-01-01", units = "days"))
) %>%
group_by(gupi) %>%
summarize_all(
first
) %>%
ungroup() %>%
filter(
!is.na(death_days),
death_days > 180
) %>%
mutate(
Outcomes.DerivedCompositeGOSEDaysPostInjury = death_days,
Outcomes.DerivedCompositeGOSE = 1L
) %>%
select(-death_days) %>%
rbind(tbl_combined) %>%
arrange(
gupi,
Outcomes.DerivedCompositeGOSEDaysPostInjury
)
# make sure GOSE is unique at each observation time
tbl_combined <- tbl_combined %>%
group_by(
gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
summarize_all(
first
) %>%
ungroup
# filter all individuals with GOSE = 1 before 180 days post injury
# (no imputation required!)
tbl_combined <- tbl_combined %>%
group_by(gupi) %>%
filter(
# all GOSEs before 180 dayspost injury must be > 1 (dead)
all(Outcomes.DerivedCompositeGOSE[Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180] > 1),
# ensure age-bracket
Subject.Age >= config$age_range[1],
Subject.Age <= config$age_range[2],
# censor at first GOSE == 1
all(Outcomes.DerivedCompositeGOSE > 1) | (row_number() <= which(Outcomes.DerivedCompositeGOSE == 1)[1])
) %>%
ungroup
```
```{r}
tbl_combined$Outcomes.DerivedCompositeGOSE %>% table
```
# Impute
```{r impute}
tbl_gose_msm <- tbl_combined %>%
select(
gupi,
days_post_injury = Outcomes.DerivedCompositeGOSEDaysPostInjury,
gose = Outcomes.DerivedCompositeGOSE
) %>%
rbind(
# add censored states at 180.1 days to make prediction for
tibble(
gupi = unique(.$gupi),
# needed to offset to get a prediction in all cases
days_post_injury = 180 + .1,
gose = 99
)
) %>%
mutate(
# all deaths are excatly observed time - tell MSM!
observation_type = ifelse(gose == 1, 3, 1),
# recode levels to 1:7, 99 (needed for MSM!)
gose = factor(gose,
levels = c(1, 3:8, 99),
labels = c(1:7, 99)
) %>% as.character %>% as.numeric
) %>%
arrange(
gupi,
days_post_injury
)
# define transition matrix
Q <- matrix(0, nrow = 7, ncol = 7)
for (i in 1:7) {
for (j in 1:7) {
if (i == j - 1 & i != 1) {
Q[i, j] <- 1
}
if (i == j + 1) {
Q[i, j] <- 1
}
}
}
Q[2:7, 1] <- 1 # allow instantaneous deaths from all states
fit <- msm::msm(
gose ~ days_post_injury,
subject = gupi,
data = tbl_gose_msm,
obstype = observation_type,
gen.inits = TRUE,
qmatrix = Q,
censor = 99,
pci = c(90, 270),
censor.states = 2:7, # cannot be dead since these were filtered previously
control = list(
fnscale = 12000,
maxit = 10^4,
trace = 2
)
)
```
```{r}
# get fitted values at censored (prediction) states
fitted <- msm::viterbi.msm(fit) # viterbi does not respect the fact that
#states cannot be 1 at 180 days -> need to adjust
# store posteriors
tbl_posteriors <- tibble(
gupi = fitted$subject %>% as.character,
t = fitted$time
) %>%
cbind(
fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
) %>%
rename(
`2_or_3` = `3`
) %>%
filter(
t == 180.1
) %>%
select(
gupi, `1`, `2_or_3`, `4`, `5`, `6`, `7`, `8`
) %>%
pivot_longer(`1`:`8`, names_to = "gose", values_to = "probability") %>%
group_by(gupi ) %>%
mutate(
probability = ifelse(gose == "1", 0.0, probability),
probability = probability / sum(probability),
gose = sprintf("gose_%s_probability", gose)
) %>%
ungroup %>%
pivot_wider(names_from = gose, values_from = probability)
```
```{r final-imputations}
tbl_final_imputations <- readr::read_csv(
sprintf('%s/tbl_gose_imputation.csv', config$datapath),
col_types = "cinncD"
) %>%
select(gupi, Subject.DeathDate) %>%
distinct %>%
transmute(
gupi = gupi,
death_days = difftime(Subject.DeathDate, "1970-01-01", unit = "days") %>% as.numeric
) %>%
left_join(tbl_posteriors, by = "gupi") %>%
pivot_longer(
names_to = "gose",
values_to = "probability",
gose_1_probability:gose_8_probability
) %>%
mutate(
probability = ifelse(
is.na(death_days),
probability,
ifelse(gose != "gose_1_probability", 0.0, 1.0)
)
) %>%
group_by(gupi) %>%
mutate(
gose_map_imputed = ifelse(all(is.na(probability)),
NA,
names(tbl_posteriors %>% select(-gupi))[which.max(probability)]
)
) %>%
ungroup() %>%
select(-death_days) %>%
pivot_wider(names_from = "gose", values_from = "probability") %>%
mutate(
gose_map_imputed = stringr::str_extract(
gose_map_imputed,
"[0-9]{1}(_or_3){0,1}"
)
) %>%
left_join(
# add new column giving priority to per-protocol gose
readr::read_csv(
sprintf('%s/tbl_gose_imputation.csv', config$datapath),
col_types = "cinncD"
) %>%
transmute(
gupi = gupi,
gose = Outcomes.DerivedCompositeGOSE,
days = Outcomes.DerivedCompositeGOSEDaysPostInjury
) %>%
filter(
complete.cases(.),
days >= config$per_protocol_window[1],
days <= config$per_protocol_window[2]
) %>%
group_by(gupi) %>%
summarize(
gose = gose[which.min(abs(days - 180))],
days = days[which.min(abs(days - 180))]
) %>%
transmute(
gupi,
gose_per_protocol = ifelse(
gose %in% c(2, 3),
"2_or_3",
as.character(gose)
)
),
by = "gupi"
) %>%
mutate(
gose_map_per_protocol_combined = ifelse(
is.na(gose_per_protocol),
gose_map_imputed,
gose_per_protocol
),
# gose_map_imputed is only 1 when we know that the individual died before
# 6 months, i.e. in these cases the per-protocol gose shoul also be 1
gose_per_protocol = ifelse(gose_map_imputed == "1", "1", gose_per_protocol)
) %>%
# reorder
select(
gupi,
gose_per_protocol,
gose_map_imputed,
gose_map_per_protocol_combined,
everything()
)
```
```{r plausi-check}
tbl_final_imputations %>%
select(
gose_map_imputed,
gose_per_protocol
) %>%
filter(
complete.cases(.)
) %>%
mutate_all(
~factor(.x, levels = c("1", "2_or_3", as.character(4:8)))
) %>%
{caret::confusionMatrix(
data = .$gose_map_imputed,
reference = .$gose_per_protocol
)$table} %>%
broom::tidy() %>%
rename(
`GOSe, MAP imputed` = Prediction,
`GOSe, closest per-protocol` = Reference
) %>%
ggplot(aes(y = `GOSe, MAP imputed`, x = `GOSe, closest per-protocol`)) +
geom_raster(aes(fill = n)) +
geom_text(aes(label = n)) +
coord_fixed(expand = FALSE) +
scale_fill_continuous(low = "#FFFFFF", high = "#555555") +
theme_bw() +
ggtitle("Confusion matrix: imputed vs. per-protocol")
```
# Save
```{r save-imputed-values}
tbl_final_imputations %>% {
tmp <- .
names(tmp) <- gsub("gose", "Outcomes.DerivedGOSE", x = names(.))
tmp } %>%
write_csv("tbl_imputed_gose.csv")
```
# Session Info
```{r session-info}
sessionInfo()
```
#!/usr/bin/env Rscript
library(readr)
library(dplyr, warn.conflicts = FALSE)
options(warn = 2) # warnings are errors
args <- commandArgs(trailingOnly = TRUE)
input_csv <- args[[1]]
varnames <- c(
"gupi",
"Outcomes.DerivedCompositeGOSE",
"Outcomes.DerivedCompositeGOSEDaysPostInjury",
"Subject.Age",
"Subject.PatientType",
"Subject.DeathDate"
)
colnames <- readr::read_csv(input_csv, n_max = 1) %>% names
if (!all(colnames == varnames))
stop("colnames do not match varnames, check order!")
tbl_gose_imputation <- readr::read_csv(input_csv, col_types = "cinncD")
#!/usr/bin/env Rscript
library(readr)
library(dplyr, warn.conflicts = FALSE)
library(httr)
args <- commandArgs(trailingOnly = TRUE)
user <- args[[1]]
token <- args[[2]]
version <- args[[3]]
tbl_name <- args[[4]]
# load download links
tbl_download_links <- read_csv(
"config/tbl_download_links.csv",
col_types = c(
col_character(),
col_character(),
col_character()
)
)
# check uniqueness
tbl_download_links %>%
group_by(version, tbl_name) %>%
{
if (n_groups(.) < nrow(.))
stop("download links not unique, check file")
}
# extract link for query
tbl_download_link <- tbl_download_links %>%
filter(
version == get("version", envir = .GlobalEnv),
tbl_name == get("tbl_name", envir = .GlobalEnv)
) %>%
pull(download_link)
if (length(tbl_download_link) == 0)
stop("no neurobot download link for table 'tbl_%s.csv' version '%s' found", tbl_name, version)
# download csv file via neurobot api call
res <- GET(
sprintf("https://neurobot-stage.incf.org/api/data/%s.csv", tbl_download_link),
authenticate(user, token, type = "digest")
)
# check for success and save
if (res$status_code != 200)
stop(sprintf("error downloading tbl_%s.csv, http status code: %s", tbl_name, res$status_code))
writeBin(content(res, "raw"), sprintf("tbl_%s.csv", tbl_name))
#!/bin/bash
set -e
wget https://zenodo.org/record/2600385/files/container.sif
checksum=($(md5sum container.sif))
if [ $checksum != 7db125c9c83621d78981e558546b1e88 ]; then
echo md5 mismatch!
exit 1
fi
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment