Commit 69abd6dc authored by Kevin Kunzmann

Got rid of automatic download - database is not stable; switched to renv...

Got rid of automatic download - the database is not stable; switched to renv instead of containers; reconciled all remaining code in a single .Rmd; fixed minor data cleaning issues related to the exclusion of individuals dying pre-6 months and the exclusion of observations past 18 months.
parent 85cc70c8
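In practice, the switch from containers to renv means the project library is restored from the committed lockfile rather than pulled from a container image. A minimal sketch of the setup, assuming an renv.lock file at the repository root alongside the renv/activate.R sourced below:

```r
# restore the locked package versions into the project library (sketch;
# assumes renv.lock is committed at the repository root)
install.packages("renv")
renv::restore(clean = TRUE, prompt = FALSE)
```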
source("renv/activate.R")
@@ -5,3 +5,7 @@ output
 container.sif
 *.Rproj
 *.Rhistory
+*_cache
+*_files
+*.csv
singularity: "container.sif"

configfile: "config/snakemake.yml"

rule download:
    input:
        config = "config/snakemake.yml",
        links = "config/tbl_download_links.csv",
        script = "scripts/download-data-table.R"
    output:
        csv = expand("{datapath}/tbl_gose_imputation.csv",
            datapath = config["datapath"])
    shell:
        """
        mkdir -p {config[datapath]}
        Rscript scripts/download-data-table.R $NEUROBOT_USR $NEUROBOT_API {config[neurobot_staging_version]} gose_imputation
        mv tbl_gose_imputation.csv {output.csv}
        """

rule check_inputs:
    input:
        csv = expand("{datapath}/tbl_gose_imputation.csv",
            datapath = config["datapath"]
        ),
        script = "scripts/check-inputs.R"
    shell:
        """
        Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
        """

rule impute:
    input:
        config = "config/snakemake.yml",
        csv = expand("{datapath}/tbl_gose_imputation.csv",
            datapath = config["datapath"]
        ),
        rmd = "imputation-report.Rmd"
    output:
        html = "output/imputation-report.html",
        csv = "output/tbl_imputed_gose.csv"
    shell:
        """
        Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
        mkdir -p output
        Rscript -e "rmarkdown::render(\\"{input.rmd}\\")"
        mv tbl_imputed_gose.csv {output.csv}
        mv imputation-report.html {output.html}
        """
neurobot_staging_version: "1.2"
datapath: "data"
seed: 42
age_range: [0.0, 999.0] # years
per_protocol_window: [150.0, 240] # 5-8 months post injury, in days
version,tbl_name,download_link
1.2,gose_imputation,_5cd57b826b34316c8c728416
Bootstrap: docker
From: rocker/verse:latest
%labels
Maintainer Kevin Kunzmann kevin.kunzmann@mrc-bsu.cam.ac.uk
%help
CENTER-TBI 6 months GOSe outcome imputation,
cf. https://git.center-tbi.eu/kunzmann/gose-6mo-imputation for details.
%post
apt-get update
apt-get -y install curl python3-pip
pip3 install snakemake
R -e "devtools::install_github('tidyverse/tidyr')"
R -e "install.packages('diagram')"
R -e "install.packages('caret')"
R -e "install.packages('e1071')"
R -e "install.packages('msm')"
 ---
-title: "Extract and prepare data"
+title: "Impute 6-months GOSe for CENTER-TBI"
 date: "`r Sys.time()`"
 author: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
-output: html_document
+output:
+  html_document:
+    code_folding: hide
 params:
-  config: "config/snakemake.yml"
+  seed: 42
+  age_range: !r c(0, Inf)
+  per_protocol_window: !r c(150.0, 240)
 ---
-```{r setup}
-options(tidyverse.quiet = TRUE)
-library(tidyverse)
-config <- yaml::read_yaml(params$config)
-print(config)
+The markdown file requires the recorded GOSe values in a .csv file
+named `tbl_gose.csv` in the same folder.
+The .csv file must contain the Neurobot variables
+```
+subjectId
+Outcomes.DerivedCompositeGOSE
+Outcomes.DerivedCompositeGOSEDaysPostInjury
+Subject.Age
+Subject.PatientType
+Subject.DeathDate
+```
+in exactly that order.
+At the time of writing this report, the newest staging data set could be
+accessed via the saved link `https://neurobot-stage.incf.org/_5f476d41eedb867816a10376`.
+The required R environment can be installed by
+```
+install.packages("renv")
+renv::restore(clean = TRUE, prompt = FALSE)
+```
+```{r setup}
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(msm)
+})
 knitr::opts_chunk$set(
-  echo = TRUE
+  # cache = TRUE, # uncomment this line to activate caching of the report
+  echo = TRUE
 )
-set.seed(config$seed)
+set.seed(params$seed)
+```
+```{r print-params}
+print(params)
 ```
 ```{r load-data}
-tbl_combined <- readr::read_csv(
-  sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+# check data
+varnames <- c(
+  "subjectId",
+  "Outcomes.DerivedCompositeGOSE",
+  "Outcomes.DerivedCompositeGOSEDaysPostInjury",
+  "Subject.Age",
+  "Subject.PatientType",
+  "Subject.DeathDate"
+)
+colnames <- readr::read_csv(
+  "tbl_gose.csv",
+  n_max = 1,
   col_types = "cinncD"
 ) %>%
+  names
+if (!all(colnames == varnames))
+  stop("colnames do not match varnames, check order!")
+tbl_combined <- readr::read_csv("tbl_gose.csv", col_types = "cinncD") %>%
+  rename(gupi = subjectId) %>%
   filter(
     # throw out all rows with missing GOSE
     !is.na(Outcomes.DerivedCompositeGOSE),
     # throw out all observations without a precise timestamp
-    !is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury)
+    !is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury),
+    # throw out all observations after 18 months
+    # (for 12 months imputation!)
+    Outcomes.DerivedCompositeGOSEDaysPostInjury < 18*30
   ) %>%
   mutate(
     Subject.PatientType = case_when(
@@ -56,8 +107,7 @@ tbl_combined <- tbl_combined %>%
   ) %>%
   ungroup() %>%
   filter(
-    !is.na(death_days),
-    death_days > 180
+    !is.na(death_days)
   ) %>%
   mutate(
     Outcomes.DerivedCompositeGOSEDaysPostInjury = death_days,
@@ -85,11 +135,11 @@ tbl_combined <- tbl_combined %>%
 tbl_combined <- tbl_combined %>%
   group_by(gupi) %>%
   filter(
-    # all GOSEs before 180 dayspost injury must be > 1 (dead)
+    # all GOSEs before 180 days post injury must be > 1 (dead)
     all(Outcomes.DerivedCompositeGOSE[Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180] > 1),
     # ensure age-bracket
-    Subject.Age >= config$age_range[1],
-    Subject.Age <= config$age_range[2],
+    Subject.Age >= params$age_range[1],
+    Subject.Age <= params$age_range[2],
     # censor at first GOSE == 1
     all(Outcomes.DerivedCompositeGOSE > 1) | (row_number() <= which(Outcomes.DerivedCompositeGOSE == 1)[1])
   ) %>%
@@ -106,7 +156,7 @@ tbl_combined$Outcomes.DerivedCompositeGOSE %>% table
 # Impute
-```{r impute}
+```{r impute, results='hide', warning=FALSE}
 tbl_gose_msm <- tbl_combined %>%
   select(
     gupi,
@@ -123,7 +173,7 @@ tbl_gose_msm <- tbl_combined %>%
     )
   ) %>%
   mutate(
-    # all deaths are excatly observed time - tell MSM!
+    # all death times are exactly observed - tell MSM!
     observation_type = ifelse(gose == 1, 3, 1),
     # recode levels to 1:7, 99 (needed for MSM!)
     gose = factor(gose,
@@ -162,7 +212,7 @@ fit <- msm::msm(
   censor.states = 2:7, # cannot be dead since these were filtered previously
   control = list(
     fnscale = 12000,
-    maxit = 10^4,
+    maxit = 10, # 10^4,
     trace = 2
   )
 )
@@ -204,9 +254,10 @@ tbl_posteriors <- tibble(
 ```{r final-imputations}
 # start by adding confirmed <= 180d deaths back in
 tbl_final_imputations <- readr::read_csv(
-  sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+  "tbl_gose.csv",
   col_types = "cinncD"
 ) %>%
+  rename(gupi = subjectId) %>%
   select(gupi, Subject.DeathDate) %>%
   distinct %>%
   transmute(
@@ -245,9 +296,10 @@ tbl_final_imputations <- readr::read_csv(
   left_join(
     # add new column giving priority to per-protocol gose
     readr::read_csv(
-      sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+      "tbl_gose.csv",
       col_types = "cinncD"
     ) %>%
+      rename(gupi = subjectId) %>%
       transmute(
         gupi = gupi,
         gose = Outcomes.DerivedCompositeGOSE,
@@ -255,13 +307,14 @@ tbl_final_imputations <- readr::read_csv(
       ) %>%
       filter(
         complete.cases(.),
-        days >= config$per_protocol_window[1],
-        days <= config$per_protocol_window[2]
+        days >= params$per_protocol_window[1],
+        days <= params$per_protocol_window[2]
       ) %>%
       group_by(gupi) %>%
       summarize(
         gose = gose[which.min(abs(days - 180))],
-        days = days[which.min(abs(days - 180))]
+        days = days[which.min(abs(days - 180))],
+        .groups = "drop"
       ) %>%
       transmute(
         gupi,
@@ -275,7 +328,7 @@ tbl_final_imputations <- readr::read_csv(
   ) %>%
   mutate(
     # gose_map_imputed is only 1 when we know that the individual died before
-    # 6 months, i.e. in these cases the per-protocol gose shoul also be 1
+    # 6 months, i.e. in these cases the per-protocol gose should also be 1
     gose_per_protocol = ifelse(gose_map_imputed == "1", "1", gose_per_protocol),
     gose_map_per_protocol_combined = ifelse(
       is.na(gose_per_protocol),
@@ -310,7 +363,7 @@ tbl_final_imputations %>%
     data = .$gose_map_imputed,
     reference = .$gose_per_protocol
   )$table} %>%
-  broom::tidy() %>%
+  as_tibble() %>%
   rename(
     `GOSe, MAP imputed` = Prediction,
     `GOSe, closest per-protocol` = Reference
@@ -324,10 +377,6 @@ tbl_final_imputations %>%
   ggtitle("Confusion matrix: imputed vs. per-protocol")
 ```
-# Save
 ```{r save-imputed-values}
 tbl_final_imputations %>% {
   tmp <- .
@@ -336,7 +385,7 @@ tbl_final_imputations %>% {
   rename(
     "Subject.GOSE6monthEndpointDerived" = "Subject.DerivedImputed180DaysGOSE_map_per_protocol_combined"
   ) %>%
-  write_csv("tbl_imputed_gose.csv")
+  write_csv("tbl_gose_imputed.csv")
 ```
@@ -344,5 +393,6 @@ tbl_final_imputations %>% {
 # Session Info
 ```{r session-info}
-sessionInfo()
+sessionInfo() %>%
+  knitr::kable()
 ```
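Since the automatic download was removed, the staging table referenced in the report has to be fetched by hand. A minimal sketch using httr with digest authentication, assuming the API pattern from scripts/download-data-table.R (further below) still applies to the saved link quoted in the report and that the credentials are available as environment variables:

```r
library(httr)

# hypothetical manual download of the staging table named in the report;
# "_5f476d41eedb867816a10376" is the saved link quoted above
res <- GET(
  "https://neurobot-stage.incf.org/api/data/_5f476d41eedb867816a10376.csv",
  authenticate(Sys.getenv("NEUROBOT_USR"), Sys.getenv("NEUROBOT_API"), type = "digest")
)
stopifnot(status_code(res) == 200)
writeBin(content(res, "raw"), "tbl_gose.csv")
```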
library/
python/
staging/
external.libraries:
ignored.packages:
package.dependency.fields: Imports, Depends, LinkingTo
snapshot.type: implicit
use.cache: TRUE
vcs.ignore.library: TRUE
#!/bin/bash
sudo singularity build container.sif container-recipe
#!/usr/bin/env Rscript
library(readr)
library(dplyr, warn.conflicts = FALSE)
options(warn = 2) # warnings are errors
args <- commandArgs(trailingOnly = TRUE)
input_csv <- args[[1]]
varnames <- c(
  "gupi",
  "Outcomes.DerivedCompositeGOSE",
  "Outcomes.DerivedCompositeGOSEDaysPostInjury",
  "Subject.Age",
  "Subject.PatientType",
  "Subject.DeathDate"
)
colnames <- readr::read_csv(
  input_csv,
  n_max = 1,
  col_types = "cinncD"
) %>%
  names
if (!all(colnames == varnames))
  stop("colnames do not match varnames, check order!")
#!/bin/bash
set -e
wget https://zenodo.org/record/2641161/files/container.sif
checksum=($(md5sum container.sif))
if [ "$checksum" != "4d7edd4ff6ba7c8ff7639c53e7c6daf9" ]; then
    echo "md5 mismatch!"
    exit 1
fi
#!/usr/bin/env Rscript
library(readr)
library(dplyr, warn.conflicts = FALSE)
library(httr)
args <- commandArgs(trailingOnly = TRUE)
user <- args[[1]]
token <- args[[2]]
version <- args[[3]]
tbl_name <- args[[4]]
# load download links
tbl_download_links <- read_csv(
  "config/tbl_download_links.csv",
  col_types = cols(
    col_character(),
    col_character(),
    col_character()
  )
)
# check uniqueness
tbl_download_links %>%
  group_by(version, tbl_name) %>%
  {
    if (n_groups(.) < nrow(.))
      stop("download links not unique, check file")
  }
# extract link for query
tbl_download_link <- tbl_download_links %>%
  filter(
    version == get("version", envir = .GlobalEnv),
    tbl_name == get("tbl_name", envir = .GlobalEnv)
  ) %>%
  pull(download_link)
if (length(tbl_download_link) == 0)
  stop(sprintf("no neurobot download link for table 'tbl_%s.csv' version '%s' found", tbl_name, version))
# download csv file via neurobot api call
res <- GET(
  sprintf("https://neurobot-stage.incf.org/api/data/%s.csv", tbl_download_link),
  authenticate(user, token, type = "digest")
)
# check for success and save
if (res$status_code != 200)
  stop(sprintf("error downloading tbl_%s.csv, http status code: %s", tbl_name, res$status_code))
writeBin(content(res, "raw"), sprintf("tbl_%s.csv", tbl_name))
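For completeness, the Snakemake download rule called this script directly; an equivalent manual invocation from R might look as follows (a sketch; NEUROBOT_USR and NEUROBOT_API are assumed to hold the API credentials):

```r
# fetch tbl_gose_imputation.csv for staging version 1.2, mirroring the
# Snakemake 'download' rule (writes the file into the working directory)
system2(
  "Rscript",
  c(
    "scripts/download-data-table.R",
    Sys.getenv("NEUROBOT_USR"),
    Sys.getenv("NEUROBOT_API"),
    "1.2",
    "gose_imputation"
  )
)
```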