initial commit

ac60926f · Kevin · ac60926f · ac60926f · ac60926f · ac60926f
Commit ac60926f authored Apr 15, 2019 by Kevin
10 changed files
--- a/.gitignore
+++ b/.gitignore
+data
+output
+.snakemake
+.Rproj.user
+container.sif
+*.Rproj
+*.Rhistory
--- a/LICENSE
+++ b/LICENSE
+MIT License
+Copyright (c) 2019 Kevin Kunzmann
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/Snakefile
+++ b/Snakefile
+singularity: "container.sif"
+configfile: "config/snakemake.yml"
+rule download:
+    input:
+        config = "config/snakemake.yml",
+        links  = "config/tbl_download_links.csv",
+        script = "scripts/download-data-table.R"
+    output:
+        csv    = expand("{datapath}/tbl_gose_imputation.csv",
+            datapath = config["datapath"])
+    shell:
+        """
+        mkdir -p {config[datapath]}
+        Rscript scripts/download-data-table.R $NEUROBOT_USR $NEUROBOT_API {config[neurobot_staging_version]} gose_imputation
+        mv tbl_gose_imputation.csv {output.csv}
+        """
+rule check_inputs:
+    input:
+        csv    = expand("{datapath}/tbl_gose_imputation.csv",
+            datapath = config["datapath"]
+        ),
+        script = "scripts/check-inputs.R"
+    shell:
+        """
+        Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
+        """
+rule impute:
+    input:
+        config = "config/snakemake.yml",
+        csv    = expand("{datapath}/tbl_gose_imputation.csv",
+            datapath = config["datapath"]
+        ),
+        rmd    = "imputation-report.Rmd"
+    output:
+        html   = "output/imputation-report.html",
+        csv    = "output/tbl_imputed_gose.csv"
+    shell:
+        """
+        Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
+        mkdir -p output
+        Rscript -e "rmarkdown::render(\\"{input.rmd}\\")"
+        mv tbl_imputed_gose.csv {output.csv}
+        mv imputation-report.html {output.html}
+        """
--- a/config/snakemake.yml
+++ b/config/snakemake.yml
+neurobot_staging_version: "1.2"
+datapath: "data"
+seed: 42
+age_range: [0.0, 999.0] # years
+per_protocol_window: [150.0, 240] # 5-8 months post injury, in days
--- a/config/tbl_download_links.csv
+++ b/config/tbl_download_links.csv
+version,tbl_name,download_link
+1.2,gose_imputation,_5cb46e9a52dc3879e3b7cc40
--- a/container-recipe
+++ b/container-recipe
+Bootstrap: docker
+From: rocker/verse:latest
+%labels
+    Maintainer Kevin Kunzmann kevin.kunzmann@mrc-bsu.cam.ac.uk
+%help
+    CENTER-TBI 6 months GOSe outcome imputation,
+    cf. https://git.center-tbi.eu/kunzmann/gose-6mo-imputation for details.
+%post
+    apt-get update
+    apt-get -y install curl python3-pip
+    pip3 install snakemake
+    R -e "install.packages('msm')"
--- a/imputation-report.Rmd
+++ b/imputation-report.Rmd
+---
+title: "Extract and prepare data"
+date: "`r Sys.time()`"
+author: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
+output: html_document
+params:
+    config:   "config/snakemake.yml"
+---
+```{r setup}
+options(tidyverse.quiet = TRUE)
+library(tidyverse)
+config <- yaml::read_yaml(params$config)
+print(config)
+knitr::opts_chunk$set(
+  echo = TRUE
+)
+set.seed(config$seed)
+```
+```{r load-data}
+tbl_combined <- readr::read_csv(
+        sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+        col_types = "cinncD"
+    ) %>% 
+    filter(
+        # throw out all rows with missing GOSE
+        !is.na(Outcomes.DerivedCompositeGOSE),
+        # throw out all observations without a precise timestamp
+        !is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury)
+    ) %>% 
+    mutate(
+        Subject.PatientType = case_when(
+            Subject.PatientType == "1" ~ "ER",
+            Subject.PatientType == "2" ~ "Admission",
+            Subject.PatientType == "3" ~ "ICU"
+        )
+    ) 
+# add death date as GOSE 1
+tbl_combined <- tbl_combined %>%
+    mutate(
+        death_days = as.numeric(difftime(Subject.DeathDate, "1970-01-01", units = "days"))
+    ) %>% 
+    group_by(gupi) %>% 
+    summarize_all(
+        first
+    ) %>% 
+    ungroup() %>% 
+    filter(
+        !is.na(death_days),
+        death_days > 180
+    ) %>% 
+    mutate(
+        Outcomes.DerivedCompositeGOSEDaysPostInjury = death_days,
+        Outcomes.DerivedCompositeGOSE               = 1L
+    ) %>% 
+    select(-death_days) %>% 
+    rbind(tbl_combined) %>% 
+    arrange(
+       gupi, 
+       Outcomes.DerivedCompositeGOSEDaysPostInjury
+    )
+# make sure GOSE is unique at each observation time
+tbl_combined  <- tbl_combined %>% 
+    group_by(
+        gupi, Outcomes.DerivedCompositeGOSEDaysPostInjury
+    ) %>% 
+    summarize_all(
+        first
+    ) %>% 
+    ungroup
+# filter all individuals with GOSE = 1 before 180 days post injury
+# (no imputation required!)
+tbl_combined <- tbl_combined %>% 
+    group_by(gupi) %>% 
+    filter(
+        # all GOSEs before 180 dayspost injury must be > 1 (dead)
+        all(Outcomes.DerivedCompositeGOSE[Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180] > 1),
+        # ensure age-bracket
+        Subject.Age >= config$age_range[1],
+        Subject.Age <= config$age_range[2],
+        # censor at first GOSE == 1
+        all(Outcomes.DerivedCompositeGOSE > 1) | (row_number() <= which(Outcomes.DerivedCompositeGOSE == 1)[1])
+    ) %>% 
+    ungroup
+```
+```{r}
+tbl_combined$Outcomes.DerivedCompositeGOSE %>% table
+```
+# Impute
+```{r impute}
+tbl_gose_msm <- tbl_combined %>%
+    select(
+        gupi,
+        days_post_injury = Outcomes.DerivedCompositeGOSEDaysPostInjury,
+        gose = Outcomes.DerivedCompositeGOSE
+    ) %>% 
+    rbind(
+      # add censored states at 180.1 days to make prediction for
+      tibble(
+          gupi             = unique(.$gupi),
+          # needed to offset to get a prediction in all cases
+          days_post_injury = 180 + .1, 
+          gose             = 99
+      )
+    ) %>% 
+    mutate(
+        # all deaths are excatly observed time - tell MSM!
+        observation_type = ifelse(gose == 1, 3, 1),
+        # recode levels to 1:7, 99 (needed for MSM!)
+        gose = factor(gose,
+                levels = c(1, 3:8, 99), 
+                labels = c(1:7, 99)
+            ) %>% as.character %>% as.numeric
+    ) %>% 
+    arrange(
+        gupi,
+        days_post_injury
+    )
+# define transition matrix
+Q <- matrix(0, nrow = 7, ncol = 7)
+for (i in 1:7) {
+  for (j in 1:7) {
+    if (i == j - 1 & i != 1) {
+      Q[i, j] <- 1
+    }
+    if (i == j + 1) {
+      Q[i, j] <- 1
+    }
+  }
+}
+Q[2:7, 1] <- 1 # allow instantaneous deaths from all states
+fit <- msm::msm(
+  gose ~ days_post_injury,
+  subject       = gupi,
+  data          = tbl_gose_msm,
+  obstype       = observation_type,
+  gen.inits     = TRUE,
+  qmatrix       = Q,
+  censor        = 99,
+  pci           = c(90, 270),
+  censor.states = 2:7, # cannot be dead since these were filtered previously 
+  control   = list(
+      fnscale = 12000,
+      maxit   = 10^4,
+      trace   = 2
+  )
+)
+```
+```{r}
+# get fitted values at censored (prediction) states
+fitted <- msm::viterbi.msm(fit) # viterbi does not respect the fact that 
+#states cannot be 1 at 180 days -> need to adjust 
+# store posteriors
+tbl_posteriors <- tibble(
+        gupi = fitted$subject %>% as.character,
+        t    = fitted$time
+    ) %>%
+    cbind(
+        fitted$pstate %>% {colnames(.) <- c("1", as.character(3:8)); .}
+    ) %>%
+    rename(
+        `2_or_3` = `3`
+    ) %>%
+    filter(
+        t == 180.1
+    ) %>%
+    select(
+        gupi, `1`, `2_or_3`, `4`, `5`, `6`, `7`, `8`
+    ) %>% 
+    pivot_longer(`1`:`8`, names_to  = "gose", values_to = "probability") %>%
+    group_by(gupi ) %>%
+    mutate(
+        probability = ifelse(gose == "1", 0.0, probability),
+        probability = probability / sum(probability),
+        gose = sprintf("gose_%s_probability", gose)
+    ) %>% 
+    ungroup %>% 
+    pivot_wider(names_from = gose, values_from = probability)
+```
+```{r final-imputations}
+tbl_final_imputations <- readr::read_csv(
+        sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+        col_types = "cinncD"
+    ) %>% 
+    select(gupi, Subject.DeathDate) %>% 
+    distinct %>% 
+    transmute(
+        gupi = gupi,
+        death_days = difftime(Subject.DeathDate, "1970-01-01", unit = "days") %>% as.numeric
+    ) %>% 
+    left_join(tbl_posteriors, by = "gupi") %>% 
+    pivot_longer(
+        names_to  = "gose",
+        values_to = "probability",
+        gose_1_probability:gose_8_probability
+    ) %>% 
+    mutate(
+        probability = ifelse(
+            is.na(death_days),
+            probability,
+            ifelse(gose != "gose_1_probability", 0.0, 1.0)
+        )
+    ) %>%
+    group_by(gupi) %>%
+    mutate(
+        gose_map_imputed = ifelse(all(is.na(probability)), 
+                NA, 
+                names(tbl_posteriors %>% select(-gupi))[which.max(probability)]
+            )
+    ) %>%
+    ungroup() %>% 
+    select(-death_days) %>% 
+    pivot_wider(names_from = "gose", values_from = "probability") %>% 
+    mutate(
+        gose_map_imputed = stringr::str_extract(
+          gose_map_imputed, 
+          "[0-9]{1}(_or_3){0,1}"
+        )
+    ) %>% 
+    left_join(
+        # add new column giving priority to per-protocol gose
+        readr::read_csv(
+            sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+            col_types = "cinncD"
+        ) %>% 
+        transmute(
+          gupi = gupi, 
+          gose = Outcomes.DerivedCompositeGOSE,
+          days = Outcomes.DerivedCompositeGOSEDaysPostInjury
+        ) %>% 
+        filter(
+            complete.cases(.),
+            days >= config$per_protocol_window[1],
+            days <= config$per_protocol_window[2]
+        ) %>% 
+        group_by(gupi) %>% 
+        summarize(
+            gose = gose[which.min(abs(days - 180))],
+            days = days[which.min(abs(days - 180))]
+        ) %>% 
+        transmute(
+            gupi,
+            gose_per_protocol = ifelse(
+                gose %in% c(2, 3), 
+                "2_or_3", 
+                as.character(gose)
+            )
+        ),
+        by = "gupi"
+    ) %>% 
+    mutate(
+        gose_map_per_protocol_combined = ifelse(
+            is.na(gose_per_protocol), 
+            gose_map_imputed, 
+            gose_per_protocol
+        ),
+        # gose_map_imputed is only 1 when we know that the individual died before
+        # 6 months, i.e. in these cases the per-protocol gose shoul also be 1
+        gose_per_protocol = ifelse(gose_map_imputed == "1", "1", gose_per_protocol)
+    ) %>% 
+    # reorder
+    select(
+        gupi, 
+        gose_per_protocol, 
+        gose_map_imputed, 
+        gose_map_per_protocol_combined, 
+        everything()
+    )
+```
+```{r plausi-check}
+tbl_final_imputations %>% 
+    select(
+        gose_map_imputed,
+        gose_per_protocol
+    ) %>% 
+    filter(
+        complete.cases(.)
+    ) %>% 
+    mutate_all(
+        ~factor(.x, levels = c("1", "2_or_3", as.character(4:8)))
+    ) %>% 
+    {caret::confusionMatrix(
+        data      = .$gose_map_imputed,
+        reference = .$gose_per_protocol
+    )$table} %>% 
+    broom::tidy() %>% 
+    rename(
+        `GOSe, MAP imputed`          = Prediction,
+        `GOSe, closest per-protocol` = Reference
+    ) %>% 
+    ggplot(aes(y = `GOSe, MAP imputed`, x = `GOSe, closest per-protocol`)) +
+      geom_raster(aes(fill = n)) + 
+      geom_text(aes(label = n)) +
+      coord_fixed(expand = FALSE) + 
+      scale_fill_continuous(low = "#FFFFFF", high = "#555555") +
+      theme_bw() +
+      ggtitle("Confusion matrix: imputed vs. per-protocol")
+```
+# Save
+```{r save-imputed-values}
+tbl_final_imputations %>% {
+    tmp <- .
+    names(tmp) <- gsub("gose", "Outcomes.DerivedGOSE", x = names(.))
+    tmp } %>% 
+    write_csv("tbl_imputed_gose.csv")
+```
+# Session Info
+```{r session-info}
+sessionInfo()
+``` 
--- a/scripts/check-inputs.R
+++ b/scripts/check-inputs.R
+#!/usr/bin/env Rscript
+library(readr)
+library(dplyr, warn.conflicts = FALSE)
+options(warn = 2) # warnings are errors
+args      <- commandArgs(trailingOnly = TRUE)
+input_csv <- args[[1]]
+varnames <- c(
+    "gupi",
+    "Outcomes.DerivedCompositeGOSE",
+    "Outcomes.DerivedCompositeGOSEDaysPostInjury",
+    "Subject.Age",
+    "Subject.PatientType",
+    "Subject.DeathDate"
+)
+colnames <- readr::read_csv(input_csv, n_max = 1) %>% names
+if (!all(colnames == varnames))
+    stop("colnames do not match varnames, check order!")
+tbl_gose_imputation <- readr::read_csv(input_csv, col_types = "cinncD")
--- a/scripts/download-data-table.R
+++ b/scripts/download-data-table.R
+#!/usr/bin/env Rscript
+library(readr)
+library(dplyr, warn.conflicts = FALSE)
+library(httr)
+args <- commandArgs(trailingOnly = TRUE)
+user     <- args[[1]]
+token    <- args[[2]]
+version  <- args[[3]]
+tbl_name <- args[[4]]
+# load download links
+tbl_download_links <- read_csv(
+    "config/tbl_download_links.csv",
+    col_types = c(
+        col_character(),
+        col_character(),
+        col_character()
+    )
+)
+# check uniqueness
+tbl_download_links %>%
+    group_by(version, tbl_name) %>%
+    {
+        if (n_groups(.) < nrow(.))
+            stop("download links not unique, check file")
+    }
+# extract link for query
+tbl_download_link <- tbl_download_links %>%
+    filter(
+        version  == get("version", envir = .GlobalEnv),
+        tbl_name == get("tbl_name", envir = .GlobalEnv)
+    ) %>%
+    pull(download_link)
+if (length(tbl_download_link) == 0)
+    stop("no neurobot download link for table 'tbl_%s.csv' version '%s' found", tbl_name, version)
+# download csv file via neurobot api call
+res <- GET(
+    sprintf("https://neurobot-stage.incf.org/api/data/%s.csv", tbl_download_link),
+    authenticate(user, token, type = "digest")
+)
+# check for success and save
+if (res$status_code != 200)
+    stop(sprintf("error downloading tbl_%s.csv, http status code: %s", tbl_name, res$status_code))
+writeBin(content(res, "raw"), sprintf("tbl_%s.csv", tbl_name))
--- a/scripts/download_container
+++ b/scripts/download_container
+#!/bin/bash
+set -e
+wget https://zenodo.org/record/2600385/files/container.sif
+checksum=($(md5sum container.sif))
+if [ $checksum != 7db125c9c83621d78981e558546b1e88 ]; then
+    echo md5 mismatch!
+    exit 1
+fi