Commit 69abd6dc authored by Kevin Kunzmann

Got rid of automatic download - database is not stable; switched to renv...

Got rid of automatic download - the database is not stable; switched to renv instead of containers; reconciled all remaining code in a single .Rmd; fixed minor data cleaning issues related to the exclusion of individuals dying pre-6 months and the exclusion of observations past 18 months.
parent 85cc70c8
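In practice, the switch from containers to renv means the project library is restored from the committed lockfile rather than pulled from a container image. A minimal sketch of the setup, assuming an renv.lock file at the repository root alongside the renv/activate.R sourced below:

```r
# restore the locked package versions into the project library (sketch;
# assumes renv.lock is committed at the repository root)
install.packages("renv")
renv::restore(clean = TRUE, prompt = FALSE)
```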
source("renv/activate.R")
@@ -5,3 +5,7 @@ output
 container.sif
 *.Rproj
 *.Rhistory
+*_cache
+*_files
+*.csv
singularity: "container.sif"

configfile: "config/snakemake.yml"

rule download:
    input:
        config = "config/snakemake.yml",
        links = "config/tbl_download_links.csv",
        script = "scripts/download-data-table.R"
    output:
        csv = expand("{datapath}/tbl_gose_imputation.csv",
            datapath = config["datapath"])
    shell:
        """
        mkdir -p {config[datapath]}
        Rscript scripts/download-data-table.R $NEUROBOT_USR $NEUROBOT_API {config[neurobot_staging_version]} gose_imputation
        mv tbl_gose_imputation.csv {output.csv}
        """

rule check_inputs:
    input:
        csv = expand("{datapath}/tbl_gose_imputation.csv",
            datapath = config["datapath"]
        ),
        script = "scripts/check-inputs.R"
    shell:
        """
        Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
        """

rule impute:
    input:
        config = "config/snakemake.yml",
        csv = expand("{datapath}/tbl_gose_imputation.csv",
            datapath = config["datapath"]
        ),
        rmd = "imputation-report.Rmd"
    output:
        html = "output/imputation-report.html",
        csv = "output/tbl_imputed_gose.csv"
    shell:
        """
        Rscript scripts/check-inputs.R {config[datapath]}/tbl_gose_imputation.csv
        mkdir -p output
        Rscript -e "rmarkdown::render(\\"{input.rmd}\\")"
        mv tbl_imputed_gose.csv {output.csv}
        mv imputation-report.html {output.html}
        """
neurobot_staging_version: "1.2"
datapath: "data"
seed: 42
age_range: [0.0, 999.0] # years
per_protocol_window: [150.0, 240] # 5-8 months post injury, in days
version,tbl_name,download_link
1.2,gose_imputation,_5cd57b826b34316c8c728416
Bootstrap: docker
From: rocker/verse:latest
%labels
Maintainer Kevin Kunzmann kevin.kunzmann@mrc-bsu.cam.ac.uk
%help
CENTER-TBI 6 months GOSe outcome imputation,
cf. https://git.center-tbi.eu/kunzmann/gose-6mo-imputation for details.
%post
apt-get update
apt-get -y install curl python3-pip
pip3 install snakemake
R -e "devtools::install_github('tidyverse/tidyr')"
R -e "install.packages('diagram')"
R -e "install.packages('caret')"
R -e "install.packages('e1071')"
R -e "install.packages('msm')"
 ---
-title: "Extract and prepare data"
+title: "Impute 6-months GOSe for CENTER-TBI"
 date: "`r Sys.time()`"
 author: "Kevin Kunzmann (kevin.kunzmann@mrc-bsu.cam.ac.uk)"
-output: html_document
+output:
+  html_document:
+    code_folding: hide
 params:
-  config: "config/snakemake.yml"
+  seed: 42
+  age_range: !r c(0, Inf)
+  per_protocol_window: !r c(150.0, 240)
 ---
-```{r setup}
-options(tidyverse.quiet = TRUE)
-library(tidyverse)
-config <- yaml::read_yaml(params$config)
-print(config)
+The markdown file requires the recorded GOSe values in a .csv file
+named `tbl_gose.csv` in the same folder.
+The .csv file must contain the Neurobot variables
+```
+subjectId
+Outcomes.DerivedCompositeGOSE
+Outcomes.DerivedCompositeGOSEDaysPostInjury
+Subject.Age
+Subject.PatientType
+Subject.DeathDate
+```
+in exactly that order.
+At the time of writing this report, the newest staging data set could be
+accessed via the saved link `https://neurobot-stage.incf.org/_5f476d41eedb867816a10376`.
+The required R environment can be installed by
+```
+install.packages("renv")
+renv::restore(clean = TRUE, prompt = FALSE)
+```
+```{r setup}
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(msm)
+})
 knitr::opts_chunk$set(
-  echo = TRUE
+  # cache = TRUE, # uncomment this line to activate caching of the report
+  echo = TRUE
 )
-set.seed(config$seed)
+set.seed(params$seed)
+```
+```{r print-params}
+print(params)
 ```
 ```{r load-data}
-tbl_combined <- readr::read_csv(
-  sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+# check data
+varnames <- c(
+  "subjectId",
+  "Outcomes.DerivedCompositeGOSE",
+  "Outcomes.DerivedCompositeGOSEDaysPostInjury",
+  "Subject.Age",
+  "Subject.PatientType",
+  "Subject.DeathDate"
+)
+colnames <- readr::read_csv(
+  "tbl_gose.csv",
+  n_max = 1,
   col_types = "cinncD"
 ) %>%
+  names
+if (!all(colnames == varnames))
+  stop("colnames do not match varnames, check order!")
+tbl_combined <- readr::read_csv("tbl_gose.csv", col_types = "cinncD") %>%
+  rename(gupi = subjectId) %>%
   filter(
     # throw out all rows with missing GOSE
     !is.na(Outcomes.DerivedCompositeGOSE),
     # throw out all observations without a precise timestamp
-    !is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury)
+    !is.na(Outcomes.DerivedCompositeGOSEDaysPostInjury),
+    # throw out all observations after 18 months
+    # (for 12 months imputation!)
+    Outcomes.DerivedCompositeGOSEDaysPostInjury < 18*30
   ) %>%
   mutate(
     Subject.PatientType = case_when(
@@ -56,8 +107,7 @@ tbl_combined <- tbl_combined %>%
   ) %>%
   ungroup() %>%
   filter(
-    !is.na(death_days),
-    death_days > 180
+    !is.na(death_days)
   ) %>%
   mutate(
     Outcomes.DerivedCompositeGOSEDaysPostInjury = death_days,
@@ -85,11 +135,11 @@ tbl_combined <- tbl_combined %>%
 tbl_combined <- tbl_combined %>%
   group_by(gupi) %>%
   filter(
-    # all GOSEs before 180 dayspost injury must be > 1 (dead)
+    # all GOSEs before 180 days post injury must be > 1 (dead)
     all(Outcomes.DerivedCompositeGOSE[Outcomes.DerivedCompositeGOSEDaysPostInjury <= 180] > 1),
     # ensure age-bracket
-    Subject.Age >= config$age_range[1],
-    Subject.Age <= config$age_range[2],
+    Subject.Age >= params$age_range[1],
+    Subject.Age <= params$age_range[2],
     # censor at first GOSE == 1
     all(Outcomes.DerivedCompositeGOSE > 1) | (row_number() <= which(Outcomes.DerivedCompositeGOSE == 1)[1])
   ) %>%
@@ -106,7 +156,7 @@ tbl_combined$Outcomes.DerivedCompositeGOSE %>% table
 # Impute
-```{r impute}
+```{r impute, results='hide', warning=FALSE}
 tbl_gose_msm <- tbl_combined %>%
   select(
     gupi,
@@ -123,7 +173,7 @@ tbl_gose_msm <- tbl_combined %>%
     )
   ) %>%
   mutate(
-    # all deaths are excatly observed time - tell MSM!
+    # all death times are exactly observed - tell MSM!
     observation_type = ifelse(gose == 1, 3, 1),
     # recode levels to 1:7, 99 (needed for MSM!)
     gose = factor(gose,
@@ -162,7 +212,7 @@ fit <- msm::msm(
   censor.states = 2:7, # cannot be dead since these were filtered previously
   control = list(
     fnscale = 12000,
-    maxit = 10^4,
+    maxit = 10, # 10^4,
     trace = 2
   )
 )
@@ -204,9 +254,10 @@ tbl_posteriors <- tibble(
 ```{r final-imputations}
 # start by adding confirmed <= 180d deaths back in
 tbl_final_imputations <- readr::read_csv(
-  sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+  "tbl_gose.csv",
   col_types = "cinncD"
 ) %>%
+  rename(gupi = subjectId) %>%
   select(gupi, Subject.DeathDate) %>%
   distinct %>%
   transmute(
@@ -245,9 +296,10 @@ tbl_final_imputations <- readr::read_csv(
   left_join(
     # add new column giving priority to per-protocol gose
     readr::read_csv(
-      sprintf('%s/tbl_gose_imputation.csv', config$datapath),
+      "tbl_gose.csv",
       col_types = "cinncD"
     ) %>%
+      rename(gupi = subjectId) %>%
       transmute(
         gupi = gupi,
         gose = Outcomes.DerivedCompositeGOSE,
@@ -255,13 +307,14 @@ tbl_final_imputations <- readr::read_csv(
       ) %>%
       filter(
         complete.cases(.),
-        days >= config$per_protocol_window[1],
-        days <= config$per_protocol_window[2]
+        days >= params$per_protocol_window[1],
+        days <= params$per_protocol_window[2]
       ) %>%
       group_by(gupi) %>%
       summarize(
         gose = gose[which.min(abs(days - 180))],
-        days = days[which.min(abs(days - 180))]
+        days = days[which.min(abs(days - 180))],
+        .groups = "drop"
       ) %>%
       transmute(
         gupi,
@@ -275,7 +328,7 @@ tbl_final_imputations <- readr::read_csv(
   ) %>%
   mutate(
     # gose_map_imputed is only 1 when we know that the individual died before
-    # 6 months, i.e. in these cases the per-protocol gose shoul also be 1
+    # 6 months, i.e. in these cases the per-protocol gose should also be 1
     gose_per_protocol = ifelse(gose_map_imputed == "1", "1", gose_per_protocol),
     gose_map_per_protocol_combined = ifelse(
       is.na(gose_per_protocol),
@@ -310,7 +363,7 @@ tbl_final_imputations %>%
     data = .$gose_map_imputed,
     reference = .$gose_per_protocol
   )$table} %>%
-  broom::tidy() %>%
+  as_tibble() %>%
   rename(
     `GOSe, MAP imputed` = Prediction,
     `GOSe, closest per-protocol` = Reference
@@ -324,10 +377,6 @@ tbl_final_imputations %>%
   ggtitle("Confusion matrix: imputed vs. per-protocol")
 ```
-# Save
 ```{r save-imputed-values}
 tbl_final_imputations %>% {
   tmp <- .
@@ -336,7 +385,7 @@ tbl_final_imputations %>% {
   rename(
     "Subject.GOSE6monthEndpointDerived" = "Subject.DerivedImputed180DaysGOSE_map_per_protocol_combined"
   ) %>%
-  write_csv("tbl_imputed_gose.csv")
+  write_csv("tbl_gose_imputed.csv")
 ```
@@ -344,5 +393,6 @@ tbl_final_imputations %>% {
 # Session Info
 ```{r session-info}
-sessionInfo()
+sessionInfo() %>%
+  knitr::kable()
 ```
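Since the automatic download was removed, the staging table referenced in the report has to be fetched by hand. A minimal sketch using httr with digest authentication, assuming the API pattern from scripts/download-data-table.R (further below) still applies to the saved link quoted in the report and that the credentials are available as environment variables:

```r
library(httr)

# hypothetical manual download of the staging table named in the report;
# "_5f476d41eedb867816a10376" is the saved link quoted above
res <- GET(
  "https://neurobot-stage.incf.org/api/data/_5f476d41eedb867816a10376.csv",
  authenticate(Sys.getenv("NEUROBOT_USR"), Sys.getenv("NEUROBOT_API"), type = "digest")
)
stopifnot(status_code(res) == 200)
writeBin(content(res, "raw"), "tbl_gose.csv")
```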
library/
python/
staging/
external.libraries:
ignored.packages:
package.dependency.fields: Imports, Depends, LinkingTo
snapshot.type: implicit
use.cache: TRUE
vcs.ignore.library: TRUE
#!/bin/bash
sudo singularity build container.sif container-recipe
#!/usr/bin/env Rscript
library(readr)
library(dplyr, warn.conflicts = FALSE)
options(warn = 2) # warnings are errors
args <- commandArgs(trailingOnly = TRUE)
input_csv <- args[[1]]
varnames <- c(
  "gupi",
  "Outcomes.DerivedCompositeGOSE",
  "Outcomes.DerivedCompositeGOSEDaysPostInjury",
  "Subject.Age",
  "Subject.PatientType",
  "Subject.DeathDate"
)
colnames <- readr::read_csv(
  input_csv,
  n_max = 1,
  col_types = "cinncD"
) %>%
  names
if (!all(colnames == varnames))
  stop("colnames do not match varnames, check order!")
#!/bin/bash
set -e
wget https://zenodo.org/record/2641161/files/container.sif
checksum=($(md5sum container.sif))
if [ "$checksum" != "4d7edd4ff6ba7c8ff7639c53e7c6daf9" ]; then
    echo "md5 mismatch!"
    exit 1
fi
#!/usr/bin/env Rscript
library(readr)
library(dplyr, warn.conflicts = FALSE)
library(httr)
args <- commandArgs(trailingOnly = TRUE)
user <- args[[1]]
token <- args[[2]]
version <- args[[3]]
tbl_name <- args[[4]]
# load download links
tbl_download_links <- read_csv(
  "config/tbl_download_links.csv",
  col_types = cols(
    col_character(),
    col_character(),
    col_character()
  )
)
# check uniqueness
tbl_download_links %>%
  group_by(version, tbl_name) %>%
  {
    if (n_groups(.) < nrow(.))
      stop("download links not unique, check file")
  }
# extract link for query
tbl_download_link <- tbl_download_links %>%
  filter(
    version == get("version", envir = .GlobalEnv),
    tbl_name == get("tbl_name", envir = .GlobalEnv)
  ) %>%
  pull(download_link)
if (length(tbl_download_link) == 0)
  stop(sprintf("no neurobot download link for table 'tbl_%s.csv' version '%s' found", tbl_name, version))
# download csv file via neurobot api call
res <- GET(
  sprintf("https://neurobot-stage.incf.org/api/data/%s.csv", tbl_download_link),
  authenticate(user, token, type = "digest")
)
# check for success and save
if (res$status_code != 200)
  stop(sprintf("error downloading tbl_%s.csv, http status code: %s", tbl_name, res$status_code))
writeBin(content(res, "raw"), sprintf("tbl_%s.csv", tbl_name))
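For completeness, the Snakemake download rule called this script directly; an equivalent manual invocation from R might look as follows (a sketch; NEUROBOT_USR and NEUROBOT_API are assumed to hold the API credentials):

```r
# fetch tbl_gose_imputation.csv for staging version 1.2, mirroring the
# Snakemake 'download' rule (writes the file into the working directory)
system2(
  "Rscript",
  c(
    "scripts/download-data-table.R",
    Sys.getenv("NEUROBOT_USR"),
    Sys.getenv("NEUROBOT_API"),
    "1.2",
    "gose_imputation"
  )
)
```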