Commit a5ac5123 authored by Kevin Kunzmann's avatar Kevin Kunzmann

Merge branch 'rework-curl-data-download' into 'master'

Rework curl data download

See merge request !1
parents fa7750c2 1a64a6af
......@@ -3,10 +3,14 @@
# Prerequisites
We assume a Unix command line workflow. The following software is required to take advantage of the pre-defined workflow:
* curl for downloading the data (in case you do not have curl installed, it is also available from within the container)
* [python](https://www.python.org/download/releases/3.5.1/) 3.5.1 (higher versions might work as well)
* [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) version 5.2.1 (higher versions will work as well)
* [singularity](https://www.sylabs.io/guides/2.6/user-guide/index.html) 2.6.0 (higher versions might work as well)
* a CENTER-TBI account and API key, stored in the environment variables `NEUROBOT_USR` and `NEUROBOT_API`
The entire analysis is containerized using a [docker container](https://cloud.docker.com/u/kkmann/repository/docker/kkmann/gose-6mo-imputation).
The container can be used either to execute individual scripts inside it or, via singularity, to run the entire
pre-defined snakemake workflow (recommended).
......
......@@ -6,7 +6,7 @@ configfile: "config.yml"
rule import_neurobot_csv:
rule download_data:
output:
"data/{version}/df_baseline.rds",
"data/{version}/df_ctmri.rds",
......@@ -15,7 +15,7 @@ rule import_neurobot_csv:
"data/{version}/df_gose.rds"
shell:
"""
Rscript scripts/import_neurobot_data.R data/{wildcards.version} data/{wildcards.version}
bash scripts/download_{wildcards.version}.sh
"""
......@@ -24,7 +24,7 @@ rule import_neurobot_csv:
rule prepare_data:
input:
rules.import_neurobot_csv.output,
rules.download_data.output,
markdown = "reports/prepare_data.Rmd"
output:
"output/{version}/data/df_gose.rds",
......@@ -39,12 +39,6 @@ rule prepare_data:
mv reports/figures.zip {output.figures}
"""
# Convenience target rule for version v1.1: it declares only inputs, so
# requesting this rule asks snakemake to build the two files listed below.
# NOTE(review): these look like the report outputs of rule prepare_data for
# version "v1.1" - confirm against that rule's output section.
rule data_report_v1_1:
input:
pdf = "output/v1.1/prepare_data.pdf",
figures = "output/v1.1/prepare_data_figures.zip"
......@@ -80,14 +74,6 @@ rule generate_validation_data:
Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
"""
# Target rule for version v1.1: it declares only inputs. The list below holds
# one path per combination of s in ("train", "test"), i in 1..config["mi_m"]
# and j in 1..config["folds"], following the file name pattern
# df_<s>_mi_<i>_fold_<j>.rds under output/v1.1/data/validation/.
rule generate_validation_data_v1_1:
input:
["output/v1.1/data/validation/df_%s_mi_%i_fold_%i.rds" % (s, i, j)
for s in ("train", "test")
for i in range(1, config["mi_m"] + 1)
for j in range(1, config["folds"] + 1)
]
......@@ -125,26 +111,6 @@ rule model_posteriors:
for j in range(1, config["folds"] + 1)
]
# Render reports/model_assessment.Rmd for the requested {version} into
# output/{version}/ and move reports/figures.zip to the declared figures output.
rule model_assessment:
input:
# everything rule prepare_data declares as output
pop_report = rules.prepare_data.output,
# reuses the input list of rule model_posteriors
posteriors = rules.model_posteriors.input,
markdown = "reports/model_assessment.Rmd"
output:
pdf = "output/{version}/model_assessment.pdf",
figures = "output/{version}/model_assessment_figures.zip"
# The render call passes data_dir and config_file as "../"-prefixed paths.
# NOTE(review): presumably these are resolved relative to the reports/ folder
# containing the Rmd, and reports/figures.zip is presumably written while
# rendering - confirm against the Rmd.
shell:
"""
mkdir -p output/{wildcards.version}
Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/{wildcards.version}\\", params = list(data_dir = \\"../output/{wildcards.version}/data\\", config_file = \\"../config.yml\\"))"
mv reports/figures.zip {output.figures}
"""
# Convenience target rule for version v1.1: it declares only inputs, so
# requesting this rule asks snakemake to build the model assessment PDF and
# figures archive listed below (the paths match the output pattern of rule
# model_assessment with version = "v1.1").
rule cv_model_comparison_report_v1_1:
input:
pdf = "output/v1.1/model_assessment.pdf",
figures = "output/v1.1/model_assessment_figures.zip"
......
#!/usr/bin/env bash
# Fetch one CSV export from the neurobot-stage API and write it to stdout.
# Credentials are read from the NEUROBOT_USR and NEUROBOT_API environment
# variables and sent using HTTP digest authentication.

# Abort on the first failing command and on use of an unset variable.
set -euo pipefail

# Fail early with a clear message instead of sending empty credentials.
: "${NEUROBOT_USR:?NEUROBOT_USR must be set}"
: "${NEUROBOT_API:?NEUROBOT_API must be set}"

# --fail makes curl exit non-zero on HTTP errors instead of printing the
# server's error page as if it were data.
curl \
    --fail \
    --user "$NEUROBOT_USR:$NEUROBOT_API" \
    --digest "https://neurobot-stage.incf.org/api/data/_5c8a757252dc3879e3b7cc35.csv"
#!/usr/bin/env bash
# Download the five v1.1 CSV exports from center-tbi.incf.org into data/v1.1
# and convert each one to an .rds file next to it.
# Credentials are read from the NEUROBOT_USR and NEUROBOT_API environment
# variables and sent using HTTP digest authentication.

# Abort on the first failing command and on use of an unset variable, so a
# failed download is never passed on to the R conversion step.
set -euo pipefail

# Fail early with a clear message instead of sending empty credentials.
: "${NEUROBOT_USR:?NEUROBOT_USR must be set}"
: "${NEUROBOT_API:?NEUROBOT_API must be set}"

VERSION=v1.1
OUT=data/$VERSION
mkdir -p "$OUT"

# download_table NAME ID
#   Downloads https://center-tbi.incf.org/api/data/_ID.csv to $OUT/df_NAME.csv
#   and saves it as a tibble in $OUT/df_NAME.rds.
download_table() {
    local name=$1
    local id=$2
    # --fail makes curl exit non-zero on HTTP errors instead of writing the
    # server's error page into the CSV file.
    curl \
        --fail \
        --user "$NEUROBOT_USR:$NEUROBOT_API" \
        --digest "https://center-tbi.incf.org/api/data/_${id}.csv" > \
        "$OUT/df_${name}.csv"
    Rscript -e "library(tidyverse); saveRDS(as_tibble(read_csv('$OUT/df_${name}.csv')), file = '$OUT/df_${name}.rds')"
}

# ctmri
download_table ctmri 5c5485306b3f2f22e14d209a
# imaging
download_table imaging 5c5488246b3f2f22e14d209d
# labs
download_table labs 5c5489696b3f2f22e14d209f
# GOSe
download_table gose 5c548a056b3f2f22e14d20a0
# baseline
download_table baseline 5c548a5b6b3f2f22e14d20a2
library(tidyverse)

# Convert the CSV files found in an input folder to .rds files in an output
# folder. Each CSV is identified by the presence of a marker column and saved
# under the corresponding fixed name (df_baseline.rds, df_gose.rds, ...).
#
# Command line arguments:
#   1. input folder containing the CSV files
#   2. output folder for the .rds files
args       <- commandArgs(trailingOnly = TRUE)
in_folder  <- args[[1]]
out_folder <- args[[2]]

# `pattern` is a regular expression, not a shell glob: anchor on a literal
# ".csv" suffix so that only real CSV files are picked up.
in_files <- list.files(path = in_folder, pattern = "\\.csv$", full.names = TRUE)

if (length(in_files) != 5) stop("must have exactly 5 input files")

# Output file stem -> column whose presence identifies that table.
marker_columns <- c(
  df_baseline = "Subject.Age",
  df_gose     = "Outcomes.DerivedCompositeGOSE",
  df_labs     = "Labs.DLDate",
  df_imaging  = "Imaging.MarshallCTClassification",
  df_ctmri    = "CTMRI.CTSubarachnoidHem"
)

for (f in in_files) {
  tmp <- read_csv(f)
  # A file is written once for every marker column it contains.
  for (target in names(marker_columns)) {
    if (marker_columns[[target]] %in% names(tmp)) {
      saveRDS(as_tibble(tmp), file = sprintf("%s/%s.rds", out_folder, target))
    }
  }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment