Commit 1a64a6af authored by Kevin Kunzmann's avatar Kevin Kunzmann

Rework curl data download

parent fa7750c2
...@@ -3,9 +3,13 @@ ...@@ -3,9 +3,13 @@
# Prerequisites # Prerequisites
We assume a Unix command line workflow. The following software is required to take advantage of the pre-defined workflow: We assume a Unix command line workflow. The following software is required to take advantage of the pre-defined workflow:
* curl for downloading the data (in case you do not have curl installed, it is also available from within the container)
* [python](https://www.python.org/download/releases/3.5.1/) 3.5.1 (higher versions might work as well) * [python](https://www.python.org/download/releases/3.5.1/) 3.5.1 (higher versions might work as well)
* [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) version 5.2.1 (higher versions will work as well) * [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) version 5.2.1 (higher versions will work as well)
* [singularity](https://www.sylabs.io/guides/2.6/user-guide/index.html) 2.6.0 (higher versions might work as well) * [singularity](https://www.sylabs.io/guides/2.6/user-guide/index.html) 2.6.0 (higher versions might work as well)
* CENTER-TBI account and API key, stored in the NEUROBOT_USR and NEUROBOT_API
environment variables.
The entire analysis is containerized using a [docker container](https://cloud.docker.com/u/kkmann/repository/docker/kkmann/gose-6mo-imputation). The entire analysis is containerized using a [docker container](https://cloud.docker.com/u/kkmann/repository/docker/kkmann/gose-6mo-imputation).
The container can either be used to execute scripts individually inside the container, or it can be used to run the entire The container can either be used to execute scripts individually inside the container, or it can be used to run the entire
......
...@@ -6,7 +6,7 @@ configfile: "config.yml" ...@@ -6,7 +6,7 @@ configfile: "config.yml"
rule import_neurobot_csv: rule download_data:
output: output:
"data/{version}/df_baseline.rds", "data/{version}/df_baseline.rds",
"data/{version}/df_ctmri.rds", "data/{version}/df_ctmri.rds",
...@@ -15,7 +15,7 @@ rule import_neurobot_csv: ...@@ -15,7 +15,7 @@ rule import_neurobot_csv:
"data/{version}/df_gose.rds" "data/{version}/df_gose.rds"
shell: shell:
""" """
Rscript scripts/import_neurobot_data.R data/{wildcards.version} data/{wildcards.version} bash scripts/download_{wildcards.version}.sh
""" """
...@@ -24,7 +24,7 @@ rule import_neurobot_csv: ...@@ -24,7 +24,7 @@ rule import_neurobot_csv:
rule prepare_data: rule prepare_data:
input: input:
rules.import_neurobot_csv.output, rules.download_data.output,
markdown = "reports/prepare_data.Rmd" markdown = "reports/prepare_data.Rmd"
output: output:
"output/{version}/data/df_gose.rds", "output/{version}/data/df_gose.rds",
...@@ -39,12 +39,6 @@ rule prepare_data: ...@@ -39,12 +39,6 @@ rule prepare_data:
mv reports/figures.zip {output.figures} mv reports/figures.zip {output.figures}
""" """
# define corresponding target rule for ease of use
rule data_report_v1_1:
input:
pdf = "output/v1.1/prepare_data.pdf",
figures = "output/v1.1/prepare_data_figures.zip"
...@@ -80,14 +74,6 @@ rule generate_validation_data: ...@@ -80,14 +74,6 @@ rule generate_validation_data:
Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]} Rscript scripts/generate_validation_data.R output/{wildcards.version}/data {config[mi_m]} {config[folds]} {config[seed]}
""" """
rule generate_validation_data_v1_1:
input:
["output/v1.1/data/validation/df_%s_mi_%i_fold_%i.rds" % (s, i, j)
for s in ("train", "test")
for i in range(1, config["mi_m"] + 1)
for j in range(1, config["folds"] + 1)
]
...@@ -125,26 +111,6 @@ rule model_posteriors: ...@@ -125,26 +111,6 @@ rule model_posteriors:
for j in range(1, config["folds"] + 1) for j in range(1, config["folds"] + 1)
] ]
rule model_assessment:
input:
pop_report = rules.prepare_data.output,
posteriors = rules.model_posteriors.input,
markdown = "reports/model_assessment.Rmd"
output:
pdf = "output/{version}/model_assessment.pdf",
figures = "output/{version}/model_assessment_figures.zip"
shell:
"""
mkdir -p output/{wildcards.version}
Rscript -e "rmarkdown::render(\\"{input.markdown}\\", output_dir = \\"output/{wildcards.version}\\", params = list(data_dir = \\"../output/{wildcards.version}/data\\", config_file = \\"../config.yml\\"))"
mv reports/figures.zip {output.figures}
"""
# define corresponding target rule for ease of use
rule cv_model_comparison_report_v1_1:
input:
pdf = "output/v1.1/model_assessment.pdf",
figures = "output/v1.1/model_assessment_figures.zip"
......
#!/usr/bin/env bash
# Fetch a single CSV export from the neurobot-stage API and write it to stdout.
#
# Credentials are taken from the NEUROBOT_USR (user name) and NEUROBOT_API
# (API key) environment variables and passed to curl for digest authentication.

# Abort on the first failing command and on use of an unset variable.
set -euo pipefail

# Fail early with a readable message when the credentials are not exported.
: "${NEUROBOT_USR:?NEUROBOT_USR must be set}"
: "${NEUROBOT_API:?NEUROBOT_API must be set}"

# --fail: exit non-zero on an HTTP error instead of printing the error page.
curl \
    --fail \
    --user "$NEUROBOT_USR:$NEUROBOT_API" \
    --digest https://neurobot-stage.incf.org/api/data/_5c8a757252dc3879e3b7cc35.csv
#!/usr/bin/env bash
# Download the v1.1 CSV exports and convert each of them to an .rds file.
#
# Requires:
#   - NEUROBOT_USR / NEUROBOT_API environment variables (user name and API key,
#     passed to curl for digest authentication)
#   - curl, and Rscript with the tidyverse package available
#
# Writes df_<name>.csv and df_<name>.rds for ctmri, imaging, labs, gose and
# baseline into data/v1.1 (relative to the current working directory).

# Abort on the first failing command and on use of an unset variable, so a
# failed download is never handed on to the conversion step.
set -euo pipefail

# Fail early with a readable message when the credentials are not exported.
: "${NEUROBOT_USR:?NEUROBOT_USR must be set}"
: "${NEUROBOT_API:?NEUROBOT_API must be set}"

VERSION=v1.1
OUT=data/$VERSION
BASE_URL=https://center-tbi.incf.org/api/data

mkdir -p "$OUT"

# fetch NAME ID
#   Download $BASE_URL/_ID.csv to $OUT/df_NAME.csv, then read it with
#   readr::read_csv and save it as a tibble in $OUT/df_NAME.rds.
fetch() {
    local name=$1
    local id=$2
    local csv="$OUT/df_${name}.csv"
    local rds="$OUT/df_${name}.rds"

    # --fail: exit non-zero on an HTTP error instead of saving the error page
    # as if it were CSV data.
    curl \
        --fail \
        --user "$NEUROBOT_USR:$NEUROBOT_API" \
        --digest "$BASE_URL/_${id}.csv" \
        --output "$csv"

    Rscript -e "library(tidyverse); saveRDS(as_tibble(read_csv('$csv')), file = '$rds')"
}

fetch ctmri    5c5485306b3f2f22e14d209a
fetch imaging  5c5488246b3f2f22e14d209d
fetch labs     5c5489696b3f2f22e14d209f
fetch gose     5c548a056b3f2f22e14d20a0
fetch baseline 5c548a5b6b3f2f22e14d20a2
library(tidyverse)
# Convert the CSV exports found in one folder into .rds files in another.
#
# Usage: Rscript <this script> <in_folder> <out_folder>
#
# Every CSV in <in_folder> is read with read_csv(); a file is recognised by
# the presence of a marker column and saved as the matching df_*.rds file in
# <out_folder>.

args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  stop("usage: Rscript <script> <in_folder> <out_folder>", call. = FALSE)
}
in_folder <- args[[1]]
out_folder <- args[[2]]

# list.files() expects a regular expression, not a shell glob: anchor the
# match on the file extension so only "*.csv" files are picked up.
in_files <- list.files(
  path = in_folder,
  pattern = "\\.csv$",
  full.names = TRUE
)
if (length(in_files) != 5) stop("must have exactly 5 input files")

# Marker column -> name of the .rds file written when that column is present.
targets <- c(
  "Subject.Age" = "df_baseline.rds",
  "Outcomes.DerivedCompositeGOSE" = "df_gose.rds",
  "Labs.DLDate" = "df_labs.rds",
  "Imaging.MarshallCTClassification" = "df_imaging.rds",
  "CTMRI.CTSubarachnoidHem" = "df_ctmri.rds"
)

seen <- character(0)
for (f in in_files) {
  tmp <- as_tibble(read_csv(f))
  hits <- intersect(names(targets), names(tmp))
  for (marker in hits) {
    saveRDS(tmp, file = file.path(out_folder, targets[[marker]]))
  }
  seen <- union(seen, hits)
}

# Surface the case where a marker column never showed up, since the
# corresponding output file was then not written.
not_seen <- setdiff(names(targets), seen)
if (length(not_seen) > 0) {
  warning(
    "no input file contained column(s): ",
    paste(not_seen, collapse = ", "),
    call. = FALSE
  )
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment