refactor

ccb8b3be · Kevin Kunzmann · 7cfd7495 · ccb8b3be · ccb8b3be · ccb8b3be
Commit ccb8b3be authored Mar 05, 2020 by Kevin Kunzmann
7 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ logs
 nohup.out
 container.sif
 .DS_Store
+.Rproj.user
--- a/Snakefile
+++ b/Snakefile
-# edit configuration here (see cluster.json for running on slurm scheduler)
-configfile: "config.yml"
+configfile:  "config/parameters.yml"
+singularity: "container.sif"
+localrules:  impute, generate_samples_file

-localrules: impute, clean, download_container, generate_samples_file


-
-# this rule runs the entire pipeline and creates a R data set file (rds)
-# containing all imputed expression values
 rule impute:
    input:
-        "container.sif",
        expand("{output_dir}/imputed-gene-expressions/{region}.expression.txt.gz", region = config['brain_regions'], output_dir = config['output_dir'])

-# delete output and logs (if run on slurm cluster)
-rule clean:
-    shell:
-        """
-        rm -rf {config[output_dir]}
-        rm -rf logs
-        rm -rf nohup.out
-        """

 # download vcf files using gsutil (needs to be set up and configured!
 rule download_imputed_genotype_chromosome:
    output:
-        "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz"
-    singularity:
-        "container.sif"
+        "output/imputed-genotypes/chromosome-{i}.vcf.gz"
    shell:
        """
+        setr -ex
        mkdir -p output/imputed-genotypes
-        gsutil cp gs://fimm-horizon-outgoing-data/CENTER_TBI_data_freeze_191014/Imputed_data/CENTER.TBI_imputed_N3741_hg19_HRC_MAC1_freeze_191014_chr{wildcards.i}.vcf.gz \
-            output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{wildcards.i}.vcf.gz
+        FILEDIR=gs://fimm-horizon-outgoing-data/20201002-center-tbi-genetic-data/genome-wide-imputation-data
+        gsutil cp $FILEDIR/chromosome-{wildcards.i}.vcf.gz {output}
        """

-rule download_imputed_genotypes:
-    input:
-        expand("output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz",
-            i = list(map(str, range(1, 23))) + ['X']
-        )

-# download the required singularity container from zenodo.org
-rule download_container:
-    output:
-        "container.sif"
-    shell:
-        """
-        bash scripts/download_container.sh
-        """
-
-# extract dosages in custom PrediXcan format from vcf file
-rule vcf_to_dosages:
+rule dosages:
    input:
-        "container.sif",
-        vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr{i}.vcf.gz"
+        expand("{out}/imputed-genotypes/chromosome-22.vcf.gz", out=config["output_dir"])
    output:
-        "{output_dir}/dosages/chr{i}.dosage.txt.gz"
-    singularity:
-        "container.sif"
+        expand("{out}/imputed-genotypes/chromosome-22.dosage.txt.gz", out=config["output_dir"])
    shell:
        """
-        export prefix={wildcards.output_dir}/dosages
-        mkdir -p $prefix
-        echo "decompress and filter out: 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' ..."
-        bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' {input.vcf_gz_file} > $prefix/chr{wildcards.i}_.vcf
-        echo "add MAFs ..."
-        bcftools +fill-tags $prefix/chr{wildcards.i}_.vcf > $prefix/chr{wildcards.i}.vcf
-        rm $prefix/chr{wildcards.i}_.vcf
-        echo 'querying dosages ...'
-        bcftools query -f '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' $prefix/chr{wildcards.i}.vcf > $prefix/chr{wildcards.i}.dosage.txt
-        rm $prefix/chr{wildcards.i}.vcf
-        echo 'compressing ...'
-        gzip $prefix/chr{wildcards.i}.dosage.txt
-        printf "done.\n\r\n\r"
+        set -ex
+        mkdir -p {config[output_dir]}/dosages
+        pv {input} | \
+            bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
+            bcftools +fill-tags | \
+            bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
+            {config[output_dir]}/dosages/chromosome-22.dosage.txt
+        gzip {config[output_dir]}/dosages/chromosome-22.dosage.txt
+        # convert locations to GTEX v8 by hg38 position
        """

-# extract sample file for PrediXcan
-rule generate_samples_file:
+
+rule samples_file:
    input:
-        "container.sif",
-        vcf_gz_file = "output/imputed-genotypes/CENTER_TBI_imputed_genotype_chr1.vcf.gz"
+        expand("{out}/imputed-genotypes/chromosome-22.vcf", out=config["output_dir"])
    output:
-        "{output_dir}/dosages/samples.txt"
-    singularity:
-        "container.sif"
-    params:
-        format = "\'{ printf(\"%s %s\\n\", $1, $1); }\'"
+        expand("{out}/dosages/samples.txt", out=config["output_dir"])
    shell:
        """
-        export prefix={wildcards.output_dir}/dosages
-        mkdir -p $prefix
-        bcftools query -l {input.vcf_gz_file} >> $prefix/samples_.txt
-        # family ID = individual ID
-        awk {params.format} < $prefix/samples_.txt > $prefix/samples.txt
-        rm $prefix/samples_.txt
+        set -ex
+        mkdir -p {config[output_dir]}/dosages
+        # query sample ids from first chromosome
+        bcftools query -l {input} >> {config[output_dir]}/dosages/samples.txt
+        # predixcan allows family and sample id, duplicate column and overwrite
+        Rscript -e "library(dplyr); readr::read_tsv('output/dosages/samples.txt', col_names = 'SID', col_types = 'c') %>% dplyr::mutate(FID = SID) %>% readr::write_tsv(path = 'output/dosages/samples.txt', col_names = FALSE)"
        """

-# run PrediXcan to impute gene expression for individual tissue type
+
 rule impute_gene_expressions:
    input:
        "container.sif",

--- a/cluster.json
+++ b/cluster.json
--- a/config.yml
+++ b/config.yml
--- a/impute-gene-expression.Rproj
+++ b/impute-gene-expression.Rproj
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 4
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
--- a/container.def
+++ b/container.def
@@ -39,6 +39,7 @@ From: ubuntu:18.04
    chmod +x /usr/bin/predixcan
    pip install \
        argparse datetime numpy
+
    # download, extract and store (brain) weights
    mkdir /usr/predixcan
    wget https://s3.amazonaws.com/predictdb2/GTEx-V7_HapMap-2017-11-29.tar.gz -O /usr/predixcan/GTEx-V7_HapMap-2017-11-29.tar.gz
@@ -48,8 +49,9 @@ From: ubuntu:18.04
        tar -xvz -f GTEx-V7_HapMap-2017-11-29.tar.gz -C GTEx-V7_HapMap-2017-11-29 --wildcards "*_Brain_*"; \
        rm GTEx-V7_HapMap-2017-11-29.tar.gz \
        )
+
    # predixcan connects to the weights database with sql, needs write permission
-    # even if the file system will be read only for the cvontainer
+    # even if the file system will be read only for the container
    chmod -R 777 /usr/predixcan

    # install R and packages

--- a/scripts/map-hg38-dosage-to-gtex-variants.Rmd
+++ b/scripts/map-hg38-dosage-to-gtex-variants.Rmd
+---
+title:  "Mapping hg38 to GTEx variants"
+author: "Kevin Kunzmann"
+date:   "`r Sys.time()`"
+output: 
+    html_document:
+        code_folding: hide
+params: 
+    chromosome: 22
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(
+    echo     = TRUE
+)
+
+pander::panderOptions("table.split.table", 200)
+
+options(
+    tidyverse.quite = TRUE,
+    width           = 90
+)
+library(tidyverse, warn.conflicts = FALSE)
+library(glue, warn.conflicts = FALSE)
+
+set.seed(42)
+```
+
+```{r load-data}
+dosages_file <- glue(
+    "output/dosages/chromosome-{params$chromosome}.dosage.txt.gz"
+)
+
+out_file <- str_replace(
+    dosages_file, 
+    "(?<=-)([0-9]{1,2})(?=.dosage.txt)", 
+    "\\1-gtex-v8"
+)
+
+tbl_gtex_lookup <- vroom::vroom(
+        "GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz"
+    ) %>%
+    filter(
+        # to make sure we do not lose corner cases, 
+        # also load neighboring chromosomes
+        chr %in% glue("chr{(params$chromosome - 1):(params$chromosome + 1)}"),
+        nchar(ref) == 1,
+        nchar(alt) == 1
+    ) %>%
+    mutate(
+        variant_pos = as.integer(variant_pos)
+    ) %>% 
+    arrange(variant_pos)
+
+sample_ids <- read_delim(
+    "output/dosages/samples.txt", 
+    delim = "\t", 
+    col_types = "cc",
+    col_names = c("FID", "IID")
+)
+
+tbl_dosage <- vroom::vroom(
+        dosages_file, 
+        delim = " ", 
+        col_names = c(
+            "dummy_",
+            "chromosome",
+            "rsid",
+            "variant_pos",
+            "ref",
+            "alt",
+            "MAF",
+            sample_ids$IID
+        ),
+        col_types = cols(
+            .default = col_double(),
+            dummy_ = col_character(),
+            chromosome = col_integer(),
+            rsid = col_character(),
+            variant_pos = col_integer(),
+            ref = col_character(),
+            alt = col_character()
+        ),
+        na = "."
+    ) %>%
+    select(-dummy_) %>% 
+    arrange(variant_pos)
+```
+
+
+```{r}
+tbl_dosage <- left_join(
+        tbl_dosage,
+        tbl_gtex_lookup %>% transmute(
+            variant_pos, 
+            gtex_id  = variant_id,
+            ref_gtex = ref, 
+            alt_gtex = alt
+        ),
+        by = "variant_pos"
+    ) %>% 
+    select(
+        chromosome, variant_pos, gtex_id, rsid, 
+        ref, alt, ref_gtex, alt_gtex, 
+        MAF,
+        everything()
+    )
+```
+
+```{r}
+tbl_dosage %>% 
+    filter(
+        is.na(ref_gtex) | is.na(alt_gtex)
+    ) %>% 
+    {n_mis <<- nrow(.); .} %>% 
+    ggplot() +
+        aes(variant_pos) +
+        geom_histogram(
+            fill  = "black", 
+            color = NA,
+            bins  = ceiling(n_mis / 33)
+        ) + 
+        theme_bw()
+```
+
+```{r}
+tbl_dosage %>% 
+    filter(is.na(ref_gtex) | is.na(alt_gtex)) %>% 
+    ggplot() +
+        aes(variant_pos, y = 0) +
+        geom_point(
+            alpha = 0.2, 
+            shape = 16, 
+            position = position_jitter(width = 0, height = 1)
+        ) + 
+        theme_bw()
+```
+
+
+
+
+
+## Session info
+
+```{r, echo=FALSE}
+sessionInfo() %>% 
+    pander::pander()
+```