Commit 9cd2f22b authored by Kevin Kunzmann's avatar Kevin Kunzmann

...

parent 35acc798
...@@ -44,16 +44,17 @@ rule data: ...@@ -44,16 +44,17 @@ rule data:
rule dosage: rule dosage:
input: input:
"{out}/imputed-genotypes/chromosome-{i}.vcf.gz", vcf = "{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
"{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz" lookup = "{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
output: output:
"{out}/dosages/chromosome-{i}.dosage.txt.gz" "{out}/dosages/chromosome-{i}-gtex-v8.dosage.txt.gz",
"{out}/reports/map-hg38-dosage-to-gtex-variants-chromosome-{i}.html"
shell: shell:
""" """
set -ex set -ex
mkdir -p {config[output_dir]}/dosages mkdir -p {config[output_dir]}/dosages
# filter for SNPs of defined quality and extract dosage # filter for SNPs of defined quality and extract dosage
pv {input} | \ pv {input.vcf} | \
bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \ bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
bcftools +fill-tags | \ bcftools +fill-tags | \
bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \ bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
...@@ -61,6 +62,9 @@ rule dosage: ...@@ -61,6 +62,9 @@ rule dosage:
# compress # compress
gzip {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt gzip {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt
# convert locations to GTEx v8 by hg38 position # convert locations to GTEx v8 by hg38 position
mkdir -p {wildcards.out}/reports
Rscript -e "rmarkdown::render('scripts/map-hg38-dosage-to-gtex-variants.Rmd', knit_root_dir = getwd(), output_dir = '{wildcards.out}/reports', output_file= 'map-hg38-dosage-to-gtex-variants-chromosome-{wildcards.i}', params = list(chromosome = {wildcards.i}))"
rm {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt.gz
""" """
# compute dosage for all 23 chromosomes # compute dosage for all 23 chromosomes
...@@ -68,16 +72,15 @@ rule dosages: ...@@ -68,16 +72,15 @@ rule dosages:
input: input:
rules.data.output, rules.data.output,
expand( expand(
"{out}/imputed-genotypes/chromosome-{i}.vcf.gz", "{out}/dosages/chromosome-{i}-gtex-v8.dosage.txt.gz",
out=config["output_dir"], out=config["output_dir"],
i=range(1,24) i=range(1,24)
) )
rule samples_file: rule samples_file:
input: input:
expand("{out}/imputed-genotypes/chromosome-22.vcf", out=config["output_dir"]) expand("{out}/imputed-genotypes/chromosome-22.vcf.gz", out=config["output_dir"])
output: output:
expand("{out}/dosages/samples.txt", out=config["output_dir"]) expand("{out}/dosages/samples.txt", out=config["output_dir"])
shell: shell:
......
jobs: 20 jobs: 25
use-singularity: True use-singularity: True
cluster: "sbatch -A MRC-BSU-SL2-CPU -p skylake --ntasks 1 --cpus-per-task 1 --nodes 1 -t 02:00:00 --job-name '{rule}__{wildcards}' --output '{rule}__{wildcards}.out' --error '{rule}__{wildcards}.err'" cluster: "sbatch -A MRC-BSU-SL2-CPU -p skylake --ntasks 1 --cpus-per-task 1 --nodes 1 -t 02:00:00 --job-name '{rule}__{wildcards}' --output '{rule}__{wildcards}.out' --error '{rule}__{wildcards}.err'"
singularity-args: "-H $PWD" singularity-args: "-H $PWD"
......
...@@ -26,16 +26,18 @@ library(glue, warn.conflicts = FALSE) ...@@ -26,16 +26,18 @@ library(glue, warn.conflicts = FALSE)
set.seed(42) set.seed(42)
``` ```
```{r load-data} ## Chromosome `r params$chromosome`
```{r load-data, message=FALSE}
dosages_file <- glue( dosages_file <- glue(
"output/dosages/chromosome-{params$chromosome}.dosage.txt.gz" "output/dosages/chromosome-{params$chromosome}.dosage.txt.gz"
) )
out_file <- str_replace( out_file <- str_replace(
dosages_file, dosages_file,
"(?<=-)([0-9]{1,2})(?=.dosage.txt)", "(?<=-)([0-9]{1,2})(?=.dosage.txt)",
"\\1-gtex-v8" "\\1-gtex-v8"
) )
tbl_gtex_lookup <- vroom::vroom( tbl_gtex_lookup <- vroom::vroom(
"output/GTEx_v8_hg38p7_variant_lookup_table.txt.gz" "output/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
...@@ -43,7 +45,7 @@ tbl_gtex_lookup <- vroom::vroom( ...@@ -43,7 +45,7 @@ tbl_gtex_lookup <- vroom::vroom(
filter( filter(
# to make sure we do not lose corner cases, # to make sure we do not lose corner cases,
# also load neighboring chromosomes # also load neighboring chromosomes
chr %in% glue("chr{(params$chromosome - 1):(params$chromosome + 1)}"), chr %in% glue("chr{params$chromosome}"),
nchar(ref) == 1, nchar(ref) == 1,
nchar(alt) == 1 nchar(alt) == 1
) %>% ) %>%
...@@ -73,13 +75,13 @@ tbl_dosage <- vroom::vroom( ...@@ -73,13 +75,13 @@ tbl_dosage <- vroom::vroom(
sample_ids$IID sample_ids$IID
), ),
col_types = cols( col_types = cols(
.default = col_double(), .default = col_double(),
dummy_ = col_character(), dummy_ = col_character(),
chromosome = col_integer(), chromosome = col_character(),
rsid = col_character(), rsid = col_character(),
variant_pos = col_integer(), variant_pos = col_integer(),
ref = col_character(), ref = col_character(),
alt = col_character() alt = col_character()
), ),
na = "." na = "."
) %>% ) %>%
...@@ -88,7 +90,7 @@ tbl_dosage <- vroom::vroom( ...@@ -88,7 +90,7 @@ tbl_dosage <- vroom::vroom(
``` ```
```{r} ```{r match-by-position}
tbl_dosage <- left_join( tbl_dosage <- left_join(
tbl_dosage, tbl_dosage,
tbl_gtex_lookup %>% transmute( tbl_gtex_lookup %>% transmute(
...@@ -107,37 +109,126 @@ tbl_dosage <- left_join( ...@@ -107,37 +109,126 @@ tbl_dosage <- left_join(
) )
``` ```
```{r}
tbl_dosage %>% ### Unmatched variants
```{r plot-unmatched}
tbl_unmatched <- tbl_dosage %>%
filter(
is.na(gtex_id)
) %>%
{n_mis <<- nrow(.); .}
ggplot(tbl_unmatched) +
aes(variant_pos) +
geom_histogram(
fill = "black",
color = NA,
bins = ceiling(n_mis / 33)
) +
scale_x_continuous("position") +
ggtitle("Histogram of unmatched variants") +
theme_bw()
ggplot(tbl_unmatched) +
aes(variant_pos, y = 0) +
geom_point(
alpha = 0.2,
shape = 16,
position = position_jitter(width = 0, height = 1)
) +
scale_x_continuous("position") +
scale_y_continuous("") +
ggtitle("Jitter-plot of individual unmatched variants") +
theme_bw()
```
Overall, `r nrow(tbl_unmatched)` variants
(`r sprintf("%.1f", 100*nrow(tbl_unmatched)/nrow(tbl_dosage))`%)
cannot be matched to GTEx v8 variants by position.
```{r match-by-bases}
tbl_matched <- tbl_dosage %>%
filter(
!is.na(gtex_id)
) %>%
group_by(variant_pos, ref, alt) %>%
mutate(
match = pmap_lgl(
list(ref, alt, ref_gtex, alt_gtex, chromosome, gtex_id),
function(ref, alt, ref_gtex, alt_gtex, chromosome, gtex_id) {
# match by bases
matched <- (ref == ref_gtex) & (alt == alt_gtex)
inds <- which(matched)
if (length(inds) > 1) {
matching_chromosome <- chromosome[inds] == str_extract(gtex_id[inds], "^chr[1-9]{1,2}")
if (sum(matching_chromosome) == 0)
warning("no matching chromosome found")
matched[inds] <- matching_chromosome
}
return(matched)
}
)
)
```
```{r mismatch}
tbl_mismatch <- tbl_matched %>%
filter( filter(
is.na(ref_gtex) | is.na(alt_gtex) !any(match)
) %>% ) %>%
{n_mis <<- nrow(.); .} %>% select(
ggplot() + chromosome, variant_pos, gtex_id, rsid, ref, alt, ref_gtex, alt_gtex, MAF
aes(variant_pos) + )
geom_histogram(
fill = "black", pander::pander(tbl_mismatch, caption = "mismatched variants by bases")
color = NA,
bins = ceiling(n_mis / 33)
) +
theme_bw()
``` ```
```{r} Of the position-matched variants `r nrow(tbl_mismatch)`
tbl_dosage %>% have inconsitent bases with GTex v8.
filter(is.na(ref_gtex) | is.na(alt_gtex)) %>%
ggplot() + Automatic check that reference base is consistent
aes(variant_pos, y = 0) + (no issues with complementing strands etc.):
geom_point( `r assertthat::assert_that(all(tbl_mismatch$ref == tbl_mismatch$ref_gtex))`
alpha = 0.2,
shape = 16, ```{r check-non-unique}
position = position_jitter(width = 0, height = 1) tbl_non_unique <- tbl_matched %>%
) + filter(
theme_bw() sum(match) > 1
) %>%
select(
chromosome, variant_pos, gtex_id, rsid, ref, alt,
ref_gtex, alt_gtex, MAF
)
``` ```
Automatic check that matching for variants is unique:
`r assertthat::assert_that(nrow(tbl_non_unique) == 0)`
## Matched variants
```{r filter-unique-matches}
tbl_unique_matched <- tbl_matched %>%
filter(
match
) %>%
select(
-rsid, -ref_gtex, -alt_gtex
) %>%
ungroup()
```
A total of
`r nrow(tbl_unique_matched)`
variants in chromosome
`r params$chromosome`
could be matched uniquely to a GTEx v8 variant
(`r sprintf("%.1f", 100*nrow(tbl_unique_matched)/nrow(tbl_dosage))`%).
```{r save}
write_delim(tbl_unique_matched, out_file)
```
## Session info ## Session info
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment