Commit 9cd2f22b authored by Kevin Kunzmann's avatar Kevin Kunzmann

...

parent 35acc798
......@@ -44,16 +44,17 @@ rule data:
rule dosage:
input:
"{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
"{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
vcf = "{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
lookup = "{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
output:
"{out}/dosages/chromosome-{i}.dosage.txt.gz"
"{out}/dosages/chromosome-{i}-gtex-v8.dosage.txt.gz",
"{out}/reports/map-hg38-dosage-to-gtex-variants-chromosome-{i}.html"
shell:
"""
set -ex
mkdir -p {config[output_dir]}/dosages
# filter for SNPs of defined quality and extract dosage
pv {input} | \
pv {input.vcf} | \
bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
bcftools +fill-tags | \
bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
......@@ -61,6 +62,9 @@ rule dosage:
# compress
gzip {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt
# convert locations to GTEx v8 by hg38 position
mkdir -p {wildcards.out}/reports
Rscript -e "rmarkdown::render('scripts/map-hg38-dosage-to-gtex-variants.Rmd', knit_root_dir = getwd(), output_dir = '{wildcards.out}/reports', output_file= 'map-hg38-dosage-to-gtex-variants-chromosome-{wildcards.i}', params = list(chromosome = {wildcards.i}))"
rm {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt.gz
"""
# compute dosage for all 23 chromosomes
......@@ -68,16 +72,15 @@ rule dosages:
input:
rules.data.output,
expand(
"{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
"{out}/dosages/chromosome-{i}-gtex-v8.dosage.txt.gz",
out=config["output_dir"],
i=range(1,24)
)
rule samples_file:
input:
expand("{out}/imputed-genotypes/chromosome-22.vcf", out=config["output_dir"])
expand("{out}/imputed-genotypes/chromosome-22.vcf.gz", out=config["output_dir"])
output:
expand("{out}/dosages/samples.txt", out=config["output_dir"])
shell:
......
jobs: 20
jobs: 25
use-singularity: True
cluster: "sbatch -A MRC-BSU-SL2-CPU -p skylake --ntasks 1 --cpus-per-task 1 --nodes 1 -t 02:00:00 --job-name '{rule}__{wildcards}' --output '{rule}__{wildcards}.out' --error '{rule}__{wildcards}.err'"
singularity-args: "-H $PWD"
......
......@@ -26,16 +26,18 @@ library(glue, warn.conflicts = FALSE)
set.seed(42)
```
```{r load-data}
## Chromosome `r params$chromosome`
```{r load-data, message=FALSE}
dosages_file <- glue(
"output/dosages/chromosome-{params$chromosome}.dosage.txt.gz"
)
out_file <- str_replace(
dosages_file,
"(?<=-)([0-9]{1,2})(?=.dosage.txt)",
"\\1-gtex-v8"
)
dosages_file,
"(?<=-)([0-9]{1,2})(?=.dosage.txt)",
"\\1-gtex-v8"
)
tbl_gtex_lookup <- vroom::vroom(
"output/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
......@@ -43,7 +45,7 @@ tbl_gtex_lookup <- vroom::vroom(
filter(
# to make sure we do not lose corner cases,
# also load neighboring chromosomes
chr %in% glue("chr{(params$chromosome - 1):(params$chromosome + 1)}"),
chr %in% glue("chr{params$chromosome}"),
nchar(ref) == 1,
nchar(alt) == 1
) %>%
......@@ -73,13 +75,13 @@ tbl_dosage <- vroom::vroom(
sample_ids$IID
),
col_types = cols(
.default = col_double(),
dummy_ = col_character(),
chromosome = col_integer(),
rsid = col_character(),
.default = col_double(),
dummy_ = col_character(),
chromosome = col_character(),
rsid = col_character(),
variant_pos = col_integer(),
ref = col_character(),
alt = col_character()
ref = col_character(),
alt = col_character()
),
na = "."
) %>%
......@@ -88,7 +90,7 @@ tbl_dosage <- vroom::vroom(
```
```{r}
```{r match-by-position}
tbl_dosage <- left_join(
tbl_dosage,
tbl_gtex_lookup %>% transmute(
......@@ -107,37 +109,126 @@ tbl_dosage <- left_join(
)
```
```{r}
tbl_dosage %>%
### Unmatched variants
```{r plot-unmatched}
tbl_unmatched <- tbl_dosage %>%
filter(
is.na(gtex_id)
) %>%
{n_mis <<- nrow(.); .}
ggplot(tbl_unmatched) +
aes(variant_pos) +
geom_histogram(
fill = "black",
color = NA,
bins = ceiling(n_mis / 33)
) +
scale_x_continuous("position") +
ggtitle("Histogram of unmatched variants") +
theme_bw()
ggplot(tbl_unmatched) +
aes(variant_pos, y = 0) +
geom_point(
alpha = 0.2,
shape = 16,
position = position_jitter(width = 0, height = 1)
) +
scale_x_continuous("position") +
scale_y_continuous("") +
ggtitle("Jitter-plot of individual unmatched variants") +
theme_bw()
```
Overall, `r nrow(tbl_unmatched)` variants
(`r sprintf("%.1f", 100*nrow(tbl_unmatched)/nrow(tbl_dosage))`%)
cannot be matched to GTEx v8 variants by position.
```{r match-by-bases}
tbl_matched <- tbl_dosage %>%
filter(
!is.na(gtex_id)
) %>%
group_by(variant_pos, ref, alt) %>%
mutate(
match = pmap_lgl(
list(ref, alt, ref_gtex, alt_gtex, chromosome, gtex_id),
function(ref, alt, ref_gtex, alt_gtex, chromosome, gtex_id) {
# match by bases
matched <- (ref == ref_gtex) & (alt == alt_gtex)
inds <- which(matched)
if (length(inds) > 1) {
matching_chromosome <- chromosome[inds] == str_extract(gtex_id[inds], "^chr[1-9]{1,2}")
if (sum(matching_chromosome) == 0)
warning("no matching chromosome found")
matched[inds] <- matching_chromosome
}
return(matched)
}
)
)
```
```{r mismatch}
tbl_mismatch <- tbl_matched %>%
filter(
is.na(ref_gtex) | is.na(alt_gtex)
!any(match)
) %>%
{n_mis <<- nrow(.); .} %>%
ggplot() +
aes(variant_pos) +
geom_histogram(
fill = "black",
color = NA,
bins = ceiling(n_mis / 33)
) +
theme_bw()
select(
chromosome, variant_pos, gtex_id, rsid, ref, alt, ref_gtex, alt_gtex, MAF
)
pander::pander(tbl_mismatch, caption = "mismatched variants by bases")
```
```{r}
tbl_dosage %>%
filter(is.na(ref_gtex) | is.na(alt_gtex)) %>%
ggplot() +
aes(variant_pos, y = 0) +
geom_point(
alpha = 0.2,
shape = 16,
position = position_jitter(width = 0, height = 1)
) +
theme_bw()
Of the position-matched variants `r nrow(tbl_mismatch)`
have inconsitent bases with GTex v8.
Automatic check that reference base is consistent
(no issues with complementing strands etc.):
`r assertthat::assert_that(all(tbl_mismatch$ref == tbl_mismatch$ref_gtex))`
```{r check-non-unique}
tbl_non_unique <- tbl_matched %>%
filter(
sum(match) > 1
) %>%
select(
chromosome, variant_pos, gtex_id, rsid, ref, alt,
ref_gtex, alt_gtex, MAF
)
```
Automatic check that matching for variants is unique:
`r assertthat::assert_that(nrow(tbl_non_unique) == 0)`
## Matched variants
```{r filter-unique-matches}
tbl_unique_matched <- tbl_matched %>%
filter(
match
) %>%
select(
-rsid, -ref_gtex, -alt_gtex
) %>%
ungroup()
```
A total of
`r nrow(tbl_unique_matched)`
variants in chromosome
`r params$chromosome`
could be matched uniquely to a GTEx v8 variant
(`r sprintf("%.1f", 100*nrow(tbl_unique_matched)/nrow(tbl_dosage))`%).
```{r save}
write_delim(tbl_unique_matched, out_file)
```
## Session info
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment