Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
impute-gene-expression
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Kevin Kunzmann
impute-gene-expression
Commits
9cd2f22b
Commit
9cd2f22b
authored
Mar 10, 2020
by
Kevin Kunzmann
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
...
parent
35acc798
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
138 additions
and
44 deletions
+138
-44
Snakefile
Snakefile
+10
-7
config/snakemake/mrc-bsu-cluster/config.yaml
config/snakemake/mrc-bsu-cluster/config.yaml
+1
-1
scripts/map-hg38-dosage-to-gtex-variants.Rmd
scripts/map-hg38-dosage-to-gtex-variants.Rmd
+127
-36
No files found.
Snakefile
View file @
9cd2f22b
...
@@ -44,16 +44,17 @@ rule data:
...
@@ -44,16 +44,17 @@ rule data:
rule dosage:
rule dosage:
input:
input:
"{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
vcf =
"{out}/imputed-genotypes/chromosome-{i}.vcf.gz",
"{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
lookup =
"{out}/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
output:
output:
"{out}/dosages/chromosome-{i}.dosage.txt.gz"
"{out}/dosages/chromosome-{i}-gtex-v8.dosage.txt.gz",
"{out}/reports/map-hg38-dosage-to-gtex-variants-chromosome-{i}.html"
shell:
shell:
"""
"""
set -ex
set -ex
mkdir -p {config[output_dir]}/dosages
mkdir -p {config[output_dir]}/dosages
# filter for SNPs of defined quality and extract dosage
# filter for SNPs of defined quality and extract dosage
pv {input} | \
pv {input
.vcf
} | \
bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
bcftools filter -e 'MAF[0]<{config[min_MAF]} | INFO<{config[min_INFO]} | TYPE!="snp" | N_ALT!=1' | \
bcftools +fill-tags | \
bcftools +fill-tags | \
bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
bcftools query -f \ '%CHROM %ID %POS %REF %ALT %INFO/MAF [%DS ]\n' > \
...
@@ -61,6 +62,9 @@ rule dosage:
...
@@ -61,6 +62,9 @@ rule dosage:
# compress
# compress
gzip {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt
gzip {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt
# convert locations to GTEx v8 by hg38 position
# convert locations to GTEx v8 by hg38 position
mkdir -p {wildcards.out}/reports
Rscript -e "rmarkdown::render('scripts/map-hg38-dosage-to-gtex-variants.Rmd', knit_root_dir = getwd(), output_dir = '{wildcards.out}/reports', output_file= 'map-hg38-dosage-to-gtex-variants-chromosome-{wildcards.i}', params = list(chromosome = {wildcards.i}))"
rm {config[output_dir]}/dosages/chromosome-{wildcards.i}.dosage.txt.gz
"""
"""
# compute dosage for all 23 chromosomes
# compute dosage for all 23 chromosomes
...
@@ -68,16 +72,15 @@ rule dosages:
...
@@ -68,16 +72,15 @@ rule dosages:
input:
input:
rules.data.output,
rules.data.output,
expand(
expand(
"{out}/
imputed-genotypes/chromosome-{i}.vcf
.gz",
"{out}/
dosages/chromosome-{i}-gtex-v8.dosage.txt
.gz",
out=config["output_dir"],
out=config["output_dir"],
i=range(1,24)
i=range(1,24)
)
)
rule samples_file:
rule samples_file:
input:
input:
expand("{out}/imputed-genotypes/chromosome-22.vcf", out=config["output_dir"])
expand("{out}/imputed-genotypes/chromosome-22.vcf
.gz
", out=config["output_dir"])
output:
output:
expand("{out}/dosages/samples.txt", out=config["output_dir"])
expand("{out}/dosages/samples.txt", out=config["output_dir"])
shell:
shell:
...
...
config/snakemake/mrc-bsu-cluster/config.yaml
View file @
9cd2f22b
jobs
:
2
0
jobs
:
2
5
use-singularity
:
True
use-singularity
:
True
cluster
:
"
sbatch
-A
MRC-BSU-SL2-CPU
-p
skylake
--ntasks
1
--cpus-per-task
1
--nodes
1
-t
02:00:00
--job-name
'{rule}__{wildcards}'
--output
'{rule}__{wildcards}.out'
--error
'{rule}__{wildcards}.err'"
cluster
:
"
sbatch
-A
MRC-BSU-SL2-CPU
-p
skylake
--ntasks
1
--cpus-per-task
1
--nodes
1
-t
02:00:00
--job-name
'{rule}__{wildcards}'
--output
'{rule}__{wildcards}.out'
--error
'{rule}__{wildcards}.err'"
singularity-args
:
"
-H
$PWD"
singularity-args
:
"
-H
$PWD"
...
...
scripts/map-hg38-dosage-to-gtex-variants.Rmd
View file @
9cd2f22b
...
@@ -26,7 +26,9 @@ library(glue, warn.conflicts = FALSE)
...
@@ -26,7 +26,9 @@ library(glue, warn.conflicts = FALSE)
set.seed(42)
set.seed(42)
```
```
```{r load-data}
## Chromosome `r params$chromosome`
```{r load-data, message=FALSE}
dosages_file <- glue(
dosages_file <- glue(
"output/dosages/chromosome-{params$chromosome}.dosage.txt.gz"
"output/dosages/chromosome-{params$chromosome}.dosage.txt.gz"
)
)
...
@@ -35,7 +37,7 @@ out_file <- str_replace(
...
@@ -35,7 +37,7 @@ out_file <- str_replace(
dosages_file,
dosages_file,
"(?<=-)([0-9]{1,2})(?=.dosage.txt)",
"(?<=-)([0-9]{1,2})(?=.dosage.txt)",
"\\1-gtex-v8"
"\\1-gtex-v8"
)
)
tbl_gtex_lookup <- vroom::vroom(
tbl_gtex_lookup <- vroom::vroom(
"output/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
"output/GTEx_v8_hg38p7_variant_lookup_table.txt.gz"
...
@@ -43,7 +45,7 @@ tbl_gtex_lookup <- vroom::vroom(
...
@@ -43,7 +45,7 @@ tbl_gtex_lookup <- vroom::vroom(
filter(
filter(
# to make sure we do not lose corner cases,
# to make sure we do not lose corner cases,
# also load neighboring chromosomes
# also load neighboring chromosomes
chr %in% glue("chr{
(params$chromosome - 1):(params$chromosome + 1)
}"),
chr %in% glue("chr{
params$chromosome
}"),
nchar(ref) == 1,
nchar(ref) == 1,
nchar(alt) == 1
nchar(alt) == 1
) %>%
) %>%
...
@@ -75,7 +77,7 @@ tbl_dosage <- vroom::vroom(
...
@@ -75,7 +77,7 @@ tbl_dosage <- vroom::vroom(
col_types = cols(
col_types = cols(
.default = col_double(),
.default = col_double(),
dummy_ = col_character(),
dummy_ = col_character(),
chromosome
= col_integ
er(),
chromosome
= col_charact
er(),
rsid = col_character(),
rsid = col_character(),
variant_pos = col_integer(),
variant_pos = col_integer(),
ref = col_character(),
ref = col_character(),
...
@@ -88,7 +90,7 @@ tbl_dosage <- vroom::vroom(
...
@@ -88,7 +90,7 @@ tbl_dosage <- vroom::vroom(
```
```
```{r}
```{r
match-by-position
}
tbl_dosage <- left_join(
tbl_dosage <- left_join(
tbl_dosage,
tbl_dosage,
tbl_gtex_lookup %>% transmute(
tbl_gtex_lookup %>% transmute(
...
@@ -107,37 +109,126 @@ tbl_dosage <- left_join(
...
@@ -107,37 +109,126 @@ tbl_dosage <- left_join(
)
)
```
```
```{r}
tbl_dosage %>%
### Unmatched variants
```{r plot-unmatched}
tbl_unmatched <- tbl_dosage %>%
filter(
filter(
is.na(
ref_gtex) | is.na(alt_gtex
)
is.na(
gtex_id
)
) %>%
) %>%
{n_mis <<- nrow(.); .} %>%
{n_mis <<- nrow(.); .}
ggplot() +
ggplot(tbl_unmatched) +
aes(variant_pos) +
aes(variant_pos) +
geom_histogram(
geom_histogram(
fill = "black",
fill = "black",
color = NA,
color = NA,
bins = ceiling(n_mis / 33)
bins = ceiling(n_mis / 33)
) +
) +
scale_x_continuous("position") +
ggtitle("Histogram of unmatched variants") +
theme_bw()
theme_bw()
```
```{r}
ggplot(tbl_unmatched) +
tbl_dosage %>%
filter(is.na(ref_gtex) | is.na(alt_gtex)) %>%
ggplot() +
aes(variant_pos, y = 0) +
aes(variant_pos, y = 0) +
geom_point(
geom_point(
alpha = 0.2,
alpha = 0.2,
shape = 16,
shape = 16,
position = position_jitter(width = 0, height = 1)
position = position_jitter(width = 0, height = 1)
) +
) +
scale_x_continuous("position") +
scale_y_continuous("") +
ggtitle("Jitter-plot of individual unmatched variants") +
theme_bw()
theme_bw()
```
```
Overall, `r nrow(tbl_unmatched)` variants
(`r sprintf("%.1f", 100*nrow(tbl_unmatched)/nrow(tbl_dosage))`%)
cannot be matched to GTEx v8 variants by position.
```{r match-by-bases}
# Flag, for every position-matched row, whether the imputed variant and the
# joined GTEx v8 candidate agree on both the reference and the alternative
# base. The grouping by (variant_pos, ref, alt) is kept on the result because
# the later chunks evaluate any(match) / sum(match) per group.
tbl_matched <- tbl_dosage %>%
  filter(
    !is.na(gtex_id)
  ) %>%
  group_by(variant_pos, ref, alt) %>%
  mutate(
    match = pmap_lgl(
      list(ref, alt, ref_gtex, alt_gtex, chromosome, gtex_id),
      function(ref, alt, ref_gtex, alt_gtex, chromosome, gtex_id) {
        # match by bases
        matched <- (ref == ref_gtex) & (alt == alt_gtex)
        inds <- which(matched)
        # NOTE(review): pmap_lgl() passes one element of each column per
        # call, so `matched` appears to be length one here and this
        # tie-break presumably never runs - confirm whether a per-group
        # computation was intended.
        if (length(inds) > 1) {
          # [0-9] instead of [1-9]: with [1-9] the prefixes "chr10" and
          # "chr20" are truncated to "chr1" / "chr2" and can never equal
          # the chromosome they belong to.
          matching_chromosome <- chromosome[inds] ==
            str_extract(gtex_id[inds], "^chr[0-9]{1,2}")
          if (sum(matching_chromosome) == 0) {
            warning("no matching chromosome found")
          }
          matched[inds] <- matching_chromosome
        }
        matched
      }
    )
  )
```
```{r mismatch}
# Collect the rows of every group in which not a single GTEx candidate agrees
# on both bases (no TRUE in `match`), reduced to the columns needed for
# inspection. Assumes tbl_matched is still grouped by (variant_pos, ref, alt).
tbl_mismatch <- select(
  filter(tbl_matched, all(!match)),
  chromosome, variant_pos, gtex_id, rsid, ref, alt, ref_gtex, alt_gtex, MAF
)
# Render the mismatches as a captioned table in the report.
pander::pander(tbl_mismatch, caption = "mismatched variants by bases")
```
Of the position-matched variants, `r nrow(tbl_mismatch)`
have inconsistent bases with GTEx v8.
Automatic check that reference base is consistent
(no issues with complementing strands etc.):
`r assertthat::assert_that(all(tbl_mismatch$ref == tbl_mismatch$ref_gtex))`
```{r check-non-unique}
# Collect the rows of every group that has more than one base-matching GTEx
# candidate; the report asserts afterwards that this table is empty.
# Assumes tbl_matched is still grouped by (variant_pos, ref, alt).
tbl_non_unique <- select(
  filter(tbl_matched, sum(match) > 1),
  chromosome, variant_pos, gtex_id, rsid, ref, alt,
  ref_gtex, alt_gtex, MAF
)
```
Automatic check that matching for variants is unique:
`r assertthat::assert_that(nrow(tbl_non_unique) == 0)`
## Matched variants
```{r filter-unique-matches}
# Keep only the rows flagged as base-matching, drop the helper columns that
# were needed just for the comparison, and remove the grouping so the result
# is a plain table.
tbl_unique_matched <- ungroup(
  select(
    filter(tbl_matched, match),
    -c(rsid, ref_gtex, alt_gtex)
  )
)
```
A total of
`r nrow(tbl_unique_matched)`
variants in chromosome
`r params$chromosome`
could be matched uniquely to a GTEx v8 variant
(`r sprintf("%.1f", 100*nrow(tbl_unique_matched)/nrow(tbl_dosage))`%).
```{r save}
# Write the uniquely matched variants to `out_file`, the path derived from the
# dosage file name earlier in this document.
write_delim(tbl_unique_matched, out_file)
```
## Session info
## Session info
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment