Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
GOSe-6mo-imputation-paper
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Kevin Kunzmann
GOSe-6mo-imputation-paper
Commits
6b448f2e
Commit
6b448f2e
authored
Mar 22, 2019
by
Kevin Kunzmann
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
revision of manuscript
parent
0badf884
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
28 deletions
+32
-28
manuscript/manuscript.Rmd
manuscript/manuscript.Rmd
+32
-28
No files found.
manuscript/manuscript.Rmd
View file @
6b448f2e
...
@@ -553,7 +553,7 @@ alternative measure of bias which does not require this tacit assumption.
...
@@ -553,7 +553,7 @@ alternative measure of bias which does not require this tacit assumption.
Note
that
the
scale
is
not
directly
comparable
to
the
one
of
the
Note
that
the
scale
is
not
directly
comparable
to
the
one
of
the
other
three
quantities
!
other
three
quantities
!
All
measures
are
considered
both
conditional
on
the
ground
-
truth
All
measures
are
considered
both
conditional
on
the
ground
-
truth
(
unobserved
true
GOSe
)
as
well
as
averaged
over
the
entire
test
set
.
(
unobserved
observed
GOSe
)
as
well
as
averaged
over
the
entire
test
set
.
LOCF
,
by
design
,
cannot
provide
imputed
values
when
there
are
no
LOCF
,
by
design
,
cannot
provide
imputed
values
when
there
are
no
...
@@ -671,7 +671,7 @@ baseline covariates.
...
@@ -671,7 +671,7 @@ baseline covariates.
We
first
consider
results
for
the
set
of
test
cases
which
allow
LOCF
imputation
We
first
consider
results
for
the
set
of
test
cases
which
allow
LOCF
imputation
(
n
=
`
r
df_predictions
%>%
filter
(
model
==
"LOCF"
)
%>%
nrow
-
length
(
idx
)`).
(
n
=
`
r
df_predictions
%>%
filter
(
model
==
"LOCF"
)
%>%
nrow
-
length
(
idx
)`).
Both
the
raw
count
as
well
as
the
relative
(
by
left
-
out
true
GOSe
)
confusion
matrices
Both
the
raw
count
as
well
as
the
relative
(
by
left
-
out
observed
GOSe
)
confusion
matrices
are
presented
in
Figure
???.
are
presented
in
Figure
???.
```{
r
confusion
-
matrix
-
locf
,
warning
=
FALSE
,
message
=
FALSE
,
echo
=
FALSE
,
fig
.
cap
=
"Confusion matrices on LOCF subset."
}
```{
r
confusion
-
matrix
-
locf
,
warning
=
FALSE
,
message
=
FALSE
,
echo
=
FALSE
,
fig
.
cap
=
"Confusion matrices on LOCF subset."
}
...
@@ -687,16 +687,16 @@ plot_confusion_matrices <- function(df_predictions, models) {
...
@@ -687,16 +687,16 @@ plot_confusion_matrices <- function(df_predictions, models) {
)
%>%
)
%>%
as
.
matrix
%>%
as_tibble
%>%
as
.
matrix
%>%
as_tibble
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
gather
(`
True
GOSE
`,
n
,
1
:
8
)
gather
(`
Observed
GOSE
`,
n
,
1
:
8
)
)
%>%
)
%>%
unnest
%>%
unnest
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
Observed
GOSE
`)
%>%
summarize
(
n
=
mean
(
n
))
%>%
summarize
(
n
=
mean
(
n
))
%>%
ungroup
%>%
ungroup
%>%
mutate
(
model
=
factor
(
model
,
models
))
mutate
(
model
=
factor
(
model
,
models
))
p_cnf_mtrx_raw
<-
df_average_confusion_matrices
%>%
p_cnf_mtrx_raw
<-
df_average_confusion_matrices
%>%
ggplot
(
aes
(`
True
GOSE
`,
`
Predicted
GOSE
`,
fill
=
n
))
+
ggplot
(
aes
(`
Observed
GOSE
`,
`
Predicted
GOSE
`,
fill
=
n
))
+
geom_raster
()
+
geom_raster
()
+
geom_text
(
aes
(
geom_text
(
aes
(
label
=
sprintf
(
"%.1f"
,
n
)
%>%
label
=
sprintf
(
"%.1f"
,
n
)
%>%
...
@@ -708,7 +708,7 @@ plot_confusion_matrices <- function(df_predictions, models) {
...
@@ -708,7 +708,7 @@ plot_confusion_matrices <- function(df_predictions, models) {
geom_vline
(
xintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
geom_vline
(
xintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
scale_fill_gradient
(
low
=
"white"
,
high
=
"#555555"
)
+
scale_fill_gradient
(
low
=
"white"
,
high
=
"#555555"
)
+
coord_fixed
(
expand
=
FALSE
)
+
coord_fixed
(
expand
=
FALSE
)
+
labs
(
x
=
"
true
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
labs
(
x
=
"
observed
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
theme_bw
()
+
theme_bw
()
+
theme
(
theme
(
panel
.
grid
=
element_blank
()
panel
.
grid
=
element_blank
()
...
@@ -717,18 +717,18 @@ plot_confusion_matrices <- function(df_predictions, models) {
...
@@ -717,18 +717,18 @@ plot_confusion_matrices <- function(df_predictions, models) {
ggtitle
(
"Average confusion matrix accross folds (absolute counts)"
)
ggtitle
(
"Average confusion matrix accross folds (absolute counts)"
)
p_cnf_mtrx_colnrm
<-
df_average_confusion_matrices
%>%
p_cnf_mtrx_colnrm
<-
df_average_confusion_matrices
%>%
group_by
(
model
,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Observed
GOSE
`)
%>%
mutate
(
mutate
(
`
fraction
(
column
)`
=
n
/
sum
(
n
),
`
fraction
(
column
)`
=
n
/
sum
(
n
),
`
fraction
(
column
)`
=
ifelse
(
is
.
nan
(`
fraction
(
column
)`),
0
,
`
fraction
(
column
)`)
`
fraction
(
column
)`
=
ifelse
(
is
.
nan
(`
fraction
(
column
)`),
0
,
`
fraction
(
column
)`)
)
%>%
)
%>%
ggplot
(
aes
(`
True
GOSE
`,
`
Predicted
GOSE
`,
fill
=
`
fraction
(
column
)`))
+
ggplot
(
aes
(`
Observed
GOSE
`,
`
Predicted
GOSE
`,
fill
=
`
fraction
(
column
)`))
+
geom_raster
()
+
geom_raster
()
+
geom_hline
(
yintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
geom_hline
(
yintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
geom_vline
(
xintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
geom_vline
(
xintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
scale_fill_gradient
(
""
,
low
=
"white"
,
high
=
"black"
,
limits
=
c
(
0
,
1
))
+
scale_fill_gradient
(
""
,
low
=
"white"
,
high
=
"black"
,
limits
=
c
(
0
,
1
))
+
coord_fixed
(
expand
=
FALSE
)
+
coord_fixed
(
expand
=
FALSE
)
+
labs
(
x
=
"
true
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
labs
(
x
=
"
observed
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
theme_bw
()
+
theme_bw
()
+
theme
(
theme
(
panel
.
grid
=
element_blank
()
panel
.
grid
=
element_blank
()
...
@@ -786,10 +786,10 @@ df_average_confusion_matrices <- df_predictions %>%
...
@@ -786,10 +786,10 @@ df_average_confusion_matrices <- df_predictions %>%
)
%>%
)
%>%
as
.
matrix
%>%
as_tibble
%>%
as
.
matrix
%>%
as_tibble
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
gather
(`
True
GOSE
`,
n
,
1
:
8
)
gather
(`
Observed
GOSE
`,
n
,
1
:
8
)
)
%>%
)
%>%
unnest
%>%
unnest
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
Observed
GOSE
`)
%>%
summarize
(
n
=
mean
(
n
))
%>%
summarize
(
n
=
mean
(
n
))
%>%
ungroup
%>%
ungroup
%>%
mutate
(
model
=
factor
(
model
,
models
))
mutate
(
model
=
factor
(
model
,
models
))
...
@@ -797,7 +797,7 @@ rbind(
...
@@ -797,7 +797,7 @@ rbind(
df_average_confusion_matrices
%>%
df_average_confusion_matrices
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
group_by
(
model
)
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<=
3
)
%>%
filter
(`
Observed
GOSE
`
<=
3
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
3
)
%>%
filter
(`
Predicted
GOSE
`
>
3
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
@@ -806,7 +806,7 @@ df_average_confusion_matrices %>%
...
@@ -806,7 +806,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
df_average_confusion_matrices
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
group_by
(
model
)
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
==
4
)
%>%
filter
(`
Observed
GOSE
`
==
4
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
4
)
%>%
filter
(`
Predicted
GOSE
`
>
4
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
@@ -815,7 +815,7 @@ df_average_confusion_matrices %>%
...
@@ -815,7 +815,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
df_average_confusion_matrices
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
group_by
(
model
)
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<
8
)
%>%
filter
(`
Observed
GOSE
`
<
8
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
==
8
)
%>%
filter
(`
Predicted
GOSE
`
==
8
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
@@ -895,13 +895,17 @@ ggsave(filename = "errors_stratified_locf.png", width = 9, height = 3)
...
@@ -895,13 +895,17 @@ ggsave(filename = "errors_stratified_locf.png", width = 9, height = 3)
Just
as
with
the
overall
performance
,
differences
are
most
pronounced
in
terms
Just
as
with
the
overall
performance
,
differences
are
most
pronounced
in
terms
of
bias
.
of
bias
.
Interestingly
,
the
conditional
perspective
reveals
differences
between
bias
as
difference
between
mean
imputed
and
mean
observed
values
(
tacitly
assuming
an
at
least
interval
scale
)
and
the
difference
in
the
probability
to
over
-
or
undershoot
the
observed
value
.
Again
,
the
category
imbalance
in
the
GOSe
distribution
explains
the
fact
that
Again
,
the
category
imbalance
in
the
GOSe
distribution
explains
the
fact
that
all
model
-
based
approaches
tend
to
perform
better
for
the
most
frequent
all
model
-
based
approaches
tend
to
perform
better
for
the
most
frequent
categories
6
,
7
,
and
8
while
sacrificing
performance
for
the
less
frequent
categories
6
,
7
,
and
8
while
sacrificing
performance
for
the
less
frequent
categories
4
and
5
as
compared
to
LOCF
.
categories
4
and
5
as
compared
to
LOCF
.
Bias
-
wise
all
methods
exhibit
a
certain
regression
to
the
mean
effect
since
low
Bias
-
wise
all
methods
exhibit
a
certain
regression
to
the
mean
effect
since
low
categories
tend
to
be
confused
with
better
(
higher
)
GOSe
on
average
while
categories
tend
to
be
confused
with
better
(
higher
)
GOSe
on
average
while
high
true
GOSe
values
are
dampened
(
negative
bias
at
7
,
8
).
high
observed
GOSe
values
are
dampened
(
negative
bias
at
7
,
8
).
Since
LOCF
does
not
take
the
category
imbalance
into
account
and
since
it
exhibits
Since
LOCF
does
not
take
the
category
imbalance
into
account
and
since
it
exhibits
a
relatively
large
negative
bias
at
the
most
frequent
GOSe
values
,
it
is
a
relatively
large
negative
bias
at
the
most
frequent
GOSe
values
,
it
is
overall
negatively
biased
.
overall
negatively
biased
.
...
@@ -909,14 +913,14 @@ Interestingly, the conditional assessment of the GP regressions bias profile
...
@@ -909,14 +913,14 @@ Interestingly, the conditional assessment of the GP regressions bias profile
reveals
that
the
overall
unbiasedness
can
be
explained
by
the
relatively
high
reveals
that
the
overall
unbiasedness
can
be
explained
by
the
relatively
high
positive
and
negative
biases
conditional
on
low
/
high
GOSe
values
canceling
out
positive
and
negative
biases
conditional
on
low
/
high
GOSe
values
canceling
out
in
the
overall
population
.
in
the
overall
population
.
Since
the
accuracy
results
mirror
this
effect
,
the
GP
regression
model
seems
The
MSM
and
MM
models
are
fairly
similar
with
respect
to
accuracy
but
MSM
to
suffer
from
an
overly
string
regression
to
the
mean
effect
.
clearly
dominates
with
respect
to
bias
.
Note
that
irrespective
of
the
exact
definition
of
bias
used
,
MSM
dominates
the
other
The
MSM
and
MM
models
are
fairly
similar
with
respect
to
accuracy
with
a
slight
model
-
based
approaches
.
advantage
for
MSM
.
Comparing
LOCF
and
MSM
,
there
is
a
slight
advantage
of
MSM
in
terms
of
accuracy
for
Interestingly
,
though
,
the
MSM
approach
is
consistently
less
positively
biased
the
majority
classes
3
,
7
,
8
which
explain
the
overall
difference
shown
in
Figure
???.
across
the
rarer
low
GOSe
categories
while
being
only
insignificantly
more
With
respect
to
bias
,
MSM
also
performs
better
than
LOCF
for
the
most
frequently
negatively
biased
for
categories
7
and
8
.
observed
categories
,
but
the
extent
of
this
improvement
depends
on
the
performance
measure
.
...
@@ -956,17 +960,17 @@ df_average_confusion_matrices <- df_predictions %>%
...
@@ -956,17 +960,17 @@ df_average_confusion_matrices <- df_predictions %>%
)
%>%
)
%>%
as
.
matrix
%>%
as_tibble
%>%
as
.
matrix
%>%
as_tibble
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
gather
(`
True
GOSE
`,
n
,
1
:
8
)
gather
(`
Observed
GOSE
`,
n
,
1
:
8
)
)
%>%
)
%>%
unnest
%>%
unnest
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
Observed
GOSE
`)
%>%
summarize
(
n
=
mean
(
n
))
%>%
summarize
(
n
=
mean
(
n
))
%>%
ungroup
%>%
ungroup
%>%
mutate
(
model
=
factor
(
model
,
models
))
mutate
(
model
=
factor
(
model
,
models
))
rbind
(
rbind
(
df_average_confusion_matrices
%>%
df_average_confusion_matrices
%>%
group_by
(
model
)
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<=
3
)
%>%
filter
(`
Observed
GOSE
`
<=
3
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
3
)
%>%
filter
(`
Predicted
GOSE
`
>
3
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
@@ -974,7 +978,7 @@ df_average_confusion_matrices %>%
...
@@ -974,7 +978,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
df_average_confusion_matrices
%>%
group_by
(
model
)
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
==
4
)
%>%
filter
(`
Observed
GOSE
`
==
4
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
4
)
%>%
filter
(`
Predicted
GOSE
`
>
4
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
@@ -982,7 +986,7 @@ df_average_confusion_matrices %>%
...
@@ -982,7 +986,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
df_average_confusion_matrices
%>%
group_by
(
model
)
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<
8
)
%>%
filter
(`
Observed
GOSE
`
<
8
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
==
8
)
%>%
filter
(`
Predicted
GOSE
`
==
8
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment