Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
GOSe-6mo-imputation-paper
Project overview
Project overview
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Kevin Kunzmann
GOSe-6mo-imputation-paper
Commits
6b448f2e
Commit
6b448f2e
authored
Mar 22, 2019
by
Kevin Kunzmann
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
revision of manuscript
parent
0badf884
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
32 additions
and
28 deletions
+32
-28
manuscript/manuscript.Rmd
manuscript/manuscript.Rmd
+32
-28
No files found.
manuscript/manuscript.Rmd
View file @
6b448f2e
...
...
@@ -553,7 +553,7 @@ alternative measure of bias which does not require this tacit assumption.
Note
that
the
scale
is
not
directly
comparable
to
the
one
of
the
other
three
quantities
!
All
measures
are
considered
both
conditional
on
the
ground
-
truth
(
unobserved
true
GOSe
)
as
well
as
averaged
over
the
entire
test
set
.
(
unobserved
observed
GOSe
)
as
well
as
averaged
over
the
entire
test
set
.
LOCF
,
by
design
,
cannot
provide
imputed
values
when
there
are
no
...
...
@@ -671,7 +671,7 @@ baseline covariates.
We
first
consider
results
for
the
set
of
test
cases
which
allow
LOCF
imputation
(
n
=
`
r
df_predictions
%>%
filter
(
model
==
"LOCF"
)
%>%
nrow
-
length
(
idx
)`).
Both
the
raw
count
as
well
as
the
relative
(
by
left
-
out
true
GOSe
)
confusion
matrices
Both
the
raw
count
as
well
as
the
relative
(
by
left
-
out
observed
GOSe
)
confusion
matrices
are
presented
in
Figure
???.
```{
r
confusion
-
matrix
-
locf
,
warning
=
FALSE
,
message
=
FALSE
,
echo
=
FALSE
,
fig
.
cap
=
"Confusion matrices on LOCF subset."
}
...
...
@@ -687,16 +687,16 @@ plot_confusion_matrices <- function(df_predictions, models) {
)
%>%
as
.
matrix
%>%
as_tibble
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
gather
(`
True
GOSE
`,
n
,
1
:
8
)
gather
(`
Observed
GOSE
`,
n
,
1
:
8
)
)
%>%
unnest
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
Observed
GOSE
`)
%>%
summarize
(
n
=
mean
(
n
))
%>%
ungroup
%>%
mutate
(
model
=
factor
(
model
,
models
))
p_cnf_mtrx_raw
<-
df_average_confusion_matrices
%>%
ggplot
(
aes
(`
True
GOSE
`,
`
Predicted
GOSE
`,
fill
=
n
))
+
ggplot
(
aes
(`
Observed
GOSE
`,
`
Predicted
GOSE
`,
fill
=
n
))
+
geom_raster
()
+
geom_text
(
aes
(
label
=
sprintf
(
"%.1f"
,
n
)
%>%
...
...
@@ -708,7 +708,7 @@ plot_confusion_matrices <- function(df_predictions, models) {
geom_vline
(
xintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
scale_fill_gradient
(
low
=
"white"
,
high
=
"#555555"
)
+
coord_fixed
(
expand
=
FALSE
)
+
labs
(
x
=
"
true
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
labs
(
x
=
"
observed
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
theme_bw
()
+
theme
(
panel
.
grid
=
element_blank
()
...
...
@@ -717,18 +717,18 @@ plot_confusion_matrices <- function(df_predictions, models) {
ggtitle
(
"Average confusion matrix accross folds (absolute counts)"
)
p_cnf_mtrx_colnrm
<-
df_average_confusion_matrices
%>%
group_by
(
model
,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Observed
GOSE
`)
%>%
mutate
(
`
fraction
(
column
)`
=
n
/
sum
(
n
),
`
fraction
(
column
)`
=
ifelse
(
is
.
nan
(`
fraction
(
column
)`),
0
,
`
fraction
(
column
)`)
)
%>%
ggplot
(
aes
(`
True
GOSE
`,
`
Predicted
GOSE
`,
fill
=
`
fraction
(
column
)`))
+
ggplot
(
aes
(`
Observed
GOSE
`,
`
Predicted
GOSE
`,
fill
=
`
fraction
(
column
)`))
+
geom_raster
()
+
geom_hline
(
yintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
geom_vline
(
xintercept
=
c
(
2
,
4
,
6
)
+
.5
,
color
=
"black"
)
+
scale_fill_gradient
(
""
,
low
=
"white"
,
high
=
"black"
,
limits
=
c
(
0
,
1
))
+
coord_fixed
(
expand
=
FALSE
)
+
labs
(
x
=
"
true
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
labs
(
x
=
"
observed
GOSe"
,
y
=
"imputed GOSe"
,
fill
=
""
)
+
theme_bw
()
+
theme
(
panel
.
grid
=
element_blank
()
...
...
@@ -786,10 +786,10 @@ df_average_confusion_matrices <- df_predictions %>%
)
%>%
as
.
matrix
%>%
as_tibble
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
gather
(`
True
GOSE
`,
n
,
1
:
8
)
gather
(`
Observed
GOSE
`,
n
,
1
:
8
)
)
%>%
unnest
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
Observed
GOSE
`)
%>%
summarize
(
n
=
mean
(
n
))
%>%
ungroup
%>%
mutate
(
model
=
factor
(
model
,
models
))
...
...
@@ -797,7 +797,7 @@ rbind(
df_average_confusion_matrices
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<=
3
)
%>%
filter
(`
Observed
GOSE
`
<=
3
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
3
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
@@ -806,7 +806,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
==
4
)
%>%
filter
(`
Observed
GOSE
`
==
4
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
4
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
@@ -815,7 +815,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
filter
(
model
%
in
%
c
(
"LOCF"
,
"MM"
,
"GP + cov"
,
"MSM"
))
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<
8
)
%>%
filter
(`
Observed
GOSE
`
<
8
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
==
8
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
@@ -895,13 +895,17 @@ ggsave(filename = "errors_stratified_locf.png", width = 9, height = 3)
Just
as
with
the
overall
performance
,
differences
are
most
pronounced
in
terms
of
bias
.
Interestingly
,
the
conditional
perspective
reveals
differences
between
bias
as
difference
between
mean
imputed
and
mean
observed
values
(
tacitly
assuming
an
at
least
interval
scale
)
and
the
difference
in
the
probability
to
over
-
or
undershoot
the
observed
value
.
Again
,
the
category
imbalance
in
the
GOSe
distribution
explains
the
fact
that
all
model
-
based
approaches
tend
to
perform
better
for
the
most
frequent
categories
6
,
7
,
and
8
while
sacrificing
performance
for
the
less
frequent
categories
4
and
5
as
compared
to
LOCF
.
Bias
-
wise
all
methods
exhibit
a
certain
regression
to
the
mean
effect
since
low
categories
tend
to
be
confused
with
better
(
higher
)
GOSe
on
average
while
high
true
GOSe
values
are
dampened
(
negative
bias
at
7
,
8
).
high
observed
GOSe
values
are
dampened
(
negative
bias
at
7
,
8
).
Since
LOCF
does
not
take
the
category
imbalance
into
account
and
since
it
exhibits
a
relatively
large
negative
bias
at
the
most
frequent
GOSe
values
,
it
is
overall
negatively
biased
.
...
...
@@ -909,14 +913,14 @@ Interestingly, the conditional assessment of the GP regressions bias profile
reveals
that
the
overall
unbiasedness
can
be
explained
by
the
relatively
high
positive
and
negative
biases
conditional
on
low
/
high
GOSe
values
canceling
out
in
the
overall
population
.
Since
the
accuracy
results
mirror
this
effect
,
the
GP
regression
model
seems
to
suffer
from
an
overly
strong
regression
to
the
mean
effect
.
The
MSM
and
MM
models
are
fairly
similar
with
respect
to
accuracy
with
a
slight
advantage
for
MSM
.
Interestingly
,
though
,
the
MSM
approach
is
consistently
less
positively
biased
across
the
rarer
low
GOSe
categories
while
being
only
insignificantly
more
negatively
biased
for
categories
7
and
8
.
The
MSM
and
MM
models
are
fairly
similar
with
respect
to
accuracy
but
MSM
clearly
dominates
with
respect
to
bias
.
Note
that
irrespective
of
the
exact
definition
of
bias
used
,
MSM
dominates
the
other
model
-
based
approaches
.
Comparing
LOCF
and
MSM
,
there
is
a
slight
advantage
of
MSM
in
terms
of
accuracy
for
the
majority
classes
3
,
7
,
8
which
explain
the
overall
difference
shown
in
Figure
???.
With
respect
to
bias
,
MSM
also
performs
better
than
LOCF
for
the
most
frequently
observed
categories
,
but
the
extent
of
this
improvement
depends
on
the
performance
measure
.
...
...
@@ -956,17 +960,17 @@ df_average_confusion_matrices <- df_predictions %>%
)
%>%
as
.
matrix
%>%
as_tibble
%>%
mutate
(`
Predicted
GOSE
`
=
row_number
()
%>%
as
.
character
)
%>%
gather
(`
True
GOSE
`,
n
,
1
:
8
)
gather
(`
Observed
GOSE
`,
n
,
1
:
8
)
)
%>%
unnest
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
True
GOSE
`)
%>%
group_by
(
model
,
`
Predicted
GOSE
`,
`
Observed
GOSE
`)
%>%
summarize
(
n
=
mean
(
n
))
%>%
ungroup
%>%
mutate
(
model
=
factor
(
model
,
models
))
rbind
(
df_average_confusion_matrices
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<=
3
)
%>%
filter
(`
Observed
GOSE
`
<=
3
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
3
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
@@ -974,7 +978,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
==
4
)
%>%
filter
(`
Observed
GOSE
`
==
4
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
>
4
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
@@ -982,7 +986,7 @@ df_average_confusion_matrices %>%
df_average_confusion_matrices
%>%
group_by
(
model
)
%>%
filter
(`
True
GOSE
`
<
8
)
%>%
filter
(`
Observed
GOSE
`
<
8
)
%>%
mutate
(
n_total
=
sum
(
n
))
%>%
filter
(`
Predicted
GOSE
`
==
8
)
%>%
summarize
(
fraction
=
sum
(
n
/
n_total
))
%>%
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment