]> code.communitydata.science - ml_measurement_error_public.git/blob - simulations/plot_dv_example.R
update plotting code and makefile
[ml_measurement_error_public.git] / simulations / plot_dv_example.R
1 source("RemembR/R/RemembeR.R")
2 library(arrow)
3 library(data.table)
4 library(ggplot2)
5 library(filelock)
6 library(argparser)
7
8 parser <- arg_parser("Simulate data and fit corrected models.")
9 parser <- add_argument(parser, "--infile", default="", help="name of the file to read.")
10 parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
11 args <- parse_args(parser)
12
13 build_plot_dataset <- function(df){
14     x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
15     x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
16                              zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
17                              bias = Bxy - Bxy.est.naive,
18                              Bxy.est.naive = Bxy.est.naive,
19                              sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
20
21     x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
22                                mean.bias = mean(bias),
23                                mean.est = mean(Bxy.est.naive),
24                                var.est = var(Bxy.est.naive),
25                                N.sims = .N,
26                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
27                                variable='x',
28                                method='Naive'
29                                ),
30                             by=c('N','m')]
31     
32
33     g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
34     g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
35                              zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
36                              bias = Bgy - Bgy.est.naive,
37                              Bgy.est.naive = Bgy.est.naive,
38                              sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
39
40     g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
41                                mean.bias = mean(bias),
42                                mean.est = mean(Bgy.est.naive),
43                                var.est = var(Bgy.est.naive),
44                                N.sims = .N,
45                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
46                                variable='g',
47                                method='Naive'
48                                ),
49                             by=c('N','m')]
50     
51
52     x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
53     x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
54                                    zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
55                                    bias = Bxy - Bxy.est.feasible,
56                                    Bxy.est.feasible = Bxy.est.feasible,
57                                    sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
58
59     x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
60                                      mean.bias = mean(bias),
61                                      mean.est = mean(Bxy.est.feasible),
62                                      var.est = var(Bxy.est.feasible),
63                                      N.sims = .N,
64                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
65                                      variable='x',
66                                      method='Feasible'
67                                      ),
68                                   by=c('N','m')]
69     
70
71     g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
72     g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
73                                    zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
74                                    bias = Bgy - Bgy.est.feasible,
75                                    Bgy.est.feasible = Bgy.est.feasible,
76                                    sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
77
78     g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
79                                      mean.bias = mean(bias),
80                                      mean.est = mean(Bgy.est.feasible),
81                                      var.est = var(Bgy.est.feasible),
82                                      N.sims = .N,
83                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
84                                      variable='g',
85                                      method='Feasible'
86                                      ),
87                                   by=c('N','m')]
88     
89
90
91     x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
92
93     x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
94                                          zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
95                                          bias = Bxy.est.true - Bxy.est.amelia.full,
96                                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
97
98     x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
99                                            mean.bias = mean(bias),
100                                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
101                                            mean.est = mean(Bxy.est.amelia.full),
102                                            var.est = var(Bxy.est.amelia.full),
103                                            N.sims = .N,
104                                            variable='x',
105                                            method='Multiple imputation'
106                                            ),
107                                         by=c('N','m')]
108
109
110     g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
111     g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
112                                          zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
113                                          bias =  Bgy.est.amelia.full - Bgy.est.true,
114                                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
115
116     g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
117                                            mean.bias = mean(bias),
118                                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
119                                            mean.est = mean(Bgy.est.amelia.full),
120                                            var.est = var(Bgy.est.amelia.full),
121                                            N.sims = .N,
122                                            variable='g',
123                                            method='Multiple imputation'
124                                            ),
125                                         by=c('N','m')]
126
127     x.mle <- df[,.(N,m, Bxy.est.true, Bxy.est.mle, Bxy.ci.lower.mle, Bxy.ci.upper.mle)]
128
129     x.mle <- x.mle[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.mle) & (Bxy.est.true <= Bxy.ci.upper.mle),
130                                          zero.in.ci = (0 >= Bxy.ci.lower.mle) & (0 <= Bxy.ci.upper.mle),
131                                          bias =  Bxy.est.mle - Bxy.est.true,
132                                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mle))]
133
134     x.mle.plot <- x.mle[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
135                                    mean.bias = mean(bias),
136                                    p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
137                                    mean.est = mean(Bxy.est.mle),
138                                    var.est = var(Bxy.est.mle),
139                                    N.sims = .N,
140                                    variable='x',
141                                    method='Maximum Likelihood'
142                                    ),
143                                 by=c('N','m')]
144
145
146
147     g.mle <- df[,.(N,m, Bgy.est.true, Bgy.est.mle, Bgy.ci.lower.mle, Bgy.ci.upper.mle)]
148
149     g.mle <- g.mle[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mle) & (Bgy.est.true <= Bgy.ci.upper.mle),
150                                          zero.in.ci = (0 >= Bgy.ci.lower.mle) & (0 <= Bgy.ci.upper.mle),
151                                          bias =  Bgy.est.mle - Bgy.est.true,
152                                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mle))]
153
154     g.mle.plot <- g.mle[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
155                                    mean.bias = mean(bias),
156                                    p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
157                                    mean.est = mean(Bgy.est.mle),
158                                    var.est = var(Bgy.est.mle),
159                                    N.sims = .N,
160                                    variable='g',
161                                    method='Maximum Likelihood'
162                                    ),
163                                 by=c('N','m')]
164
165
166
167
168     x.pseudo <- df[,.(N,m, Bxy.est.true, Bxy.est.pseudo, Bxy.ci.lower.pseudo, Bxy.ci.upper.pseudo)]
169
170     x.pseudo <- x.pseudo[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.pseudo) & (Bxy.est.true <= Bxy.ci.upper.pseudo),
171                                          zero.in.ci = (0 >= Bxy.ci.lower.pseudo) & (0 <= Bxy.ci.upper.pseudo),
172                                          bias =  Bxy.est.pseudo - Bxy.est.true,
173                                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.pseudo))]
174
175     x.pseudo.plot <- x.pseudo[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
176                                    mean.bias = mean(bias),
177                                    p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
178                                    mean.est = mean(Bxy.est.pseudo),
179                                    var.est = var(Bxy.est.pseudo),
180                                    N.sims = .N,
181                                    variable='x',
182                                    method='Pseudo Likelihood'
183                                    ),
184                                 by=c('N','m')]
185
186
187
188     g.pseudo <- df[,.(N,m, Bgy.est.true, Bgy.est.pseudo, Bgy.ci.lower.pseudo, Bgy.ci.upper.pseudo)]
189
190     g.pseudo <- g.pseudo[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.pseudo) & (Bgy.est.true <= Bgy.ci.upper.pseudo),
191                                          zero.in.ci = (0 >= Bgy.ci.lower.pseudo) & (0 <= Bgy.ci.upper.pseudo),
192                                          bias =  Bgy.est.pseudo - Bgy.est.true,
193                                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.pseudo))]
194
195     g.pseudo.plot <- g.pseudo[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
196                                    mean.bias = mean(bias),
197                                    p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
198                                    mean.est = mean(Bgy.est.pseudo),
199                                    var.est = var(Bgy.est.pseudo),
200                                    N.sims = .N,
201                                    variable='g',
202                                    method='Pseudo Likelihood'
203                                    ),
204                                 by=c('N','m')]
205
206
207     
208     accuracy <- df[,mean(accuracy)]
209
210     plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.mle.plot, g.mle.plot, x.pseudo.plot, g.pseudo.plot, x.feasible.plot, g.feasible.plot),use.names=T)
211
212     plot.df[,accuracy := accuracy]
213
214     plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]
215
216     return(plot.df)
217 }
218
219
220 df <- read_feather(args$infile)
221 plot.df <- build_plot_dataset(df)
222 remember(plot.df,args$name)
223
224
225 ## df[gmm.ER_pval<0.05]
226
227
228
229
230
231 ## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
232
233 ## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
234
235 ## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 

Community Data Science Collective || Want to submit a patch?