]> code.communitydata.science - ml_measurement_error_public.git/blob - simulations/plot_example.R
update plotting code and makefile
[ml_measurement_error_public.git] / simulations / plot_example.R
1 source("RemembR/R/RemembeR.R")
2 library(arrow)
3 library(data.table)
4 library(ggplot2)
5 library(filelock)
6 library(argparser)
7
8 parser <- arg_parser("Simulate data and fit corrected models.")
9 parser <- add_argument(parser, "--infile", default="", help="name of the file to read.")
10 parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
11 args <- parse_args(parser)
12
13 build_plot_dataset <- function(df){
14     x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
15     x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
16                              zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
17                              bias = Bxy - Bxy.est.naive,
18                              Bxy.est.naive = Bxy.est.naive,
19                              sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
20
21     x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
22                                mean.bias = mean(bias),
23                                mean.est = mean(Bxy.est.naive),
24                                var.est = var(Bxy.est.naive),
25                                N.sims = .N,
26                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
27                                variable='x',
28                                method='Naive'
29                                ),
30                             by=c('N','m')]
31     
32
33     g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
34     g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
35                              zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
36                              bias = Bgy - Bgy.est.naive,
37                              Bgy.est.naive = Bgy.est.naive,
38                              sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
39
40     g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
41                                mean.bias = mean(bias),
42                                mean.est = mean(Bgy.est.naive),
43                                var.est = var(Bgy.est.naive),
44                                N.sims = .N,
45                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
46                                variable='g',
47                                method='Naive'
48                                ),
49                             by=c('N','m')]
50     
51
52     x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
53     x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
54                                    zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
55                                    bias = Bxy - Bxy.est.feasible,
56                                    Bxy.est.feasible = Bxy.est.feasible,
57                                    sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
58
59     x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
60                                      mean.bias = mean(bias),
61                                      mean.est = mean(Bxy.est.feasible),
62                                      var.est = var(Bxy.est.feasible),
63                                      N.sims = .N,
64                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
65                                      variable='x',
66                                      method='Feasible'
67                                      ),
68                                   by=c('N','m')]
69     
70
71     g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
72     g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
73                                    zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
74                                    bias = Bgy - Bgy.est.feasible,
75                                    Bgy.est.feasible = Bgy.est.feasible,
76                                    sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
77
78     g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
79                                      mean.bias = mean(bias),
80                                      mean.est = mean(Bgy.est.feasible),
81                                      var.est = var(Bgy.est.feasible),
82                                      N.sims = .N,
83                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
84                                      variable='g',
85                                      method='Feasible'
86                                      ),
87                                   by=c('N','m')]
88     
89
90
91     x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
92
93     x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
94                                          zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
95                                          bias = Bxy.est.true - Bxy.est.amelia.full,
96                                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
97
98     x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
99                                            mean.bias = mean(bias),
100                                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
101                                            mean.est = mean(Bxy.est.amelia.full),
102                                            var.est = var(Bxy.est.amelia.full),
103                                            N.sims = .N,
104                                            variable='x',
105                                            method='Multiple imputation'
106                                            ),
107                                         by=c('N','m')]
108
109
110     g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
111     g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
112                                          zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
113                                          bias =  Bgy.est.amelia.full - Bgy.est.true,
114                                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
115
116     g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
117                                            mean.bias = mean(bias),
118                                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
119                                            mean.est = mean(Bgy.est.amelia.full),
120                                            var.est = var(Bgy.est.amelia.full),
121                                            N.sims = .N,
122                                            variable='g',
123                                            method='Multiple imputation'
124                                            ),
125                                         by=c('N','m')]
126
127     ## x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
128     ## x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
129     ##                                    zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
130     ##                                    bias =  Bxy.est.amelia.nok - Bxy.est.true,
131     ##                                    sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
132
133     ## x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
134     ##                                      mean.bias = mean(bias),
135     ##                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
136     ##                                      mean.est = mean(Bxy.est.amelia.nok),
137     ##                                      var.est = var(Bxy.est.amelia.nok),
138     ##                                      N.sims = .N,
139     ##                                      variable='x',
140     ##                                      method='Multiple imputation (Classifier features unobserved)'
141     ##                                      ),
142     ##                                   by=c('N','m')]
143
144     ## g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
145     ## g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
146     ##                                    zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
147     ##                                    bias =  Bgy.est.amelia.nok - Bgy.est.true,
148     ##                                    sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
149
150     ## g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
151     ##                                      mean.bias = mean(bias),
152     ##                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
153     ##                                      mean.est = mean(Bgy.est.amelia.nok),
154     ##                                      var.est = var(Bgy.est.amelia.nok),
155     ##                                      N.sims = .N,
156     ##                                      variable='g',
157     ##                                      method="Multiple imputation (Classifier features unobserved)"
158     ##                                      ),
159     ##                                   by=c('N','m')]
160
161
162     x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
163
164     x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
165                              zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
166                              bias =  Bxy.est.mecor - Bxy.est.true,
167                              sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
168
169     x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
170                                mean.bias = mean(bias),
171                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
172                                mean.est = mean(Bxy.est.mecor),
173                                var.est = var(Bxy.est.mecor),
174                                N.sims = .N,
175                                variable='x',
176                                method='Regression Calibration'
177                                ),
178                             by=c('N','m')]
179
180     g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
181
182     g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
183                              zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
184                              bias =  Bgy.est.mecor - Bgy.est.true,
185                              sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
186
187     g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
188                                mean.bias = mean(bias),
189                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
190                                mean.est = mean(Bgy.est.mecor),
191                                var.est = var(Bgy.est.mecor),
192                                N.sims = .N,
193                                variable='g',
194                                method='Regression Calibration'
195                                ),
196                             by=c('N','m')]
197
198     ## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
199
200     ## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
201     ##                                      zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
202     ##                                      bias =  Bgy.est.mecor - Bgy.est.true,
203     ##                                      sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
204
205     ## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
206     ##                                        mean.bias = mean(bias),
207     ##                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
208     ##                                      variable='g',
209     ##                                      method='Regression Calibration'
210     ##                                      ),
211     ##                                     by=c('N','m')]
212
213
214     x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
215     x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
216                          zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
217                          bias =  Bxy.est.gmm - Bxy.est.true,
218                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
219
220     x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
221                            mean.bias = mean(bias),
222                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
223                            mean.est = mean(Bxy.est.gmm),
224
225                            var.est = var(Bxy.est.gmm),
226                            N.sims = .N,
227                            variable='x',
228                            method='2SLS+gmm'
229                            ),
230                         by=c('N','m')]
231
232     g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
233     g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
234                          zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
235                          bias =  Bgy.est.gmm - Bgy.est.true,
236                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
237
238     g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
239                            mean.bias = mean(bias),
240                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
241                            mean.est = mean(Bgy.est.gmm),
242                            var.est = var(Bgy.est.gmm),
243                            N.sims = .N,
244                            variable='g',
245                            method='2SLS+gmm'
246                            ),
247                         by=c('N','m')]
248
249     accuracy <- df[,mean(accuracy)]
250
251     plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot, x.feasible.plot, g.feasible.plot),use.names=T)
252
253     plot.df[,accuracy := accuracy]
254
255     plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]
256
257     return(plot.df)
258 }
259
260
261 df <- read_feather(args$infile)
262 plot.df <- build_plot_dataset(df)
263 remember(plot.df,args$name)
264
265
266 ## df[gmm.ER_pval<0.05]
267
268
269
270
271
272 ## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
273
274 ## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
275
276 ## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 

Community Data Science Collective || Want to submit a patch?