]> code.communitydata.science - ml_measurement_error_public.git/blob - simulations/plot_example_2_B.R
make first simulation with precise accuracies and explained variances
[ml_measurement_error_public.git] / simulations / plot_example_2_B.R
1 source("RemembR/R/RemembeR.R")
2 library(arrow)
3 library(data.table)
4 library(ggplot2)
5 library(filelock)
6 library(argparse)
7
8 l <- filelock::lock("example_2_B.feather_lock",exclusive=FALSE)
9 df <- data.table(read_feather("example_2_B.feather"))
10 filelock::unlock(l)
11
12 parser <- arg_parser("Simulate data and fit corrected models.")
13 parser <- add_argument(parser, "--infile", default="", help="name of the file to read.")
14 parser <- add_argument(parser, "--name", default="", help="The name to safe the data to in the remember file.")
15 args <- parse_args(parser)
16
17 build_plot_dataset <- function(df){
18     x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
19     x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
20                              zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
21                              bias = Bxy - Bxy.est.naive,
22                              Bxy.est.naive = Bxy.est.naive,
23                              sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
24
25     x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
26                                mean.bias = mean(bias),
27                                mean.est = mean(Bxy.est.naive),
28                                var.est = var(Bxy.est.naive),
29                                N.sims = .N,
30                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
31                                variable='x',
32                                method='Naive'
33                                ),
34                             by=c('N','m')]
35     
36
37     g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
38     g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
39                              zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
40                              bias = Bgy - Bgy.est.naive,
41                              Bgy.est.naive = Bgy.est.naive,
42                              sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
43
44     g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
45                                mean.bias = mean(bias),
46                                mean.est = mean(Bgy.est.naive),
47                                var.est = var(Bgy.est.naive),
48                                N.sims = .N,
49                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
50                                variable='g',
51                                method='Naive'
52                                ),
53                             by=c('N','m')]
54     
55
56     x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
57     x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
58                                    zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
59                                    bias = Bxy - Bxy.est.feasible,
60                                    Bxy.est.feasible = Bxy.est.feasible,
61                                    sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
62
63     x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
64                                      mean.bias = mean(bias),
65                                      mean.est = mean(Bxy.est.feasible),
66                                      var.est = var(Bxy.est.feasible),
67                                      N.sims = .N,
68                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
69                                      variable='x',
70                                      method='Feasible'
71                                      ),
72                                   by=c('N','m')]
73     
74
75     g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
76     g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
77                                    zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
78                                    bias = Bgy - Bgy.est.feasible,
79                                    Bgy.est.feasible = Bgy.est.feasible,
80                                    sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
81
82     g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
83                                      mean.bias = mean(bias),
84                                      mean.est = mean(Bgy.est.feasible),
85                                      var.est = var(Bgy.est.feasible),
86                                      N.sims = .N,
87                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
88                                      variable='g',
89                                      method='Feasible'
90                                      ),
91                                   by=c('N','m')]
92     
93
94
95     x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
96
97     x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
98                                          zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
99                                          bias = Bxy.est.true - Bxy.est.amelia.full,
100                                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
101
102     x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
103                                            mean.bias = mean(bias),
104                                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
105                                            mean.est = mean(Bxy.est.amelia.full),
106                                            var.est = var(Bxy.est.amelia.full),
107                                            N.sims = .N,
108                                            variable='x',
109                                            method='Multiple imputation'
110                                            ),
111                                         by=c('N','m')]
112
113
114     g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
115     g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
116                                          zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
117                                          bias =  Bgy.est.amelia.full - Bgy.est.true,
118                                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
119
120     g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
121                                            mean.bias = mean(bias),
122                                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
123                                            mean.est = mean(Bgy.est.amelia.full),
124                                            var.est = var(Bgy.est.amelia.full),
125                                            N.sims = .N,
126                                            variable='g',
127                                            method='Multiple imputation'
128                                            ),
129                                         by=c('N','m')]
130
131     x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
132     x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
133                                        zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
134                                        bias =  Bxy.est.amelia.nok - Bxy.est.true,
135                                        sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
136
137     x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
138                                          mean.bias = mean(bias),
139                                          p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
140                                          mean.est = mean(Bxy.est.amelia.nok),
141                                          var.est = var(Bxy.est.amelia.nok),
142                                          N.sims = .N,
143                                          variable='x',
144                                          method='Multiple imputation (Classifier features unobserved)'
145                                          ),
146                                       by=c('N','m')]
147
148     g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
149     g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
150                                        zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
151                                        bias =  Bgy.est.amelia.nok - Bgy.est.true,
152                                        sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
153
154     g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
155                                          mean.bias = mean(bias),
156                                          p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
157                                          mean.est = mean(Bgy.est.amelia.nok),
158                                          var.est = var(Bgy.est.amelia.nok),
159                                          N.sims = .N,
160                                          variable='g',
161                                          method="Multiple imputation (Classifier features unobserved)"
162                                          ),
163                                       by=c('N','m')]
164
165
166     x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)]
167
168     x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
169                              zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
170                              bias =  Bxy.est.mecor - Bxy.est.true,
171                              sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
172
173     x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
174                                mean.bias = mean(bias),
175                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
176                                mean.est = mean(Bxy.est.mecor),
177                                var.est = var(Bxy.est.mecor),
178                                N.sims = .N,
179                                variable='x',
180                                method='Regression Calibration'
181                                ),
182                             by=c('N','m')]
183
184     g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)]
185
186     g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
187                              zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
188                              bias =  Bgy.est.mecor - Bgy.est.true,
189                              sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
190
191     g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
192                                mean.bias = mean(bias),
193                                p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
194                                mean.est = mean(Bgy.est.mecor),
195                                var.est = var(Bgy.est.mecor),
196                                N.sims = .N,
197                                variable='g',
198                                method='Regression Calibration'
199                                ),
200                             by=c('N','m')]
201
202     ## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
203
204     ## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
205     ##                                      zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
206     ##                                      bias =  Bgy.est.mecor - Bgy.est.true,
207     ##                                      sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
208
209     ## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
210     ##                                        mean.bias = mean(bias),
211     ##                                      p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
212     ##                                      variable='g',
213     ##                                      method='Regression Calibration'
214     ##                                      ),
215     ##                                     by=c('N','m')]
216
217
218     x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
219     x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
220                          zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
221                          bias =  Bxy.est.gmm - Bxy.est.true,
222                          sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
223
224     x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
225                            mean.bias = mean(bias),
226                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
227                            mean.est = mean(Bxy.est.gmm),
228                            var.est = var(Bxy.est.gmm),
229                            N.sims = .N,
230                            variable='x',
231                            method='2SLS+gmm'
232                            ),
233                         by=c('N','m')]
234
235     g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
236     g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
237                          zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
238                          bias =  Bgy.est.gmm - Bgy.est.true,
239                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
240
241     g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
242                            mean.bias = mean(bias),
243                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
244                            mean.est = mean(Bgy.est.gmm),
245                            var.est = var(Bgy.est.gmm),
246                            N.sims = .N,
247                            variable='g',
248                            method='2SLS+gmm'
249                            ),
250                         by=c('N','m')]
251
252     accuracy <- df[,mean(accuracy)]
253     return(plot.df)
254 }
255
256 df <- read_feather(args$infile)
257 plot.df <- build_plot_dataset(df)
258 remember(plot.df,args$name))
259
260
261 ## df[gmm.ER_pval<0.05]
262
263
264
265 ## plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot, x.feasible.plot, g.feasible.plot),use.names=T)
266
267 ## plot.df[,accuracy := accuracy]
268
269 ## # plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]
270
271
272
273 ## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
274
275 ## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 
276
277 ## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size") 

Community Data Science Collective || Want to submit a patch?