library(predictionError)
library(Amelia)      # multiple imputation
library(Zelig)       # zelig() and combine_coef_se() for pooling imputations
library(mecor)       # regression calibration for measurement error
library(stats4)      # mle()
library(data.table)  # df is assumed to be a data.table throughout

options(amelia.parallel="no")
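## The functions below assume a data.table `df` produced by an upstream
## simulation with (at least) these columns. The sketch below illustrates that
## assumed layout with made-up values; it is not the actual data-generating code:
##   y      : true outcome; y.obs : y where hand-coded, NA elsewhere
##   x      : true covariate; x.obs : x where hand-coded, NA elsewhere
##   g      : a perfectly observed covariate
##   w      : continuous ML prediction score; w_pred : binarized ML prediction
## ex <- data.table(x = rnorm(5), g = rbinom(5, 1, 0.5))
## ex[, y := rbinom(5, 1, plogis(0.2*x + 0.5*g))]
## ex[, w_pred := y][1:2, w_pred := 1 - w_pred]   # noisy prediction of y
## ex[, y.obs := y][4:5, y.obs := NA]             # only some rows hand-coded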
## This uses the pseudo-likelihood approach from Carroll et al., page 349.
## It assumes differential error, but error that depends only on Y.
## It is inefficient because it is a pseudo-likelihood.
logistic.correction.pseudo <- function(df){
    ## estimate the misclassification rates from the hand-coded rows
    p1.est <- mean(df[w_pred==1]$y.obs==1, na.rm=TRUE)
    p0.est <- mean(df[w_pred==0]$y.obs==0, na.rm=TRUE)

    nll <- function(B0, Bxy, Bgy){
        probs <- (1 - p0.est) + (p1.est + p0.est - 1)*plogis(B0 + Bxy * df$x + Bgy * df$g)

        part1 <- sum(log(probs[df$w_pred == 1]))
        part2 <- sum(log(1 - probs[df$w_pred == 0]))

        return(-1*(part1 + part2))
    }

    mlefit <- stats4::mle(minuslogl = nll, start = list(B0=0, Bxy=0.0, Bgy=0.0))
    return(mlefit)
}
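## Sanity check of the misclassification identity the pseudo-likelihood uses:
## P(W=1|x,g) = p1*P(Y=1|x,g) + (1-p0)*(1-P(Y=1|x,g))
##            = (1-p0) + (p1+p0-1)*P(Y=1|x,g),
## with p1 = P(W=1|Y=1) and p0 = P(W=0|Y=0). The numbers below are made up.
local({
    p1 <- 0.9; p0 <- 0.8; p.y <- plogis(0.3)
    stopifnot(all.equal((1-p0) + (p1+p0-1)*p.y,
                        p1*p.y + (1-p0)*(1-p.y)))
})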
## This uses the full-likelihood approach from Carroll et al., page 353.
## It assumes that we have a good measurement error model.
logistic.correction.likelihood <- function(df){

    ## likelihood for observed responses
    nll <- function(B0, Bxy, Bgy, gamma0, gamma_y, gamma_g){
        df.obs <- df[!is.na(y.obs)]
        p.y.obs <- plogis(B0 + Bxy * df.obs$x + Bgy*df.obs$g)
        p.y.obs[df.obs$y==0] <- 1 - p.y.obs[df.obs$y==0]
        p.s.obs <- plogis(gamma0 + gamma_y * df.obs$y + gamma_g*df.obs$g)
        p.s.obs[df.obs$w_pred==0] <- 1 - p.s.obs[df.obs$w_pred==0]

        p.obs <- p.s.obs * p.y.obs

        ## for unobserved responses, marginalize over y = 1 and y = 0
        df.unobs <- df[is.na(y.obs)]
        p.unobs.1 <- plogis(B0 + Bxy * df.unobs$x + Bgy*df.unobs$g)*plogis(gamma0 + gamma_y + gamma_g*df.unobs$g)
        p.unobs.0 <- (1-plogis(B0 + Bxy * df.unobs$x + Bgy*df.unobs$g))*plogis(gamma0 + gamma_g*df.unobs$g)
        p.unobs <- p.unobs.1 + p.unobs.0
        p.unobs[df.unobs$w_pred==0] <- 1 - p.unobs[df.unobs$w_pred==0]

        p <- c(p.obs, p.unobs)

        return(-1*(sum(log(p))))
    }

    mlefit <- stats4::mle(minuslogl = nll, start = list(B0=1, Bxy=0, Bgy=0, gamma0=5, gamma_y=0, gamma_g=0))
    return(mlefit)
}
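## Example usage of the two corrections, a sketch that assumes `df` is a
## data.table with columns x, g, y, y.obs (NA outside the validation set),
## and w_pred:
## fit.lik <- logistic.correction.likelihood(df)
## fit.pseudo <- logistic.correction.pseudo(df)
## stats4::coef(fit.lik)        # point estimates B0, Bxy, Bgy, gamma*
## stats4::confint(fit.pseudo)  # profile-likelihood intervals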
logistic <- function(x) {1/(1+exp(-x))}
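## logistic() is the standard logistic CDF, identical to stats::plogis():
stopifnot(all.equal(logistic(0.5), plogis(0.5)))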
run_simulation_depvar <- function(df, result){

    accuracy <- df[,mean(w_pred==y)]
    result <- append(result, list(accuracy=accuracy))

    (model.true <- glm(y ~ x + g, data=df, family=binomial(link='logit')))
    true.ci.Bxy <- confint(model.true)['x',]
    true.ci.Bgy <- confint(model.true)['g',]

    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
                                  Bgy.est.true=coef(model.true)['g'],
                                  Bxy.ci.upper.true = true.ci.Bxy[2],
                                  Bxy.ci.lower.true = true.ci.Bxy[1],
                                  Bgy.ci.upper.true = true.ci.Bgy[2],
                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
    (model.feasible <- glm(y.obs~x+g, data=df, family=binomial(link='logit')))

    feasible.ci.Bxy <- confint(model.feasible)['x',]
    result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x'],
                                  Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
                                  Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))

    feasible.ci.Bgy <- confint(model.feasible)['g',]
    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
    (model.naive <- glm(w_pred~x+g, data=df, family=binomial(link='logit')))

    naive.ci.Bxy <- confint(model.naive)['x',]
    naive.ci.Bgy <- confint(model.naive)['g',]

    result <- append(result, list(Bxy.est.naive=coef(model.naive)['x'],
                                  Bgy.est.naive=coef(model.naive)['g'],
                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))

    (model.naive.cont <- lm(w~x+g, data=df))
    naivecont.ci.Bxy <- confint(model.naive.cont)['x',]
    naivecont.ci.Bgy <- confint(model.naive.cont)['g',]
    ## my implementation of the likelihood-based correction
    mod.carroll.lik <- logistic.correction.likelihood(df)
    coef <- coef(mod.carroll.lik)
    ci <- confint(mod.carroll.lik)

    result <- append(result,
                     list(Bxy.est.mle = coef['Bxy'],
                          Bxy.ci.upper.mle = ci['Bxy','97.5 %'],
                          Bxy.ci.lower.mle = ci['Bxy','2.5 %'],
                          Bgy.est.mle = coef['Bgy'],
                          Bgy.ci.upper.mle = ci['Bgy','97.5 %'],
                          Bgy.ci.lower.mle = ci['Bgy','2.5 %']))
    ## my implementation of the pseudo-likelihood-based correction
    mod.carroll.pseudo <- logistic.correction.pseudo(df)
    coef <- coef(mod.carroll.pseudo)
    ci <- confint(mod.carroll.pseudo)

    result <- append(result,
                     list(Bxy.est.pseudo = coef['Bxy'],
                          Bxy.ci.upper.pseudo = ci['Bxy','97.5 %'],
                          Bxy.ci.lower.pseudo = ci['Bxy','2.5 %'],
                          Bgy.est.pseudo = coef['Bgy'],
                          Bgy.ci.upper.pseudo = ci['Bgy','97.5 %'],
                          Bgy.ci.lower.pseudo = ci['Bgy','2.5 %']))
    ## Amelia's docs say to use the normal model even for binary variables.
    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w_pred'))
    mod.amelia.k <- zelig(y.obs~x+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))

    est.x.mi <- coefse['x','Estimate']
    est.x.se <- coefse['x','Std.Error']
    result <- append(result,
                     list(Bxy.est.amelia.full = est.x.mi,
                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se))

    est.g.mi <- coefse['g','Estimate']
    est.g.se <- coefse['g','Std.Error']
    result <- append(result,
                     list(Bgy.est.amelia.full = est.g.mi,
                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se))

    return(result)
}
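## For reference, roughly the pooling that combine_coef_se() performs can be
## done by hand with Rubin's rules. A sketch (an alternative, not the method
## used above) for the x coefficient across the imputed data sets:
## fits <- lapply(amelia.out.k$imputations, function(d) lm(y.obs ~ x + g, data=d))
## ests <- sapply(fits, function(f) coef(f)['x'])
## vars <- sapply(fits, function(f) vcov(f)['x','x'])
## qbar <- mean(ests)                                   # pooled estimate
## tvar <- mean(vars) + (1 + 1/length(ests))*var(ests)  # within + between variance
## c(est = qbar, se = sqrt(tvar))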
run_simulation <- function(df, result){

    accuracy <- df[,mean(w_pred==x)]
    result <- append(result, list(accuracy=accuracy))

    (model.true <- lm(y ~ x + g, data=df))
    true.ci.Bxy <- confint(model.true)['x',]
    true.ci.Bgy <- confint(model.true)['g',]

    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
                                  Bgy.est.true=coef(model.true)['g'],
                                  Bxy.ci.upper.true = true.ci.Bxy[2],
                                  Bxy.ci.lower.true = true.ci.Bxy[1],
                                  Bgy.ci.upper.true = true.ci.Bgy[2],
                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
    (model.feasible <- lm(y~x.obs+g, data=df))

    feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
    result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
                                  Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
                                  Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))

    feasible.ci.Bgy <- confint(model.feasible)['g',]
    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
    (model.naive <- lm(y~w+g, data=df))

    naive.ci.Bxy <- confint(model.naive)['w',]
    naive.ci.Bgy <- confint(model.naive)['g',]

    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
                                  Bgy.est.naive=coef(model.naive)['g'],
                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
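    ## Why the naive estimate is biased: under classical measurement error
    ## (w = x + u, with u independent noise) the naive slope is attenuated:
    ## plim(Bxy.naive) = Bxy * var(x)/(var(x)+var(u)).
    ## A toy illustration with made-up variances (an assumption):
    ## var.x <- 1; var.u <- 0.25
    ## var.x/(var.x + var.u)  # attenuation factor 0.8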
    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'))
    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))

    est.x.mi <- coefse['x.obs','Estimate']
    est.x.se <- coefse['x.obs','Std.Error']
    result <- append(result,
                     list(Bxy.est.amelia.full = est.x.mi,
                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se))

    est.g.mi <- coefse['g','Estimate']
    est.g.se <- coefse['g','Std.Error']
    result <- append(result,
                     list(Bgy.est.amelia.full = est.g.mi,
                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se))
    ## What if we can't observe k -- the most realistic scenario, since we
    ## can't include all the ML features in a model.
    ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
    ## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
    ## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))

    ## est.x.mi <- coefse['x.obs','Estimate']
    ## est.x.se <- coefse['x.obs','Std.Error']
    ## result <- append(result,
    ##                  list(Bxy.est.amelia.nok = est.x.mi,
    ##                       Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
    ##                       Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se))

    ## est.g.mi <- coefse['g','Estimate']
    ## est.g.se <- coefse['g','Std.Error']
    ## result <- append(result,
    ##                  list(Bgy.est.amelia.nok = est.g.mi,
    ##                       Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
    ##                       Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se))
    ## Set up inputs for the GMM estimator from the predictionError package.
    N <- nrow(df)  # N was not defined in the surviving code; nrow(df) is assumed
    m <- nrow(df[!is.na(x.obs)])
    p <- v <- train <- rep(0,N)

    df <- df[order(x.obs)]

    ## assumed reconstruction of the elided setup: validation rows (x.obs
    ## observed) get v=1, the remaining rows get p=1, and train stays 0
    v[!is.na(df$x.obs)] <- 1
    p[is.na(df$x.obs)] <- 1
    y <- df$y; x <- df$x.obs; g <- df$g; w <- df$w

    # gmm gets pretty close
    (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))

    result <- append(result,
                     list(Bxy.est.gmm = gmm.res$beta[1,1],
                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
                          gmm.ER_pval = gmm.res$ER_pval))

    result <- append(result,
                     list(Bgy.est.gmm = gmm.res$beta[2,1],
                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
    mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')

    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
    result <- append(result, list(
        Bxy.est.mecor = mecor.ci['Estimate'],
        Bxy.upper.mecor = mecor.ci['UCI'],
        Bxy.lower.mecor = mecor.ci['LCI']))

    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
    result <- append(result, list(
        Bgy.est.mecor = mecor.ci['Estimate'],
        Bgy.upper.mecor = mecor.ci['UCI'],
        Bgy.lower.mecor = mecor.ci['LCI']))
    ## rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))

    return(result)
}
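## Driver sketch (hypothetical; the actual simulation loop lives elsewhere in
## the project): given a simulated data.table `df`,
## result <- list(N = nrow(df))
## result <- run_simulation(df, result)
## as.data.table(result)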