diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R
index a73ed79..ee46ec6 100644
--- a/simulations/simulation_base.R
+++ b/simulations/simulation_base.R
@@ -4,207 +4,365 @@ options(amelia.parallel="no", amelia.ncpus=1)
 library(Amelia)
 library(Zelig)
-library(stats4)
+library(bbmle)
+library(matrixStats) # for numerically stable logsumexps
+source("measerr_methods.R") ## for my more generic function.
 
 ## This uses the pseudolikelihood approach from Carroll page 349.
 ## assumes MAR
 ## assumes differential error, but that only depends on Y
 ## inefficient, because pseudolikelihood
-logistic.correction.pseudo <- function(df){
+
+## This uses the pseudo-likelihood approach from Carroll page 346.
+my.pseudo.mle <- function(df){
     p1.est <- mean(df[w_pred==1]$y.obs==1,na.rm=T)
     p0.est <- mean(df[w_pred==0]$y.obs==0,na.rm=T)
 
-    nll <- function(B0, Bxy, Bgy){
-        probs <- (1 - p0.est) + (p1.est + p0.est - 1)*plogis(B0 + Bxy * df$x + Bgy * df$g)
+    nll <- function(B0, Bxy, Bzy){
 
-        part1 = sum(log(probs[df$w_pred == 1]))
-        part2 = sum(log(1-probs[df$w_pred == 0]))
+        pw <- vector(mode='numeric', length=nrow(df))
+        dfw1 <- df[w_pred==1]
+        dfw0 <- df[w_pred==0]
+        pw[df$w_pred==1] <- plogis(B0 + Bxy * dfw1$x + Bzy * dfw1$z, log=T)
+        pw[df$w_pred==0] <- plogis(B0 + Bxy * dfw0$x + Bzy * dfw0$z, lower.tail=FALSE, log=T)
 
-        return(-1*(part1 + part2))
+        probs <- colLogSumExps(rbind(log(1 - p0.est), log(p1.est + p0.est - 1) + pw))
+        return(-1*sum(probs))
     }
 
-    mlefit <- stats4::mle(minuslogl = nll, start = list(B0=0, Bxy=0.0, Bgy=0.0))
+    mlefit <- mle2(minuslogl = nll, start = list(B0=0.0, Bxy=0.0, Bzy=0.0), control=list(maxit=1e6), method='L-BFGS-B')
    return(mlefit)
 }
+
+## model from Zhang's arxiv paper, via Hausman 1998; here the noisy proxy
+## w_pred stands in for the independent variable x
+### I think this is actually equivalent to the pseudo.mle method
+zhang.mle.iv <- function(df){
+    df.obs <- df[!is.na(x.obs)]
+    df.unobs <- df[is.na(x.obs)]
+
+    tn <- df.obs[(w_pred == 0) & (x.obs == w_pred), .N]
+    pn <- df.obs[(w_pred==0), .N]
+    npv <- tn / pn
+
+    tp <- df.obs[(w_pred==1) & (x.obs == w_pred), .N]
+    pp <- df.obs[(w_pred==1), .N]
+    ppv <- tp / pp
+
+    nll <- function(B0=0, Bxy=0, Bzy=0, sigma_y=0.1){
+
+        ### Problem: accounting for uncertainty in ppv / npv
+
+        ## observed case
+        ll.y.obs <- with(df.obs, dnorm(y, B0 + Bxy * x + Bzy * z, sd=sigma_y, log=T))
+        ll <- sum(ll.y.obs)
+
+        # unobserved case; integrate out x
+        ll.x.1 <- with(df.unobs, dnorm(y, B0 + Bxy + Bzy * z, sd=sigma_y, log=T))
+        ll.x.0 <- with(df.unobs, dnorm(y, B0 + Bzy * z, sd=sigma_y, log=T))
+
+        ## case w_pred == 1: mix over the true x with weights ppv / 1-ppv
+        lls.x.1 <- colLogSumExps(rbind(log(ppv) + ll.x.1, log(1-ppv) + ll.x.0))
+
+        ## case w_pred == 0: mix over the true x with weights 1-npv / npv
+        lls.x.0 <- colLogSumExps(rbind(log(1-npv) + ll.x.1, log(npv) + ll.x.0))
+
+        lls <- colLogSumExps(rbind(df.unobs$w_pred * lls.x.1, (1-df.unobs$w_pred) * lls.x.0))
+        ll <- ll + sum(lls)
+        return(-ll)
+    }
+    mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), lower=list(sigma_y=0.0001, B0=-Inf, Bxy=-Inf, Bzy=-Inf),
+                   upper=list(sigma_y=Inf, B0=Inf, Bxy=Inf, Bzy=Inf), method='L-BFGS-B')
+    return(mlefit)
+}
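The matrixStats import above is load-bearing: every estimator in this file evaluates mixture likelihoods of the form log(p*exp(ll1) + (1-p)*exp(ll0)), and the naive formula underflows to -Inf as soon as the component log-likelihoods are strongly negative. A minimal sketch of the failure mode and of the stable colLogSumExps alternative (toy numbers, not the simulation's data):

    library(matrixStats)

    ll1 <- c(-2, -800)   # per-observation log-likelihoods under one mixture component
    ll0 <- c(-3, -805)   # ... and under the other
    ppv <- 0.9           # mixing weight

    ## naive mixture: exp() underflows, so the second entry collapses to log(0) = -Inf
    log(ppv * exp(ll1) + (1 - ppv) * exp(ll0))

    ## stable version: the same quantity, computed column-wise without leaving log space
    colLogSumExps(rbind(log(ppv) + ll1, log(1 - ppv) + ll0))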
+
+## this is equivalent to the pseudo-likelihood model from Carroll
+## zhang.mle.dv <- function(df){
+
+##     nll <- function(B0=0, Bxy=0, Bzy=0, ppv=0.9, npv=0.9){
+##         df.obs <- df[!is.na(y.obs)]
+
+##         ## fpr = 1 - TNR
+##         ll.w0y0 <- with(df.obs[y.obs==0], dbinom(1-w_pred,1,npv,log=TRUE))
+##         ll.w1y1 <- with(df.obs[y.obs==1], dbinom(w_pred,1,ppv,log=TRUE))
+
+##         # observed case
+##         ll.y.obs <- vector(mode='numeric', length=nrow(df.obs))
+##         ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z, log=T))
+##         ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z, log=T, lower.tail=FALSE))
+
+##         ll <- sum(ll.y.obs) + sum(ll.w0y0) + sum(ll.w1y1)
+
+##         # unobserved case; integrate out y
+##         ## case y = 1
+##         ll.y.1 <- vector(mode='numeric', length=nrow(df))
+##         pi.y.1 <- with(df, plogis(B0 + Bxy * x + Bzy*z, log=T))
+##         ## P(w=1|y=1)P(y=1) + P(w=0|y=1)P(y=1) = P(w=1,y=1) + P(w=0,y=1)
+##         lls.y.1 <- colLogSumExps(rbind(log(ppv) + pi.y.1, log(1-ppv) + pi.y.1))
+
+##         ## case y = 0
+##         ll.y.0 <- vector(mode='numeric', length=nrow(df))
+##         pi.y.0 <- with(df, plogis(B0 + Bxy * x + Bzy*z, log=T, lower.tail=FALSE))
+
+##         ## P(w=1|y=0)P(y=0) + P(w=0|y=0)P(y=0) = P(w=1,y=0) + P(w=0,y=0)
+##         lls.y.0 <- colLogSumExps(rbind(log(npv) + pi.y.0, log(1-npv) + pi.y.0))
+
+##         lls <- colLogSumExps(rbind(lls.y.1, lls.y.0))
+##         ll <- ll + sum(lls)
+##         return(-ll)
+##     }
+##     mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), method='L-BFGS-B', lower=list(B0=-Inf, Bxy=-Inf, Bzy=-Inf, ppv=0.001, npv=0.001),
+##                    upper=list(B0=Inf, Bxy=Inf, Bzy=Inf, ppv=0.999, npv=0.999))
+##     return(mlefit)
+## }
+
+zhang.mle.dv <- function(df){
+    df.obs <- df[!is.na(y.obs)]
+    df.unobs <- df[is.na(y.obs)]
+
+    ## error rates estimated from the labeled subsample, conditional on w_pred
+    fp <- df.obs[(w_pred==1) & (y.obs != w_pred), .N]
+    p <- df.obs[(w_pred==1), .N]
+    fpr <- fp / p
+    fn <- df.obs[(w_pred==0) & (y.obs != w_pred), .N]
+    n <- df.obs[(w_pred==0), .N]
+    fnr <- fn / n
+
+    nll <- function(B0=0, Bxy=0, Bzy=0){
+
+        ## observed case
+        ll.y.obs <- vector(mode='numeric', length=nrow(df.obs))
+        ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z, log=T))
+        ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z, log=T, lower.tail=FALSE))
+
+        ll <- sum(ll.y.obs)
+
+        ## unobserved case; linear predictors on the unobserved rows only,
+        ## so lengths line up with w_pred below
+        pi.y.1 <- with(df.unobs, plogis(B0 + Bxy * x + Bzy*z, log=T))
+        pi.y.0 <- with(df.unobs, plogis(B0 + Bxy * x + Bzy*z, log=T, lower.tail=FALSE))
+
+        lls <- with(df.unobs, colLogSumExps(rbind(w_pred * colLogSumExps(rbind(log(fpr), log(1 - fnr - fpr) + pi.y.1)),
+                                                  (1-w_pred) * colLogSumExps(rbind(log(1-fpr), log(1 - fnr - fpr) + pi.y.0)))))
+
+        ll <- ll + sum(lls)
+        return(-ll)
+    }
+    mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), method='L-BFGS-B', lower=c(B0=-Inf, Bxy=-Inf, Bzy=-Inf),
+                   upper=c(B0=Inf, Bxy=Inf, Bzy=Inf))
+    return(mlefit)
+}
+
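All of the hand-rolled likelihoods are maximized with bbmle::mle2, which replaced stats4::mle in this commit; the calling pattern is essentially stats4's, with box constraints forwarded to optim's L-BFGS-B and Wald-style intervals available through confint(..., method='quad'). A self-contained sketch on a toy normal-mean problem (not one of the simulation's models):

    library(bbmle)
    set.seed(1)
    y <- rnorm(100, mean=2, sd=0.5)

    ## negative log-likelihood with named scalar parameters, as mle2 expects
    nll <- function(mu=0, sigma=1) -sum(dnorm(y, mu, sigma, log=TRUE))

    fit <- mle2(minuslogl=nll, start=list(mu=0, sigma=1),
                method='L-BFGS-B', control=list(maxit=1e6),
                lower=c(mu=-Inf, sigma=1e-4),  # keep sigma off the boundary, like sigma_y above
                upper=c(mu=Inf, sigma=Inf))
    coef(fit)
    confint(fit, method='quad')  # quadratic-approximation CIs, as used for mod.zhang below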
 
 ## This uses the likelihood approach from Carroll page 353.
 ## assumes that we have a good measurement error model
-logistic.correction.liklihood <- function(df){
+my.mle <- function(df){
 
     ## likelihood for observed responses
-    nll <- function(B0, Bxy, Bgy, gamma0, gamma_y, gamma_g){
+    nll <- function(B0, Bxy, Bzy, gamma0, gamma_y, gamma_z, gamma_yz){
         df.obs <- df[!is.na(y.obs)]
-        p.y.obs <- plogis(B0 + Bxy * df.obs$x + Bgy*df.obs$g)
-        p.y.obs[df.obs$y==0] <- 1-p.y.obs[df.obs$y==0]
-        p.s.obs <- plogis(gamma0 + gamma_y * df.obs$y + gamma_g*df.obs$g)
-        p.s.obs[df.obs$w_pred==0] <- 1 - p.s.obs[df.obs$w_pred==0]
+        yobs0 <- df.obs$y==0
+        yobs1 <- df.obs$y==1
+        p.y.obs <- vector(mode='numeric', length=nrow(df.obs))
 
-        p.obs <- p.s.obs * p.y.obs
+        p.y.obs[yobs1] <- plogis(B0 + Bxy * df.obs[yobs1]$x + Bzy*df.obs[yobs1]$z, log=T)
+        p.y.obs[yobs0] <- plogis(B0 + Bxy * df.obs[yobs0]$x + Bzy*df.obs[yobs0]$z, lower.tail=FALSE, log=T)
+
+        wobs0 <- df.obs$w_pred==0
+        wobs1 <- df.obs$w_pred==1
+        p.w.obs <- vector(mode='numeric', length=nrow(df.obs))
+
+        p.w.obs[wobs1] <- plogis(gamma0 + gamma_y * df.obs[wobs1]$y + gamma_z*df.obs[wobs1]$z + df.obs[wobs1]$z*df.obs[wobs1]$y*gamma_yz, log=T)
+        p.w.obs[wobs0] <- plogis(gamma0 + gamma_y * df.obs[wobs0]$y + gamma_z*df.obs[wobs0]$z + df.obs[wobs0]$z*df.obs[wobs0]$y*gamma_yz, lower.tail=FALSE, log=T)
+
+        ## joint log-probability of (y, w) for the labeled rows
+        p.obs <- p.w.obs + p.y.obs
 
         df.unobs <- df[is.na(y.obs)]
 
-        p.unobs.1 <- plogis(B0 + Bxy * df.unobs$x + Bgy*df.unobs$g)*plogis(gamma0 + gamma_y + gamma_g*df.unobs$g)
-        p.unobs.0 <- (1-plogis(B0 + Bxy * df.unobs$x + Bgy*df.unobs$g))*plogis(gamma0 + gamma_g*df.unobs$g)
-        p.unobs <- p.unobs.1 + p.unobs.0
-        p.unobs[df.unobs$w_pred==0] <- 1 - p.unobs[df.unobs$w_pred==0]
+        p.unobs.0 <- vector(mode='numeric', length=nrow(df.unobs))
+        p.unobs.1 <- vector(mode='numeric', length=nrow(df.unobs))
+
+        wunobs.0 <- df.unobs$w_pred == 0
+        wunobs.1 <- df.unobs$w_pred == 1
+
+        ## y = 1 component: log P(y=1|x,z) + log P(w|y=1,z)
+        p.unobs.0[wunobs.1] <- plogis(B0 + Bxy * df.unobs[wunobs.1]$x + Bzy*df.unobs[wunobs.1]$z, log=T) + plogis(gamma0 + gamma_y + gamma_z*df.unobs[wunobs.1]$z + df.unobs[wunobs.1]$z*gamma_yz, log=T)
+
+        p.unobs.0[wunobs.0] <- plogis(B0 + Bxy * df.unobs[wunobs.0]$x + Bzy*df.unobs[wunobs.0]$z, log=T) + plogis(gamma0 + gamma_y + gamma_z*df.unobs[wunobs.0]$z + df.unobs[wunobs.0]$z*gamma_yz, lower.tail=FALSE, log=T)
+
+        ## y = 0 component: log P(y=0|x,z) + log P(w|y=0,z)
+        p.unobs.1[wunobs.1] <- plogis(B0 + Bxy * df.unobs[wunobs.1]$x + Bzy*df.unobs[wunobs.1]$z, log=T, lower.tail=FALSE) + plogis(gamma0 + gamma_z*df.unobs[wunobs.1]$z, log=T)
+
+        p.unobs.1[wunobs.0] <- plogis(B0 + Bxy * df.unobs[wunobs.0]$x + Bzy*df.unobs[wunobs.0]$z, log=T, lower.tail=FALSE) + plogis(gamma0 + gamma_z*df.unobs[wunobs.0]$z, lower.tail=FALSE, log=T)
+
+        ## marginalize y out of the unlabeled rows
+        p.unobs <- colLogSumExps(rbind(p.unobs.1, p.unobs.0))
 
         p <- c(p.obs, p.unobs)
 
-        return(-1*(sum(log(p))))
+        return(-1*(sum(p)))
     }
 
-    mlefit <- stats4::mle(minuslogl = nll, start = list(B0=1, Bxy=0, Bgy=0, gamma0=5, gamma_y=0, gamma_g=0))
+    mlefit <- mle2(minuslogl = nll, start = list(B0=0, Bxy=0, Bzy=0, gamma0=0, gamma_y=0, gamma_z=0, gamma_yz=0), control=list(maxit=1e6), method='L-BFGS-B')
 
     return(mlefit)
 }
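Spelled out, the log-likelihood that my.mle climbs is the following (reading the pieces off the code above):

    \log L(B, \gamma) = \sum_{i:\, y_i\ \mathrm{observed}} \Big[ \log P(y_i \mid x_i, z_i; B) + \log P(w_i \mid y_i, z_i; \gamma) \Big]
                      + \sum_{i:\, y_i\ \mathrm{missing}} \log \sum_{y \in \{0,1\}} P(y \mid x_i, z_i; B)\, P(w_i \mid y, z_i; \gamma)

    P(y{=}1 \mid x, z; B) = \mathrm{logit}^{-1}(B_0 + B_{xy} x + B_{zy} z)
    P(w{=}1 \mid y, z; \gamma) = \mathrm{logit}^{-1}(\gamma_0 + \gamma_y y + \gamma_z z + \gamma_{yz} y z)

The first sum is the p.obs term (the two logs add because the factors multiply), and the colLogSumExps call performs the inner sum over the two possible values of y for rows where only the classifier output w_pred is available.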
-
-logistic <- function(x) {1/(1+exp(-1*x))}
-
-run_simulation_depvar <- function(df, result){
+run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~y){
 
     accuracy <- df[,mean(w_pred==y)]
     result <- append(result, list(accuracy=accuracy))
 
-    (model.true <- glm(y ~ x + g, data=df, family=binomial(link='logit')))
+    (model.true <- glm(y ~ x + z, data=df, family=binomial(link='logit')))
     true.ci.Bxy <- confint(model.true)['x',]
-    true.ci.Bgy <- confint(model.true)['g',]
+    true.ci.Bzy <- confint(model.true)['z',]
 
     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
-                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bzy.est.true=coef(model.true)['z'],
                                   Bxy.ci.upper.true = true.ci.Bxy[2],
                                   Bxy.ci.lower.true = true.ci.Bxy[1],
-                                  Bgy.ci.upper.true = true.ci.Bgy[2],
-                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  Bzy.ci.upper.true = true.ci.Bzy[2],
+                                  Bzy.ci.lower.true = true.ci.Bzy[1]))
 
-    (model.feasible <- glm(y.obs~x+g, data=df, family=binomial(link='logit')))
+    (model.feasible <- glm(y.obs~x+z, data=df, family=binomial(link='logit')))
 
     feasible.ci.Bxy <- confint(model.feasible)['x',]
     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x'],
                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
 
-    feasible.ci.Bgy <- confint(model.feasible)['g',]
-    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
-                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
-                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+    feasible.ci.Bzy <- confint(model.feasible)['z',]
+    result <- append(result, list(Bzy.est.feasible=coef(model.feasible)['z'],
+                                  Bzy.ci.upper.feasible = feasible.ci.Bzy[2],
+                                  Bzy.ci.lower.feasible = feasible.ci.Bzy[1]))
 
-    (model.naive <- glm(w_pred~x+g, data=df, family=binomial(link='logit')))
+    (model.naive <- glm(w_pred~x+z, data=df, family=binomial(link='logit')))
 
     naive.ci.Bxy <- confint(model.naive)['x',]
-    naive.ci.Bgy <- confint(model.naive)['g',]
+    naive.ci.Bzy <- confint(model.naive)['z',]
 
     result <- append(result, list(Bxy.est.naive=coef(model.naive)['x'],
-                                  Bgy.est.naive=coef(model.naive)['g'],
+                                  Bzy.est.naive=coef(model.naive)['z'],
                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
-                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
-                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  Bzy.ci.upper.naive = naive.ci.Bzy[2],
+                                  Bzy.ci.lower.naive = naive.ci.Bzy[1]))
 
-    (model.naive.cont <- lm(w~x+g, data=df))
+    (model.naive.cont <- lm(w~x+z, data=df))
     naivecont.ci.Bxy <- confint(model.naive.cont)['x',]
-    naivecont.ci.Bgy <- confint(model.naive.cont)['g',]
+    naivecont.ci.Bzy <- confint(model.naive.cont)['z',]
 
-    ## my implementatoin of liklihood based correction
-    mod.caroll.lik <- logistic.correction.liklihood(df)
-    coef <- coef(mod.caroll.lik)
-    ci <- confint(mod.caroll.lik)
+    ## my implementation of likelihood based correction
+    temp.df <- copy(df)
+    temp.df[,y:=y.obs]
+    mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula)
+    fisher.info <- solve(mod.caroll.lik$hessian)
+    coef <- mod.caroll.lik$par
+    ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
+    ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
 
     result <- append(result,
-                     list(Bxy.est.mle = coef['Bxy'],
-                          Bxy.ci.upper.mle = ci['Bxy','97.5 %'],
-                          Bxy.ci.lower.mle = ci['Bxy','2.5 %'],
-                          Bgy.est.mle = coef['Bgy'],
-                          Bgy.ci.upper.mle = ci['Bgy','97.5 %'],
-                          Bgy.ci.lower.mle = ci['Bgy','2.5 %']))
+                     list(Bxy.est.mle = coef['x'],
+                          Bxy.ci.upper.mle = ci.upper['x'],
+                          Bxy.ci.lower.mle = ci.lower['x'],
+                          Bzy.est.mle = coef['z'],
+                          Bzy.ci.upper.mle = ci.upper['z'],
+                          Bzy.ci.lower.mle = ci.lower['z']))
+
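The confidence intervals above come from inverting the Hessian of the negative log-likelihood at the optimum, i.e. the observed Fisher information, whose inverse estimates the asymptotic covariance of the parameters; the square roots of its diagonal are the standard errors in the usual +/-1.96 Wald interval. The same recipe in a self-contained toy, assuming (as the code above implies for measerr_mle_dv, defined in measerr_methods.R and not shown here) an optimizer that returns $par and $hessian:

    set.seed(2)
    x <- rnorm(200)
    y <- 1 + 2*x + rnorm(200)

    nll <- function(par) -sum(dnorm(y, par[1] + par[2]*x, sd=exp(par[3]), log=TRUE))
    fit <- optim(c(0, 0, 0), nll, hessian=TRUE)  # Hessian of the *negative* log-likelihood
    fisher.info <- solve(fit$hessian)            # inverse information ~ asymptotic vcov
    se <- sqrt(diag(fisher.info))
    rbind(lower = fit$par - 1.96*se, est = fit$par, upper = fit$par + 1.96*se)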
-    ## my implementatoin of liklihood based correction
-    mod.caroll.pseudo <- logistic.correction.pseudo(df)
-    coef <- coef(mod.caroll.pseudo)
-    ci <- confint(mod.caroll.pseudo)
+    ## Zhang-style correction, using the estimated misclassification rates
+    mod.zhang <- zhang.mle.dv(df)
+    coef <- coef(mod.zhang)
+    ci <- confint(mod.zhang, method='quad')
 
     result <- append(result,
-                     list(Bxy.est.pseudo = coef['Bxy'],
-                          Bxy.ci.upper.pseudo = ci['Bxy','97.5 %'],
-                          Bxy.ci.lower.pseudo = ci['Bxy','2.5 %'],
-                          Bgy.est.pseudo = coef['Bgy'],
-                          Bgy.ci.upper.pseudo = ci['Bgy','97.5 %'],
-                          Bgy.ci.lower.pseudo = ci['Bgy','2.5 %']))
+                     list(Bxy.est.zhang = coef['Bxy'],
+                          Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
+                          Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
+                          Bzy.est.zhang = coef['Bzy'],
+                          Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
+                          Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
 
     # amelia says use normal distribution for binary variables.
-    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w_pred'))
-    mod.amelia.k <- zelig(y.obs~x+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
-    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
-
-    est.x.mi <- coefse['x','Estimate']
-    est.x.se <- coefse['x','Std.Error']
-    result <- append(result,
-                     list(Bxy.est.amelia.full = est.x.mi,
-                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
-                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
-                          ))
-
-    est.g.mi <- coefse['g','Estimate']
-    est.g.se <- coefse['g','Std.Error']
-
-    result <- append(result,
-                     list(Bgy.est.amelia.full = est.g.mi,
-                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
-                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
-                          ))
+    tryCatch({
+        amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w'))
+        mod.amelia.k <- zelig(y.obs~x+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+        (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+        est.x.mi <- coefse['x','Estimate']
+        est.x.se <- coefse['x','Std.Error']
+        result <- append(result,
+                         list(Bxy.est.amelia.full = est.x.mi,
+                              Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                              Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+                              ))
+
+        est.z.mi <- coefse['z','Estimate']
+        est.z.se <- coefse['z','Std.Error']
+
+        result <- append(result,
+                         list(Bzy.est.amelia.full = est.z.mi,
+                              Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
+                              Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
+                              ))
+    },
+    error = function(e){
+        message("An error occurred:\n", e)
+        ## <<- so the assignment reaches `result` in the enclosing frame
+        result$error <<- paste0(result$error, '\n', e)
+    })
 
     return(result)
 
 }
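Both run_simulation variants lean on Amelia for the multiple-imputation baseline and on Zelig's combine_coef_se to pool estimates across imputations. The pooling itself is Rubin's rules, which can also be done by hand; a sketch under the simulation's own column names (assuming df is the simulated data set with y.obs partly missing, and with the idvars excluded from the imputation model as in the call above; m lowered from 200 for speed):

    library(Amelia)
    am.out <- amelia(df, m=5, p2s=0, idvars=c('y', 'ystar', 'w'))
    fits <- lapply(am.out$imputations, function(d) lm(y.obs ~ x + z, data=d))
    est <- sapply(fits, function(f) coef(f)['x'])
    se <- sapply(fits, function(f) sqrt(vcov(f)['x', 'x']))

    ## Rubin's rules: total variance = within-imputation + (1 + 1/m) * between-imputation
    m <- length(est)
    pooled.est <- mean(est)
    pooled.se <- sqrt(mean(se^2) + (1 + 1/m) * var(est))
    pooled.est + c(-1.96, 1.96) * pooled.se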
 
-run_simulation <- function(df, result){
+
+## outcome_formula, proxy_formula, and truth_formula are passed to measerr_mle
+run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NULL, truth_formula=NULL){
 
     accuracy <- df[,mean(w_pred==x)]
     result <- append(result, list(accuracy=accuracy))
 
-    (model.true <- lm(y ~ x + g, data=df))
+    (model.true <- lm(y ~ x + z, data=df))
     true.ci.Bxy <- confint(model.true)['x',]
-    true.ci.Bgy <- confint(model.true)['g',]
+    true.ci.Bzy <- confint(model.true)['z',]
 
     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
-                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bzy.est.true=coef(model.true)['z'],
                                   Bxy.ci.upper.true = true.ci.Bxy[2],
                                   Bxy.ci.lower.true = true.ci.Bxy[1],
-                                  Bgy.ci.upper.true = true.ci.Bgy[2],
-                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+                                  Bzy.ci.upper.true = true.ci.Bzy[2],
+                                  Bzy.ci.lower.true = true.ci.Bzy[1]))
 
-    (model.feasible <- lm(y~x.obs+g, data=df))
+    (model.feasible <- lm(y~x.obs+z, data=df))
 
     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
 
-    feasible.ci.Bgy <- confint(model.feasible)['g',]
-    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
-                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
-                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+    feasible.ci.Bzy <- confint(model.feasible)['z',]
+    result <- append(result, list(Bzy.est.feasible=coef(model.feasible)['z'],
+                                  Bzy.ci.upper.feasible = feasible.ci.Bzy[2],
+                                  Bzy.ci.lower.feasible = feasible.ci.Bzy[1]))
 
-    (model.naive <- lm(y~w+g, data=df))
+    (model.naive <- lm(y~w_pred+z, data=df))
 
-    naive.ci.Bxy <- confint(model.naive)['w',]
-    naive.ci.Bgy <- confint(model.naive)['g',]
+    naive.ci.Bxy <- confint(model.naive)['w_pred',]
+    naive.ci.Bzy <- confint(model.naive)['z',]
 
-    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
-                                  Bgy.est.naive=coef(model.naive)['g'],
+    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w_pred'],
+                                  Bzy.est.naive=coef(model.naive)['z'],
                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
-                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
-                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+                                  Bzy.ci.upper.naive = naive.ci.Bzy[2],
+                                  Bzy.ci.lower.naive = naive.ci.Bzy[1]))
 
-    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'))
-    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+    tryCatch({
+        amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w'))
+        mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE)
        (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
@@ -215,15 +373,65 @@ run_simulation <- function(df, result){
        est.x.mi <- coefse['x.obs','Estimate']
        est.x.se <- coefse['x.obs','Std.Error']
        result <- append(result,
                         list(Bxy.est.amelia.full = est.x.mi,
                              Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
                              Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
                              ))
 
-    est.g.mi <- coefse['g','Estimate']
-    est.g.se <- coefse['g','Std.Error']
+        est.z.mi <- coefse['z','Estimate']
+        est.z.se <- coefse['z','Std.Error']
 
-    result <- append(result,
-                     list(Bgy.est.amelia.full = est.g.mi,
-                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
-                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
-                          ))
+        result <- append(result,
+                         list(Bzy.est.amelia.full = est.z.mi,
+                              Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se,
+                              Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se
+                              ))
+    },
+    error = function(e){
+        message("An error occurred:\n", e)
+        result$error <<- paste0(result$error, '\n', e)
+    }
+    )
+
+    tryCatch({
+        temp.df <- copy(df)
+        temp.df <- temp.df[,x:=x.obs]
+        mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula)
+        fisher.info <- solve(mod.caroll.lik$hessian)
+        coef <- mod.caroll.lik$par
+        ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96
+        ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96
+
+        result <- append(result,
+                         list(Bxy.est.mle = coef['x'],
+                              Bxy.ci.upper.mle = ci.upper['x'],
+                              Bxy.ci.lower.mle = ci.lower['x'],
+                              Bzy.est.mle = coef['z'],
+                              Bzy.ci.upper.mle = ci.upper['z'],
+                              Bzy.ci.lower.mle = ci.lower['z']))
+    },
+    error = function(e){
+        message("An error occurred:\n", e)
+        result$error <<- paste0(result$error, '\n', e)
+    })
+
+    tryCatch({
+        mod.zhang.lik <- zhang.mle.iv(df)
+        coef <- coef(mod.zhang.lik)
+        ci <- confint(mod.zhang.lik, method='quad')
+        result <- append(result,
+                         list(Bxy.est.zhang = coef['Bxy'],
+                              Bxy.ci.upper.zhang = ci['Bxy','97.5 %'],
+                              Bxy.ci.lower.zhang = ci['Bxy','2.5 %'],
+                              Bzy.est.zhang = coef['Bzy'],
+                              Bzy.ci.upper.zhang = ci['Bzy','97.5 %'],
+                              Bzy.ci.lower.zhang = ci['Bzy','2.5 %']))
+    },
+    error = function(e){
+        message("An error occurred:\n", e)
+        result$error <<- paste0(result$error, '\n', e)
+    })
+
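Each estimator in run_simulation is wrapped in tryCatch so that one numerically fragile fit cannot abort a whole batch of simulation runs; the handler logs the error and records it on the result list. One R subtlety, reflected in the handlers above: inside the handler the superassignment <<- is needed, because a plain <- would only modify a copy of result local to the handler function. A minimal sketch of the pattern:

    result <- list(error=NULL)
    tryCatch({
        stop("estimator failed to converge")   # stand-in for a fragile model fit
    },
    error = function(e){
        message("An error occurred:\n", e)
        ## <<- walks up the enclosing environments and updates the outer `result`
        result$error <<- paste0(result$error, '\n', e)
    })
    result$error   # the failure is now recorded alongside the other estimates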
     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
     ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms)
     ## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
@@ -255,10 +463,10 @@ run_simulation <- function(df, result){
     df <- df[order(x.obs)]
     y <- df[,y]
     x <- df[,x.obs]
-    g <- df[,g]
-    w <- df[,w]
+    z <- df[,z]
+    w <- df[,w_pred]
     # gmm gets pretty close
-    (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))
+    (gmm.res <- predicted_covariates(y, x, z, w, v, train, p, max_iter=100, verbose=TRUE))
 
     result <- append(result,
                      list(Bxy.est.gmm = gmm.res$beta[1,1],
@@ -268,28 +476,34 @@ run_simulation <- function(df, result){
                           ))
 
     result <- append(result,
-                     list(Bgy.est.gmm = gmm.res$beta[2,1],
-                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
-                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+                     list(Bzy.est.gmm = gmm.res$beta[2,1],
+                          Bzy.ci.upper.gmm = gmm.res$confint[2,2],
+                          Bzy.ci.lower.gmm = gmm.res$confint[2,1]))
 
-    mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
-    (mod.calibrated.mle)
-    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
-    result <- append(result, list(
-                                 Bxy.est.mecor = mecor.ci['Estimate'],
-                                 Bxy.upper.mecor = mecor.ci['UCI'],
-                                 Bxy.lower.mecor = mecor.ci['LCI'])
-                     )
+    tryCatch({
+        mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient')
+        (mod.calibrated.mle)
+        (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+        result <- append(result, list(
+                                     Bxy.est.mecor = mecor.ci['Estimate'],
+                                     Bxy.ci.upper.mecor = mecor.ci['UCI'],
+                                     Bxy.ci.lower.mecor = mecor.ci['LCI'])
+                         )
 
-    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
-    result <- append(result, list(
-                                 Bgy.est.mecor = mecor.ci['Estimate'],
-                                 Bgy.upper.mecor = mecor.ci['UCI'],
-                                 Bgy.lower.mecor = mecor.ci['LCI'])
-                     )
+        (mecor.ci <- summary(mod.calibrated.mle)$c$ci['z',])
+        result <- append(result, list(
+                                     Bzy.est.mecor = mecor.ci['Estimate'],
+                                     Bzy.ci.upper.mecor = mecor.ci['UCI'],
+                                     Bzy.ci.lower.mecor = mecor.ci['LCI'])
+                         )
+    },
+    error = function(e){
+        message("An error occurred:\n", e)
+        result$error <<- paste0(result$error, '\n', e)
+    }
+    )
 
     ## clean up memory
     ## rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))
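The mecor call above is the one correction that comes from an off-the-shelf package: it performs measurement-error correction for a covariate given a gold-standard reference observed on a validation subsample, declared through MeasError. A self-contained toy of the calling pattern (all data and error rates made up here; B lowered from the 400 used above to keep the bootstrap quick):

    library(mecor)
    set.seed(3)
    n <- 1000
    x <- rbinom(n, 1, 0.5)                        # true binary covariate
    z <- rnorm(n)
    y <- 0.5 + 0.3*x + 0.2*z + rnorm(n)
    w_pred <- ifelse(runif(n) < 0.85, x, 1 - x)   # proxy with ~15% misclassification
    x.obs <- x
    x.obs[sample(n, 900)] <- NA                   # reference kept only on a 10% validation sample
    d <- data.frame(y, z, w_pred, x.obs)

    fit <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, d, method='efficient', B=100)
    summary(fit)$c$ci['x.obs',]                   # corrected estimate and CI, read off as above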