X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/f8f58301e0285118f7b669a96ed9367a9914ba02..69948cae1e691191fc86e6abdaa485bc98f38f1f:/simulations/simulation_base.R diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R index 345d14e..73544e9 100644 --- a/simulations/simulation_base.R +++ b/simulations/simulation_base.R @@ -4,82 +4,359 @@ options(amelia.parallel="no", amelia.ncpus=1) library(Amelia) library(Zelig) +library(bbmle) +library(matrixStats) # for numerically stable logsumexps -logistic <- function(x) {1/(1+exp(-1*x))} +source("pl_methods.R") +source("measerr_methods.R") ## for my more generic function. -run_simulation <- function(df, result){ +## This uses the pseudolikelihood approach from Carroll page 349. +## assumes MAR +## assumes differential error, but that only depends on Y +## inefficient, because pseudolikelihood + +## This uses the pseudo-likelihood approach from Carroll page 346. +my.pseudo.mle <- function(df){ + p1.est <- mean(df[w_pred==1]$y.obs==1,na.rm=T) + p0.est <- mean(df[w_pred==0]$y.obs==0,na.rm=T) + + nll <- function(B0, Bxy, Bzy){ - accuracy <- df[,mean(w_pred==x)] + pw <- vector(mode='numeric',length=nrow(df)) + dfw1 <- df[w_pred==1] + dfw0 <- df[w_pred==0] + pw[df$w_pred==1] <- plogis(B0 + Bxy * dfw1$x + Bzy * dfw1$z, log=T) + pw[df$w_pred==0] <- plogis(B0 + Bxy * dfw0$x + Bzy * dfw0$z, lower.tail=FALSE, log=T) + + probs <- colLogSumExps(rbind(log(1 - p0.est), log(p1.est + p0.est - 1) + pw)) + return(-1*sum(probs)) + } + + mlefit <- mle2(minuslogl = nll, start = list(B0=0.0, Bxy=0.0, Bzy=0.0), control=list(maxit=1e6),method='L-BFGS-B') + return(mlefit) + +} + + +## This uses the likelihood approach from Carroll page 353. +## assumes that we have a good measurement error model +my.mle <- function(df){ + + ## liklihood for observed responses + nll <- function(B0, Bxy, Bzy, gamma0, gamma_y, gamma_z, gamma_yz){ + df.obs <- df[!is.na(y.obs)] + yobs0 <- df.obs$y==0 + yobs1 <- df.obs$y==1 + p.y.obs <- vector(mode='numeric', length=nrow(df.obs)) + + p.y.obs[yobs1] <- plogis(B0 + Bxy * df.obs[yobs1]$x + Bzy*df.obs[yobs1]$z,log=T) + p.y.obs[yobs0] <- plogis(B0 + Bxy * df.obs[yobs0]$x + Bzy*df.obs[yobs0]$z,lower.tail=FALSE,log=T) + + wobs0 <- df.obs$w_pred==0 + wobs1 <- df.obs$w_pred==1 + p.w.obs <- vector(mode='numeric', length=nrow(df.obs)) + + p.w.obs[wobs1] <- plogis(gamma0 + gamma_y * df.obs[wobs1]$y + gamma_z*df.obs[wobs1]$z + df.obs[wobs1]$z*df.obs[wobs1]$y* gamma_yz, log=T) + p.w.obs[wobs0] <- plogis(gamma0 + gamma_y * df.obs[wobs0]$y + gamma_z*df.obs[wobs0]$z + df.obs[wobs0]$z*df.obs[wobs0]$y* gamma_yz, lower.tail=FALSE, log=T) + + p.obs <- p.w.obs + p.y.obs + + df.unobs <- df[is.na(y.obs)] + + p.unobs.0 <- vector(mode='numeric',length=nrow(df.unobs)) + p.unobs.1 <- vector(mode='numeric',length=nrow(df.unobs)) + + wunobs.0 <- df.unobs$w_pred == 0 + wunobs.1 <- df.unobs$w_pred == 1 + + p.unobs.0[wunobs.1] <- plogis(B0 + Bxy * df.unobs[wunobs.1]$x + Bzy*df.unobs[wunobs.1]$z, log=T) + plogis(gamma0 + gamma_y + gamma_z*df.unobs[wunobs.1]$z + df.unobs[wunobs.1]$z*gamma_yz, log=T) + + p.unobs.0[wunobs.0] <- plogis(B0 + Bxy * df.unobs[wunobs.0]$x + Bzy*df.unobs[wunobs.0]$z, log=T) + plogis(gamma0 + gamma_y + gamma_z*df.unobs[wunobs.0]$z + df.unobs[wunobs.0]$z*gamma_yz, lower.tail=FALSE, log=T) + + p.unobs.1[wunobs.1] <- plogis(B0 + Bxy * df.unobs[wunobs.1]$x + Bzy*df.unobs[wunobs.1]$z, log=T, lower.tail=FALSE) + plogis(gamma0 + gamma_z*df.unobs[wunobs.1]$z, log=T) + + p.unobs.1[wunobs.0] <- plogis(B0 + Bxy * df.unobs[wunobs.0]$x + Bzy*df.unobs[wunobs.0]$z, log=T, lower.tail=FALSE) + plogis(gamma0 + gamma_z*df.unobs[wunobs.0]$z, lower.tail=FALSE, log=T) + + p.unobs <- colLogSumExps(rbind(p.unobs.1, p.unobs.0)) + + p <- c(p.obs, p.unobs) + + return(-1*(sum(p))) + } + + mlefit <- mle2(minuslogl = nll, start = list(B0=0, Bxy=0,Bzy=0, gamma0=0, gamma_y=0, gamma_z=0, gamma_yz=0), control=list(maxit=1e6),method='L-BFGS-B') + + return(mlefit) +} + +run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~y){ + + (accuracy <- df[,mean(w_pred==y)]) result <- append(result, list(accuracy=accuracy)) + (error.cor.z <- cor(df$z, df$y - df$w_pred)) + (error.cor.x <- cor(df$x, df$y - df$w_pred)) + (error.cor.y <- cor(df$y, df$y - df$w_pred)) + result <- append(result, list(error.cor.x = error.cor.x, + error.cor.z = error.cor.z, + error.cor.y = error.cor.y)) + + model.null <- glm(y~1, data=df,family=binomial(link='logit')) + (model.true <- glm(y ~ x + z, data=df,family=binomial(link='logit'))) + (lik.ratio <- exp(logLik(model.true) - logLik(model.null))) - (model.true <- lm(y ~ x + g, data=df)) true.ci.Bxy <- confint(model.true)['x',] - true.ci.Bgy <- confint(model.true)['g',] + true.ci.Bzy <- confint(model.true)['z',] + + result <- append(result, list(cor.xz=cor(df$x,df$z))) + result <- append(result, list(lik.ratio=lik.ratio)) result <- append(result, list(Bxy.est.true=coef(model.true)['x'], - Bgy.est.true=coef(model.true)['g'], + Bzy.est.true=coef(model.true)['z'], Bxy.ci.upper.true = true.ci.Bxy[2], Bxy.ci.lower.true = true.ci.Bxy[1], - Bgy.ci.upper.true = true.ci.Bgy[2], - Bgy.ci.lower.true = true.ci.Bgy[1])) + Bzy.ci.upper.true = true.ci.Bzy[2], + Bzy.ci.lower.true = true.ci.Bzy[1])) - (model.feasible <- lm(y~x.obs+g,data=df)) + (model.feasible <- glm(y.obs~x+z,data=df,family=binomial(link='logit'))) + + feasible.ci.Bxy <- confint(model.feasible)['x',] + result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x'], + Bxy.ci.upper.feasible = feasible.ci.Bxy[2], + Bxy.ci.lower.feasible = feasible.ci.Bxy[1])) + + feasible.ci.Bzy <- confint(model.feasible)['z',] + result <- append(result, list(Bzy.est.feasible=coef(model.feasible)['z'], + Bzy.ci.upper.feasible = feasible.ci.Bzy[2], + Bzy.ci.lower.feasible = feasible.ci.Bzy[1])) + + (model.naive <- glm(w_pred~x+z, data=df, family=binomial(link='logit'))) + + naive.ci.Bxy <- confint(model.naive)['x',] + naive.ci.Bzy <- confint(model.naive)['z',] + + result <- append(result, list(Bxy.est.naive=coef(model.naive)['x'], + Bzy.est.naive=coef(model.naive)['z'], + Bxy.ci.upper.naive = naive.ci.Bxy[2], + Bxy.ci.lower.naive = naive.ci.Bxy[1], + Bzy.ci.upper.naive = naive.ci.Bzy[2], + Bzy.ci.lower.naive = naive.ci.Bzy[1])) + + + (model.naive.cont <- lm(w~x+z, data=df)) + naivecont.ci.Bxy <- confint(model.naive.cont)['x',] + naivecont.ci.Bzy <- confint(model.naive.cont)['z',] + + ## my implementation of liklihood based correction + + temp.df <- copy(df) + temp.df[,y:=y.obs] + mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula) + fischer.info <- solve(mod.caroll.lik$hessian) + coef <- mod.caroll.lik$par + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 + result <- append(result, + list(Bxy.est.mle = coef['x'], + Bxy.ci.upper.mle = ci.upper['x'], + Bxy.ci.lower.mle = ci.lower['x'], + Bzy.est.mle = coef['z'], + Bzy.ci.upper.mle = ci.upper['z'], + Bzy.ci.lower.mle = ci.lower['z'])) + + + ## my implementatoin of liklihood based correction + mod.zhang <- zhang.mle.dv(df) + coef <- coef(mod.zhang) + ci <- confint(mod.zhang,method='quad') + + result <- append(result, + list(Bxy.est.zhang = coef['Bxy'], + Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], + Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], + Bzy.est.zhang = coef['Bzy'], + Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], + Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) + + + + # amelia says use normal distribution for binary variables. + amelia_result <- list(Bxy.est.amelia.full = NA, + Bxy.ci.upper.amelia.full = NA, + Bxy.ci.lower.amelia.full = NA, + Bzy.est.amelia.full = NA, + Bzy.ci.upper.amelia.full = NA, + Bzy.ci.lower.amelia.full = NA + ) + + tryCatch({ + amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w')) + mod.amelia.k <- zelig(y.obs~x+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) + (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + est.x.mi <- coefse['x','Estimate'] + est.x.se <- coefse['x','Std.Error'] + + est.z.mi <- coefse['z','Estimate'] + est.z.se <- coefse['z','Std.Error'] + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) + }, + error = function(e){ + result[['error']] <- e} + ) + result <- append(result,amelia_result) + + return(result) + +} + + +## outcome_formula, proxy_formula, and truth_formula are passed to measerr_mle +run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NULL, truth_formula=NULL){ + + accuracy <- df[,mean(w_pred==x)] + accuracy.y0 <- df[y<=0,mean(w_pred==x)] + accuracy.y1 <- df[y>=0,mean(w_pred==x)] + cor.y.xi <- cor(df$x - df$w_pred, df$y) + + fnr <- df[w_pred==0,mean(w_pred!=x)] + fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)] + fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)] + + fpr <- df[w_pred==1,mean(w_pred!=x)] + fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)] + fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)] + cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred) + + result <- append(result, list(accuracy=accuracy, + accuracy.y0=accuracy.y0, + accuracy.y1=accuracy.y1, + cor.y.xi=cor.y.xi, + fnr=fnr, + fnr.y0=fnr.y0, + fnr.y1=fnr.y1, + fpr=fpr, + fpr.y0=fpr.y0, + fpr.y1=fpr.y1, + cor.resid.w_pred=cor.resid.w_pred + )) + + result <- append(result, list(cor.xz=cor(df$x,df$z))) + (model.true <- lm(y ~ x + z, data=df)) + true.ci.Bxy <- confint(model.true)['x',] + true.ci.Bzy <- confint(model.true)['z',] + + result <- append(result, list(Bxy.est.true=coef(model.true)['x'], + Bzy.est.true=coef(model.true)['z'], + Bxy.ci.upper.true = true.ci.Bxy[2], + Bxy.ci.lower.true = true.ci.Bxy[1], + Bzy.ci.upper.true = true.ci.Bzy[2], + Bzy.ci.lower.true = true.ci.Bzy[1])) + + (model.feasible <- lm(y~x.obs+z,data=df)) feasible.ci.Bxy <- confint(model.feasible)['x.obs',] result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'], Bxy.ci.upper.feasible = feasible.ci.Bxy[2], Bxy.ci.lower.feasible = feasible.ci.Bxy[1])) - feasible.ci.Bgy <- confint(model.feasible)['g',] - result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'], - Bgy.ci.upper.feasible = feasible.ci.Bgy[2], - Bgy.ci.lower.feasible = feasible.ci.Bgy[1])) + feasible.ci.Bzy <- confint(model.feasible)['z',] + result <- append(result, list(Bzy.est.feasible=coef(model.feasible)['z'], + Bzy.ci.upper.feasible = feasible.ci.Bzy[2], + Bzy.ci.lower.feasible = feasible.ci.Bzy[1])) - (model.naive <- lm(y~w+g, data=df)) + (model.naive <- lm(y~w_pred+z, data=df)) - naive.ci.Bxy <- confint(model.naive)['w',] - naive.ci.Bgy <- confint(model.naive)['g',] + naive.ci.Bxy <- confint(model.naive)['w_pred',] + naive.ci.Bzy <- confint(model.naive)['z',] - result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], - Bgy.est.naive=coef(model.naive)['g'], + result <- append(result, list(Bxy.est.naive=coef(model.naive)['w_pred'], + Bzy.est.naive=coef(model.naive)['z'], Bxy.ci.upper.naive = naive.ci.Bxy[2], Bxy.ci.lower.naive = naive.ci.Bxy[1], - Bgy.ci.upper.naive = naive.ci.Bgy[2], - Bgy.ci.lower.naive = naive.ci.Bgy[1])) - + Bzy.ci.upper.naive = naive.ci.Bzy[2], + Bzy.ci.lower.naive = naive.ci.Bzy[1])) - ## multiple imputation when k is observed - ## amelia does great at this one. - noms <- c() - if(length(unique(df$x.obs)) <=2){ - noms <- c(noms, 'x.obs') - } + amelia_result <- list( + Bxy.est.amelia.full = NULL, + Bxy.ci.upper.amelia.full = NULL, + Bxy.ci.lower.amelia.full = NULL, + Bzy.est.amelia.full = NULL, + Bzy.ci.upper.amelia.full = NULL, + Bzy.ci.lower.amelia.full = NULL + ) - if(length(unique(df$g)) <=2){ - noms <- c(noms, 'g') - } + tryCatch({ + amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w')) + mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) + (coefse <- combine_coef_se(mod.amelia.k)) + est.x.mi <- coefse['x.obs','Estimate'] + est.x.se <- coefse['x.obs','Std.Error'] + est.z.mi <- coefse['z','Estimate'] + est.z.se <- coefse['z','Std.Error'] - amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=noms) - mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE) - (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) + + }, + + error = function(e){ + result[['error']] <- e} + ) + + + result <- append(result, amelia_result) - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) - est.g.mi <- coefse['g','Estimate'] - est.g.se <- coefse['g','Std.Error'] + mle_result <- list(Bxy.est.mle = NULL, + Bxy.ci.upper.mle = NULL, + Bxy.ci.lower.mle = NULL, + Bzy.est.mle = NULL, + Bzy.ci.upper.mle = NULL, + Bzy.ci.lower.mle = NULL) + tryCatch({ + temp.df <- copy(df) + temp.df <- temp.df[,x:=x.obs] + mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula) + fischer.info <- solve(mod.caroll.lik$hessian) + coef <- mod.caroll.lik$par + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 + mle_result <- list(Bxy.est.mle = coef['x'], + Bxy.ci.upper.mle = ci.upper['x'], + Bxy.ci.lower.mle = ci.lower['x'], + Bzy.est.mle = coef['z'], + Bzy.ci.upper.mle = ci.upper['z'], + Bzy.ci.lower.mle = ci.lower['z']) + }, + + error=function(e) {result[['error']] <- as.character(e) + }) + + + result <- append(result, mle_result) + + mod.zhang.lik <- zhang.mle.iv(df) + coef <- coef(mod.zhang.lik) + ci <- confint(mod.zhang.lik,method='quad') result <- append(result, - list(Bgy.est.amelia.full = est.g.mi, - Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se, - Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se - )) + list(Bxy.est.zhang = coef['Bxy'], + Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], + Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], + Bzy.est.zhang = coef['Bzy'], + Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], + Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms) @@ -112,10 +389,10 @@ run_simulation <- function(df, result){ df <- df[order(x.obs)] y <- df[,y] x <- df[,x.obs] - g <- df[,g] - w <- df[,w] + z <- df[,z] + w <- df[,w_pred] # gmm gets pretty close - (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE)) + (gmm.res <- predicted_covariates(y, x, z, w, v, train, p, max_iter=100, verbose=TRUE)) result <- append(result, list(Bxy.est.gmm = gmm.res$beta[1,1], @@ -125,28 +402,34 @@ run_simulation <- function(df, result){ )) result <- append(result, - list(Bgy.est.gmm = gmm.res$beta[2,1], - Bgy.ci.upper.gmm = gmm.res$confint[2,2], - Bgy.ci.lower.gmm = gmm.res$confint[2,1])) - + list(Bzy.est.gmm = gmm.res$beta[2,1], + Bzy.ci.upper.gmm = gmm.res$confint[2,2], + Bzy.ci.lower.gmm = gmm.res$confint[2,1])) - mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient') - (mod.calibrated.mle) - (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) - result <- append(result, list( - Bxy.est.mecor = mecor.ci['Estimate'], - Bxy.upper.mecor = mecor.ci['UCI'], - Bxy.lower.mecor = mecor.ci['LCI']) - ) - (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',]) + ## tryCatch({ + ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') + ## (mod.calibrated.mle) + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) + ## result <- append(result, list( + ## Bxy.est.mecor = mecor.ci['Estimate'], + ## Bxy.ci.upper.mecor = mecor.ci['UCI'], + ## Bxy.ci.lower.mecor = mecor.ci['LCI']) + ## ) - result <- append(result, list( - Bgy.est.mecor = mecor.ci['Estimate'], - Bgy.upper.mecor = mecor.ci['UCI'], - Bgy.lower.mecor = mecor.ci['LCI']) - ) + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['z',]) + ## result <- append(result, list( + ## Bzy.est.mecor = mecor.ci['Estimate'], + ## Bzy.ci.upper.mecor = mecor.ci['UCI'], + ## Bzy.ci.lower.mecor = mecor.ci['LCI']) + ## ) + ## }, + ## error = function(e){ + ## message("An error occurred:\n",e) + ## result$error <- paste0(result$error, '\n', e) + ## } + ## ) ## clean up memory ## rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))