X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/46e2d1fe4876a9ed906b723f9e5f74fcc949e339..69948cae1e691191fc86e6abdaa485bc98f38f1f:/simulations/simulation_base.R diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R index 0f03276..73544e9 100644 --- a/simulations/simulation_base.R +++ b/simulations/simulation_base.R @@ -7,6 +7,7 @@ library(Zelig) library(bbmle) library(matrixStats) # for numerically stable logsumexps +source("pl_methods.R") source("measerr_methods.R") ## for my more generic function. ## This uses the pseudolikelihood approach from Carroll page 349. @@ -36,85 +37,7 @@ my.pseudo.mle <- function(df){ } - -## model from Zhang's arxiv paper, with predictions for y -## Zhang got this model from Hausman 1998 -### I think this is actually eqivalent to the pseudo.mle method -zhang.mle.iv <- function(df){ - nll <- function(B0=0, Bxy=0, Bzy=0, sigma_y=0.1, ppv=0.9, npv=0.9){ - df.obs <- df[!is.na(x.obs)] - df.unobs <- df[is.na(x.obs)] - - ## fpr = 1 - TNR - ### Problem: accounting for uncertainty in ppv / npv - - ll.w1x1.obs <- with(df.obs[(w_pred==1)], dbinom(x.obs,size=1,prob=ppv,log=T)) - ll.w0x0.obs <- with(df.obs[(w_pred==0)], dbinom(1-x.obs,size=1,prob=npv,log=T)) - - ## fnr = 1 - TPR - ll.y.obs <- with(df.obs, dnorm(y, B0 + Bxy * x + Bzy * z, sd=sigma_y,log=T)) - ll <- sum(ll.y.obs) - ll <- ll + sum(ll.w1x1.obs) + sum(ll.w0x0.obs) - - # unobserved case; integrate out x - ll.x.1 <- with(df.unobs, dnorm(y, B0 + Bxy + Bzy * z, sd = sigma_y, log=T)) - ll.x.0 <- with(df.unobs, dnorm(y, B0 + Bzy * z, sd = sigma_y,log=T)) - - ## case x == 1 - lls.x.1 <- colLogSumExps(rbind(log(ppv) + ll.x.1, log(1-ppv) + ll.x.0)) - - ## case x == 0 - lls.x.0 <- colLogSumExps(rbind(log(1-npv) + ll.x.1, log(npv) + ll.x.0)) - - lls <- colLogSumExps(rbind(lls.x.1, lls.x.0)) - ll <- ll + sum(lls) - return(-ll) - } - mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), lower=list(sigma_y=0.0001, B0=-Inf, Bxy=-Inf, Bzy=-Inf,ppv=0.00001, npv=0.00001), - upper=list(sigma_y=Inf, B0=Inf, Bxy=Inf, Bzy=Inf, ppv=0.99999,npv=0.99999),method='L-BFGS-B') - return(mlefit) -} - -## this is equivalent to the pseudo-liklihood model from Carolla -zhang.mle.dv <- function(df){ - - nll <- function(B0=0, Bxy=0, Bzy=0, ppv=0.9, npv=0.9){ - df.obs <- df[!is.na(y.obs)] - - ## fpr = 1 - TNR - ll.w0y0 <- with(df.obs[y.obs==0],dbinom(1-w_pred,1,npv,log=TRUE)) - ll.w1y1 <- with(df.obs[y.obs==1],dbinom(w_pred,1,ppv,log=TRUE)) - - # observed case - ll.y.obs <- vector(mode='numeric', length=nrow(df.obs)) - ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z,log=T)) - ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z,log=T,lower.tail=FALSE)) - - ll <- sum(ll.y.obs) + sum(ll.w0y0) + sum(ll.w1y1) - - # unobserved case; integrate out y - ## case y = 1 - ll.y.1 <- vector(mode='numeric', length=nrow(df)) - pi.y.1 <- with(df,plogis(B0 + Bxy * x + Bzy*z, log=T)) - ## P(w=1| y=1)P(y=1) + P(w=0|y=1)P(y=1) = P(w=1,y=1) + P(w=0,y=1) - lls.y.1 <- colLogSumExps(rbind(log(ppv) + pi.y.1, log(1-ppv) + pi.y.1)) - - ## case y = 0 - ll.y.0 <- vector(mode='numeric', length=nrow(df)) - pi.y.0 <- with(df,plogis(B0 + Bxy * x + Bzy*z, log=T,lower.tail=FALSE)) - - ## P(w=1 | y=0)P(y=0) + P(w=0|y=0)P(y=0) = P(w=1,y=0) + P(w=0,y=0) - lls.y.0 <- colLogSumExps(rbind(log(npv) + pi.y.0, log(1-npv) + pi.y.0)) - - lls <- colLogSumExps(rbind(lls.y.1, lls.y.0)) - ll <- ll + sum(lls) - return(-ll) - } - mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6),method='L-BFGS-B',lower=list(B0=-Inf, Bxy=-Inf, Bzy=-Inf, ppv=0.001,npv=0.001), - upper=list(B0=Inf, Bxy=Inf, Bzy=Inf,ppv=0.999,npv=0.999)) - return(mlefit) -} - + ## This uses the likelihood approach from Carroll page 353. ## assumes that we have a good measurement error model my.mle <- function(df){ @@ -168,13 +91,25 @@ my.mle <- function(df){ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~y){ - accuracy <- df[,mean(w_pred==y)] + (accuracy <- df[,mean(w_pred==y)]) result <- append(result, list(accuracy=accuracy)) - + (error.cor.z <- cor(df$z, df$y - df$w_pred)) + (error.cor.x <- cor(df$x, df$y - df$w_pred)) + (error.cor.y <- cor(df$y, df$y - df$w_pred)) + result <- append(result, list(error.cor.x = error.cor.x, + error.cor.z = error.cor.z, + error.cor.y = error.cor.y)) + + model.null <- glm(y~1, data=df,family=binomial(link='logit')) (model.true <- glm(y ~ x + z, data=df,family=binomial(link='logit'))) + (lik.ratio <- exp(logLik(model.true) - logLik(model.null))) + true.ci.Bxy <- confint(model.true)['x',] true.ci.Bzy <- confint(model.true)['z',] + result <- append(result, list(cor.xz=cor(df$x,df$z))) + result <- append(result, list(lik.ratio=lik.ratio)) + result <- append(result, list(Bxy.est.true=coef(model.true)['x'], Bzy.est.true=coef(model.true)['z'], Bxy.ci.upper.true = true.ci.Bxy[2], @@ -211,15 +146,15 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu naivecont.ci.Bxy <- confint(model.naive.cont)['x',] naivecont.ci.Bzy <- confint(model.naive.cont)['z',] - ## my implementatoin of liklihood based correction + ## my implementation of liklihood based correction temp.df <- copy(df) temp.df[,y:=y.obs] mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula) - fisher.info <- solve(mod.caroll.lik$hessian) + fischer.info <- solve(mod.caroll.lik$hessian) coef <- mod.caroll.lik$par - ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96 - ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96 + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 result <- append(result, list(Bxy.est.mle = coef['x'], Bxy.ci.upper.mle = ci.upper['x'], @@ -241,36 +176,39 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu Bzy.est.zhang = coef['Bzy'], Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) - + + # amelia says use normal distribution for binary variables. + amelia_result <- list(Bxy.est.amelia.full = NA, + Bxy.ci.upper.amelia.full = NA, + Bxy.ci.lower.amelia.full = NA, + Bzy.est.amelia.full = NA, + Bzy.ci.upper.amelia.full = NA, + Bzy.ci.lower.amelia.full = NA + ) + tryCatch({ amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w')) mod.amelia.k <- zelig(y.obs~x+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) est.x.mi <- coefse['x','Estimate'] est.x.se <- coefse['x','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) est.z.mi <- coefse['z','Estimate'] est.z.se <- coefse['z','Std.Error'] - - result <- append(result, - list(Bzy.est.amelia.full = est.z.mi, - Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, - Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se - )) - + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) }, error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error,'\n', e) - }) - + result[['error']] <- e} + ) + result <- append(result,amelia_result) return(result) @@ -278,11 +216,36 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu ## outcome_formula, proxy_formula, and truth_formula are passed to measerr_mle -run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~x, truth_formula=x~z){ +run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NULL, truth_formula=NULL){ accuracy <- df[,mean(w_pred==x)] - result <- append(result, list(accuracy=accuracy)) - + accuracy.y0 <- df[y<=0,mean(w_pred==x)] + accuracy.y1 <- df[y>=0,mean(w_pred==x)] + cor.y.xi <- cor(df$x - df$w_pred, df$y) + + fnr <- df[w_pred==0,mean(w_pred!=x)] + fnr.y0 <- df[(w_pred==0) & (y<=0),mean(w_pred!=x)] + fnr.y1 <- df[(w_pred==0) & (y>=0),mean(w_pred!=x)] + + fpr <- df[w_pred==1,mean(w_pred!=x)] + fpr.y0 <- df[(w_pred==1) & (y<=0),mean(w_pred!=x)] + fpr.y1 <- df[(w_pred==1) & (y>=0),mean(w_pred!=x)] + cor.resid.w_pred <- cor(resid(lm(y~x+z,df)),df$w_pred) + + result <- append(result, list(accuracy=accuracy, + accuracy.y0=accuracy.y0, + accuracy.y1=accuracy.y1, + cor.y.xi=cor.y.xi, + fnr=fnr, + fnr.y0=fnr.y0, + fnr.y1=fnr.y1, + fpr=fpr, + fpr.y0=fpr.y0, + fpr.y1=fpr.y1, + cor.resid.w_pred=cor.resid.w_pred + )) + + result <- append(result, list(cor.xz=cor(df$x,df$z))) (model.true <- lm(y ~ x + z, data=df)) true.ci.Bxy <- confint(model.true)['x',] true.ci.Bzy <- confint(model.true)['z',] @@ -317,79 +280,83 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_p Bxy.ci.lower.naive = naive.ci.Bxy[1], Bzy.ci.upper.naive = naive.ci.Bzy[2], Bzy.ci.lower.naive = naive.ci.Bzy[1])) - - tryCatch({ - amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred')) - mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) - (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + amelia_result <- list( + Bxy.est.amelia.full = NULL, + Bxy.ci.upper.amelia.full = NULL, + Bxy.ci.lower.amelia.full = NULL, + Bzy.est.amelia.full = NULL, + Bzy.ci.upper.amelia.full = NULL, + Bzy.ci.lower.amelia.full = NULL + ) - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) + tryCatch({ + amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w')) + mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) + (coefse <- combine_coef_se(mod.amelia.k)) - est.z.mi <- coefse['z','Estimate'] - est.z.se <- coefse['z','Std.Error'] + est.x.mi <- coefse['x.obs','Estimate'] + est.x.se <- coefse['x.obs','Std.Error'] + est.z.mi <- coefse['z','Estimate'] + est.z.se <- coefse['z','Std.Error'] - result <- append(result, - list(Bzy.est.amelia.full = est.z.mi, - Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, - Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se - )) + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) }, + error = function(e){ - message("An error occurred:\n",e) - result$error <-paste0(result$error,'\n', e) - } + result[['error']] <- e} ) + + result <- append(result, amelia_result) + + + mle_result <- list(Bxy.est.mle = NULL, + Bxy.ci.upper.mle = NULL, + Bxy.ci.lower.mle = NULL, + Bzy.est.mle = NULL, + Bzy.ci.upper.mle = NULL, + Bzy.ci.lower.mle = NULL) + tryCatch({ temp.df <- copy(df) temp.df <- temp.df[,x:=x.obs] mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula) - fisher.info <- solve(mod.caroll.lik$hessian) + fischer.info <- solve(mod.caroll.lik$hessian) coef <- mod.caroll.lik$par - ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96 - ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96 - - - result <- append(result, - list(Bxy.est.mle = coef['x'], - Bxy.ci.upper.mle = ci.upper['x'], - Bxy.ci.lower.mle = ci.lower['x'], - Bzy.est.mle = coef['z'], - Bzy.ci.upper.mle = ci.upper['z'], - Bzy.ci.lower.mle = ci.lower['z'])) + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 + mle_result <- list(Bxy.est.mle = coef['x'], + Bxy.ci.upper.mle = ci.upper['x'], + Bxy.ci.lower.mle = ci.lower['x'], + Bzy.est.mle = coef['z'], + Bzy.ci.upper.mle = ci.upper['z'], + Bzy.ci.lower.mle = ci.lower['z']) }, - error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error,'\n', e) + error=function(e) {result[['error']] <- as.character(e) }) - tryCatch({ - - mod.zhang.lik <- zhang.mle.iv(df) - coef <- coef(mod.zhang.lik) - ci <- confint(mod.zhang.lik,method='quad') - result <- append(result, - list(Bxy.est.zhang = coef['Bxy'], - Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], - Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], - Bzy.est.zhang = coef['Bzy'], - Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], - Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) - }, + + result <- append(result, mle_result) - error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error,'\n', e) - }) + mod.zhang.lik <- zhang.mle.iv(df) + coef <- coef(mod.zhang.lik) + ci <- confint(mod.zhang.lik,method='quad') + result <- append(result, + list(Bxy.est.zhang = coef['Bxy'], + Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], + Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], + Bzy.est.zhang = coef['Bzy'], + Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], + Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms) @@ -440,29 +407,29 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_p Bzy.ci.lower.gmm = gmm.res$confint[2,1])) - tryCatch({ - mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') - (mod.calibrated.mle) - (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) - result <- append(result, list( - Bxy.est.mecor = mecor.ci['Estimate'], - Bxy.ci.upper.mecor = mecor.ci['UCI'], - Bxy.ci.lower.mecor = mecor.ci['LCI']) - ) - - (mecor.ci <- summary(mod.calibrated.mle)$c$ci['z',]) - - result <- append(result, list( - Bzy.est.mecor = mecor.ci['Estimate'], - Bzy.ci.upper.mecor = mecor.ci['UCI'], - Bzy.ci.lower.mecor = mecor.ci['LCI']) - ) - }, - error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error, '\n', e) - } - ) + ## tryCatch({ + ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') + ## (mod.calibrated.mle) + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) + ## result <- append(result, list( + ## Bxy.est.mecor = mecor.ci['Estimate'], + ## Bxy.ci.upper.mecor = mecor.ci['UCI'], + ## Bxy.ci.lower.mecor = mecor.ci['LCI']) + ## ) + + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['z',]) + + ## result <- append(result, list( + ## Bzy.est.mecor = mecor.ci['Estimate'], + ## Bzy.ci.upper.mecor = mecor.ci['UCI'], + ## Bzy.ci.lower.mecor = mecor.ci['LCI']) + ## ) + ## }, + ## error = function(e){ + ## message("An error occurred:\n",e) + ## result$error <- paste0(result$error, '\n', e) + ## } + ## ) ## clean up memory ## rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))