X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/d0c5766bdf867a81a2477d2cac1d40812110af90..69948cae1e691191fc86e6abdaa485bc98f38f1f:/simulations/simulation_base.R diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R index 27f0276..73544e9 100644 --- a/simulations/simulation_base.R +++ b/simulations/simulation_base.R @@ -7,6 +7,7 @@ library(Zelig) library(bbmle) library(matrixStats) # for numerically stable logsumexps +source("pl_methods.R") source("measerr_methods.R") ## for my more generic function. ## This uses the pseudolikelihood approach from Carroll page 349. @@ -36,124 +37,6 @@ my.pseudo.mle <- function(df){ } - -## model from Zhang's arxiv paper, with predictions for y -## Zhang got this model from Hausman 1998 -### I think this is actually eqivalent to the pseudo.mle method -zhang.mle.iv <- function(df){ - df.obs <- df[!is.na(x.obs)] - df.unobs <- df[is.na(x.obs)] - - tn <- df.obs[(w_pred == 0) & (x.obs == w_pred),.N] - pn <- df.obs[(w_pred==0), .N] - npv <- tn / pn - - tp <- df.obs[(w_pred==1) & (x.obs == w_pred),.N] - pp <- df.obs[(w_pred==1),.N] - ppv <- tp / pp - - nll <- function(B0=0, Bxy=0, Bzy=0, sigma_y=0.1){ - - ## fpr = 1 - TNR - ### Problem: accounting for uncertainty in ppv / npv - - ## fnr = 1 - TPR - ll.y.obs <- with(df.obs, dnorm(y, B0 + Bxy * x + Bzy * z, sd=sigma_y,log=T)) - ll <- sum(ll.y.obs) - - # unobserved case; integrate out x - ll.x.1 <- with(df.unobs, dnorm(y, B0 + Bxy + Bzy * z, sd = sigma_y, log=T)) - ll.x.0 <- with(df.unobs, dnorm(y, B0 + Bzy * z, sd = sigma_y,log=T)) - - ## case x == 1 - lls.x.1 <- colLogSumExps(rbind(log(ppv) + ll.x.1, log(1-ppv) + ll.x.0)) - - ## case x == 0 - lls.x.0 <- colLogSumExps(rbind(log(1-npv) + ll.x.1, log(npv) + ll.x.0)) - - lls <- colLogSumExps(rbind(df.unobs$w_pred * lls.x.1, (1-df.unobs$w_pred) * lls.x.0)) - ll <- ll + sum(lls) - return(-ll) - } - mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6), lower=list(sigma_y=0.0001, B0=-Inf, Bxy=-Inf, Bzy=-Inf), - upper=list(sigma_y=Inf, B0=Inf, Bxy=Inf, Bzy=Inf),method='L-BFGS-B') - return(mlefit) -} - -## this is equivalent to the pseudo-liklihood model from Caroll -## zhang.mle.dv <- function(df){ - -## nll <- function(B0=0, Bxy=0, Bzy=0, ppv=0.9, npv=0.9){ -## df.obs <- df[!is.na(y.obs)] - -## ## fpr = 1 - TNR -## ll.w0y0 <- with(df.obs[y.obs==0],dbinom(1-w_pred,1,npv,log=TRUE)) -## ll.w1y1 <- with(df.obs[y.obs==1],dbinom(w_pred,1,ppv,log=TRUE)) - -## # observed case -## ll.y.obs <- vector(mode='numeric', length=nrow(df.obs)) -## ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z,log=T)) -## ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z,log=T,lower.tail=FALSE)) - -## ll <- sum(ll.y.obs) + sum(ll.w0y0) + sum(ll.w1y1) - -## # unobserved case; integrate out y -## ## case y = 1 -## ll.y.1 <- vector(mode='numeric', length=nrow(df)) -## pi.y.1 <- with(df,plogis(B0 + Bxy * x + Bzy*z, log=T)) -## ## P(w=1| y=1)P(y=1) + P(w=0|y=1)P(y=1) = P(w=1,y=1) + P(w=0,y=1) -## lls.y.1 <- colLogSumExps(rbind(log(ppv) + pi.y.1, log(1-ppv) + pi.y.1)) - -## ## case y = 0 -## ll.y.0 <- vector(mode='numeric', length=nrow(df)) -## pi.y.0 <- with(df,plogis(B0 + Bxy * x + Bzy*z, log=T,lower.tail=FALSE)) - -## ## P(w=1 | y=0)P(y=0) + P(w=0|y=0)P(y=0) = P(w=1,y=0) + P(w=0,y=0) -## lls.y.0 <- colLogSumExps(rbind(log(npv) + pi.y.0, log(1-npv) + pi.y.0)) - -## lls <- colLogSumExps(rbind(lls.y.1, lls.y.0)) -## ll <- ll + sum(lls) -## return(-ll) -## } -## mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6),method='L-BFGS-B',lower=list(B0=-Inf, Bxy=-Inf, Bzy=-Inf, ppv=0.001,npv=0.001), -## upper=list(B0=Inf, Bxy=Inf, Bzy=Inf,ppv=0.999,npv=0.999)) -## return(mlefit) -## } - -zhang.mle.dv <- function(df){ - df.obs <- df[!is.na(y.obs)] - df.unobs <- df[is.na(y.obs)] - - fp <- df.obs[(w_pred==1) & (y.obs != w_pred),.N] - p <- df.obs[(w_pred==1),.N] - fpr <- fp / p - fn <- df.obs[(w_pred==0) & (y.obs != w_pred), .N] - n <- df.obs[(w_pred==0),.N] - fnr <- fn / n - - nll <- function(B0=0, Bxy=0, Bzy=0){ - - - ## observed case - ll.y.obs <- vector(mode='numeric', length=nrow(df.obs)) - ll.y.obs[df.obs$y.obs==1] <- with(df.obs[y.obs==1], plogis(B0 + Bxy * x + Bzy * z,log=T)) - ll.y.obs[df.obs$y.obs==0] <- with(df.obs[y.obs==0], plogis(B0 + Bxy * x + Bzy * z,log=T,lower.tail=FALSE)) - - ll <- sum(ll.y.obs) - - pi.y.1 <- with(df,plogis(B0 + Bxy * x + Bzy*z, log=T)) - pi.y.0 <- with(df,plogis(B0 + Bxy * x + Bzy*z, log=T,lower.tail=FALSE)) - - lls <- with(df.unobs, colLogSumExps(rbind(w_pred * colLogSumExps(rbind(log(fpr), log(1 - fnr - fpr)+pi.y.1)), - (1-w_pred) * colLogSumExps(rbind(log(1-fpr), log(1 - fnr - fpr)+pi.y.0))))) - - ll <- ll + sum(lls) - return(-ll) - } - mlefit <- mle2(minuslogl = nll, control=list(maxit=1e6),method='L-BFGS-B',lower=c(B0=-Inf, Bxy=-Inf, Bzy=-Inf), - upper=c(B0=Inf, Bxy=Inf, Bzy=Inf)) - return(mlefit) -} ## This uses the likelihood approach from Carroll page 353. ## assumes that we have a good measurement error model @@ -208,10 +91,14 @@ my.mle <- function(df){ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formula=w_pred~y){ - accuracy <- df[,mean(w_pred==y)] + (accuracy <- df[,mean(w_pred==y)]) result <- append(result, list(accuracy=accuracy)) - error.cor.x <- cor(df$x, df$w - df$x) - result <- append(result, list(error.cor.x = error.cor.x)) + (error.cor.z <- cor(df$z, df$y - df$w_pred)) + (error.cor.x <- cor(df$x, df$y - df$w_pred)) + (error.cor.y <- cor(df$y, df$y - df$w_pred)) + result <- append(result, list(error.cor.x = error.cor.x, + error.cor.z = error.cor.z, + error.cor.y = error.cor.y)) model.null <- glm(y~1, data=df,family=binomial(link='logit')) (model.true <- glm(y ~ x + z, data=df,family=binomial(link='logit'))) @@ -220,7 +107,7 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu true.ci.Bxy <- confint(model.true)['x',] true.ci.Bzy <- confint(model.true)['z',] - + result <- append(result, list(cor.xz=cor(df$x,df$z))) result <- append(result, list(lik.ratio=lik.ratio)) result <- append(result, list(Bxy.est.true=coef(model.true)['x'], @@ -264,10 +151,10 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu temp.df <- copy(df) temp.df[,y:=y.obs] mod.caroll.lik <- measerr_mle_dv(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula) - fisher.info <- solve(mod.caroll.lik$hessian) + fischer.info <- solve(mod.caroll.lik$hessian) coef <- mod.caroll.lik$par - ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96 - ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96 + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 result <- append(result, list(Bxy.est.mle = coef['x'], Bxy.ci.upper.mle = ci.upper['x'], @@ -293,33 +180,35 @@ run_simulation_depvar <- function(df, result, outcome_formula=y~x+z, proxy_formu # amelia says use normal distribution for binary variables. + amelia_result <- list(Bxy.est.amelia.full = NA, + Bxy.ci.upper.amelia.full = NA, + Bxy.ci.lower.amelia.full = NA, + Bzy.est.amelia.full = NA, + Bzy.ci.upper.amelia.full = NA, + Bzy.ci.lower.amelia.full = NA + ) + tryCatch({ amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('y','ystar','w')) mod.amelia.k <- zelig(y.obs~x+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) est.x.mi <- coefse['x','Estimate'] est.x.se <- coefse['x','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) est.z.mi <- coefse['z','Estimate'] est.z.se <- coefse['z','Std.Error'] - - result <- append(result, - list(Bzy.est.amelia.full = est.z.mi, - Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, - Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se - )) - + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) }, error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error,'\n', e) - }) - + result[['error']] <- e} + ) + result <- append(result,amelia_result) return(result) @@ -391,79 +280,83 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL Bxy.ci.lower.naive = naive.ci.Bxy[1], Bzy.ci.upper.naive = naive.ci.Bzy[2], Bzy.ci.lower.naive = naive.ci.Bzy[1])) - - tryCatch({ - amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w')) - mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) - (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + amelia_result <- list( + Bxy.est.amelia.full = NULL, + Bxy.ci.upper.amelia.full = NULL, + Bxy.ci.lower.amelia.full = NULL, + Bzy.est.amelia.full = NULL, + Bzy.ci.upper.amelia.full = NULL, + Bzy.ci.lower.amelia.full = NULL + ) - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) + tryCatch({ + amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w')) + mod.amelia.k <- zelig(y~x.obs+z, model='ls', data=amelia.out.k$imputations, cite=FALSE) + (coefse <- combine_coef_se(mod.amelia.k)) - est.z.mi <- coefse['z','Estimate'] - est.z.se <- coefse['z','Std.Error'] + est.x.mi <- coefse['x.obs','Estimate'] + est.x.se <- coefse['x.obs','Std.Error'] + est.z.mi <- coefse['z','Estimate'] + est.z.se <- coefse['z','Std.Error'] - result <- append(result, - list(Bzy.est.amelia.full = est.z.mi, - Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, - Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se - )) + amelia_result <- list(Bxy.est.amelia.full = est.x.mi, + Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, + Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se, + Bzy.est.amelia.full = est.z.mi, + Bzy.ci.upper.amelia.full = est.z.mi + 1.96 * est.z.se, + Bzy.ci.lower.amelia.full = est.z.mi - 1.96 * est.z.se + ) }, + error = function(e){ - message("An error occurred:\n",e) - result$error <-paste0(result$error,'\n', e) - } + result[['error']] <- e} ) + + result <- append(result, amelia_result) + + + mle_result <- list(Bxy.est.mle = NULL, + Bxy.ci.upper.mle = NULL, + Bxy.ci.lower.mle = NULL, + Bzy.est.mle = NULL, + Bzy.ci.upper.mle = NULL, + Bzy.ci.lower.mle = NULL) + tryCatch({ temp.df <- copy(df) temp.df <- temp.df[,x:=x.obs] mod.caroll.lik <- measerr_mle(temp.df, outcome_formula=outcome_formula, proxy_formula=proxy_formula, truth_formula=truth_formula) - fisher.info <- solve(mod.caroll.lik$hessian) + fischer.info <- solve(mod.caroll.lik$hessian) coef <- mod.caroll.lik$par - ci.upper <- coef + sqrt(diag(fisher.info)) * 1.96 - ci.lower <- coef - sqrt(diag(fisher.info)) * 1.96 - - - result <- append(result, - list(Bxy.est.mle = coef['x'], - Bxy.ci.upper.mle = ci.upper['x'], - Bxy.ci.lower.mle = ci.lower['x'], - Bzy.est.mle = coef['z'], - Bzy.ci.upper.mle = ci.upper['z'], - Bzy.ci.lower.mle = ci.lower['z'])) + ci.upper <- coef + sqrt(diag(fischer.info)) * 1.96 + ci.lower <- coef - sqrt(diag(fischer.info)) * 1.96 + mle_result <- list(Bxy.est.mle = coef['x'], + Bxy.ci.upper.mle = ci.upper['x'], + Bxy.ci.lower.mle = ci.lower['x'], + Bzy.est.mle = coef['z'], + Bzy.ci.upper.mle = ci.upper['z'], + Bzy.ci.lower.mle = ci.lower['z']) }, - error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error,'\n', e) + error=function(e) {result[['error']] <- as.character(e) }) - tryCatch({ - - mod.zhang.lik <- zhang.mle.iv(df) - coef <- coef(mod.zhang.lik) - ci <- confint(mod.zhang.lik,method='quad') - result <- append(result, - list(Bxy.est.zhang = coef['Bxy'], - Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], - Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], - Bzy.est.zhang = coef['Bzy'], - Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], - Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) - }, + + result <- append(result, mle_result) - error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error,'\n', e) - }) + mod.zhang.lik <- zhang.mle.iv(df) + coef <- coef(mod.zhang.lik) + ci <- confint(mod.zhang.lik,method='quad') + result <- append(result, + list(Bxy.est.zhang = coef['Bxy'], + Bxy.ci.upper.zhang = ci['Bxy','97.5 %'], + Bxy.ci.lower.zhang = ci['Bxy','2.5 %'], + Bzy.est.zhang = coef['Bzy'], + Bzy.ci.upper.zhang = ci['Bzy','97.5 %'], + Bzy.ci.lower.zhang = ci['Bzy','2.5 %'])) ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms) @@ -514,29 +407,29 @@ run_simulation <- function(df, result, outcome_formula=y~x+z, proxy_formula=NUL Bzy.ci.lower.gmm = gmm.res$confint[2,1])) - tryCatch({ - mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') - (mod.calibrated.mle) - (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) - result <- append(result, list( - Bxy.est.mecor = mecor.ci['Estimate'], - Bxy.ci.upper.mecor = mecor.ci['UCI'], - Bxy.ci.lower.mecor = mecor.ci['LCI']) - ) - - (mecor.ci <- summary(mod.calibrated.mle)$c$ci['z',]) - - result <- append(result, list( - Bzy.est.mecor = mecor.ci['Estimate'], - Bzy.ci.upper.mecor = mecor.ci['UCI'], - Bzy.ci.lower.mecor = mecor.ci['LCI']) - ) - }, - error = function(e){ - message("An error occurred:\n",e) - result$error <- paste0(result$error, '\n', e) - } - ) + ## tryCatch({ + ## mod.calibrated.mle <- mecor(y ~ MeasError(w_pred, reference = x.obs) + z, df, B=400, method='efficient') + ## (mod.calibrated.mle) + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) + ## result <- append(result, list( + ## Bxy.est.mecor = mecor.ci['Estimate'], + ## Bxy.ci.upper.mecor = mecor.ci['UCI'], + ## Bxy.ci.lower.mecor = mecor.ci['LCI']) + ## ) + + ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['z',]) + + ## result <- append(result, list( + ## Bzy.est.mecor = mecor.ci['Estimate'], + ## Bzy.ci.upper.mecor = mecor.ci['UCI'], + ## Bzy.ci.lower.mecor = mecor.ci['LCI']) + ## ) + ## }, + ## error = function(e){ + ## message("An error occurred:\n",e) + ## result$error <- paste0(result$error, '\n', e) + ## } + ## ) ## clean up memory ## rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible"))