### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate ### This is the same as example 2, only instead of x->k we have k->x. ### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. ### Even when you include the proxy variable in the regression. ### But with some ground truth and multiple imputation, you can fix it. library(argparser) library(mecor) library(ggplot2) library(data.table) library(filelock) library(arrow) library(Amelia) library(Zelig) library(predictionError) options(amelia.parallel="no", amelia.ncpus=1) source("simulation_base_mecor.R") ## SETUP: ### we want to estimate x -> y; x is MAR ### we have x -> k; k -> w; x -> w is used to predict x via the model w. ### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments ### The labels x are binary, but the model provides a continuous predictor ### simulation: #### how much power do we get from the model in the first place? (sweeping N and m) #### logistic <- function(x) {1/(1+exp(-1*x))} simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ set.seed(seed) ## the true value of x g <- rbinom(N, 1, 0.5) xprime <- Bgx * g + rnorm(N,0,1) k <- Bkx*xprime + rnorm(N,0,3) x <- as.integer(logistic(scale(xprime)) > 0.5) y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 df <- data.table(x=x,k=k,y=y,g=g) if( m < N){ df <- df[sample(nrow(df), m), x.obs := x] } else { df <- df[, x.obs := x] } w.model <- glm(x ~ k,df, family=binomial(link='logit')) w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1) ## y = B0 + B1x + e df[,':='(w=w, w_pred = as.integer(w>0.5))] return(df) } schennach <- function(df){ fwx <- glm(x.obs~w, df, family=binomial(link='logit')) df[,xstar_pred := predict(fwx, df)] gxt <- lm(y ~ xstar_pred+g, df) } parser <- arg_parser("Simulate data and fit corrected models") parser <- add_argument(parser, "--N", default=100, help="number of observations of w") parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations") parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng') parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather') args <- parse_args(parser) B0 <- 0 Bxy <- 0.2 Bkx <- 0.5 rows <- list() B0 <- 0 Bxy <- 0.4 Bgy <- 0.25 Bkx <- 3.2 Bgx <- -0.5 outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed)) outfile_lock <- lock(paste0(args$outfile, '_lock')) if(file.exists(args$outfile)){ logdata <- read_feather(args$outfile) logdata <- rbind(logdata,as.data.table(outline)) } else { logdata <- as.data.table(outline) } print(outline) write_feather(logdata, args$outfile) unlock(outfile_lock) ## Ns <- c(1e6, 5e4, 1000) ## ms <- c(100, 250, 500, 1000) ## seeds <- 1:500 ## rowssets <- list() ## library(doParallel) ## options(mc.cores = parallel::detectCores()) ## cl <- makeCluster(20) ## registerDoParallel(cl) ## ## library(future) ## ## plan(multiprocess,workers=40,gc=TRUE) ## for(N in Ns){ ## print(N) ## for(m in ms){ ## if(N>m){ ## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'), ## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar% ## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter) ## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))} ## rowsets <- append(rowssets, list(data.table(new.rows))) ## } ## } ## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) ## } ## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w}, ## packages=c('mecor','Amelia','Zelig','predictionError'), ## seed=TRUE))) ## df <- rbindlist(rowsets) ## write_feather(df,"example_2B_simulation.feather") ## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ ## result <- list() ## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) ## result <- append(result, list(N=N, ## m=m, ## B0=B0, ## Bxy=Bxy, ## Bgy=Bgy, ## Bkx=Bkx, ## seed=seed)) ## (accuracy <- df[,.(mean(w_pred==x))]) ## result <- append(result, list(accuracy=accuracy)) ## (model.true <- lm(y ~ x + g, data=df)) ## true.ci.Bxy <- confint(model.true)['x',] ## true.ci.Bgy <- confint(model.true)['g',] ## result <- append(result, list(Bxy.est.true=coef(model.true)['x'], ## Bgy.est.true=coef(model.true)['g'], ## Bxy.ci.upper.true = true.ci.Bxy[2], ## Bxy.ci.lower.true = true.ci.Bxy[1], ## Bgy.ci.upper.true = true.ci.Bgy[2], ## Bgy.ci.lower.true = true.ci.Bgy[1])) ## (model.feasible <- lm(y~x.obs+g,data=df)) ## feasible.ci.Bxy <- confint(model.feasible)['x.obs',] ## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'], ## Bxy.ci.upper.feasible = feasible.ci.Bxy[2], ## Bxy.ci.lower.feasible = feasible.ci.Bxy[1])) ## feasible.ci.Bgy <- confint(model.feasible)['g',] ## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'], ## Bgy.ci.upper.feasible = feasible.ci.Bgy[2], ## Bgy.ci.lower.feasible = feasible.ci.Bgy[1])) ## (model.naive <- lm(y~w+g, data=df)) ## naive.ci.Bxy <- confint(model.naive)['w',] ## naive.ci.Bgy <- confint(model.naive)['g',] ## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], ## Bgy.est.naive=coef(model.naive)['g'], ## Bxy.ci.upper.naive = naive.ci.Bxy[2], ## Bxy.ci.lower.naive = naive.ci.Bxy[1], ## Bgy.ci.upper.naive = naive.ci.Bgy[2], ## Bgy.ci.lower.naive = naive.ci.Bgy[1])) ## ## multiple imputation when k is observed ## ## amelia does great at this one. ## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w')) ## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE) ## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) ## est.x.mi <- coefse['x.obs','Estimate'] ## est.x.se <- coefse['x.obs','Std.Error'] ## result <- append(result, ## list(Bxy.est.amelia.full = est.x.mi, ## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, ## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se ## )) ## est.g.mi <- coefse['g','Estimate'] ## est.g.se <- coefse['g','Std.Error'] ## result <- append(result, ## list(Bgy.est.amelia.full = est.g.mi, ## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se, ## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se ## )) ## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w")) ## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) ## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)) ## est.x.mi <- coefse['x.obs','Estimate'] ## est.x.se <- coefse['x.obs','Std.Error'] ## result <- append(result, ## list(Bxy.est.amelia.nok = est.x.mi, ## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, ## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se ## )) ## est.g.mi <- coefse['g','Estimate'] ## est.g.se <- coefse['g','Std.Error'] ## result <- append(result, ## list(Bgy.est.amelia.nok = est.g.mi, ## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, ## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se ## )) ## p <- v <- train <- rep(0,N) ## M <- m ## p[(M+1):(N)] <- 1 ## v[1:(M)] <- 1 ## df <- df[order(x.obs)] ## y <- df[,y] ## x <- df[,x.obs] ## g <- df[,g] ## w <- df[,w] ## # gmm gets pretty close ## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)) ## result <- append(result, ## list(Bxy.est.gmm = gmm.res$beta[1,1], ## Bxy.ci.upper.gmm = gmm.res$confint[1,2], ## Bxy.ci.lower.gmm = gmm.res$confint[1,1])) ## result <- append(result, ## list(Bgy.est.gmm = gmm.res$beta[2,1], ## Bgy.ci.upper.gmm = gmm.res$confint[2,2], ## Bgy.ci.lower.gmm = gmm.res$confint[2,1])) ## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient') ## (mod.calibrated.mle) ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) ## result <- append(result, list( ## Bxy.est.mecor = mecor.ci['Estimate'], ## Bxy.upper.mecor = mecor.ci['UCI'], ## Bxy.lower.mecor = mecor.ci['LCI']) ## ) ## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',]) ## result <- append(result, list( ## Bxy.est.mecor = mecor.ci['Estimate'], ## Bxy.upper.mecor = mecor.ci['UCI'], ## Bxy.lower.mecor = mecor.ci['LCI']) ## ) ## return(result) ## }