### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate ### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. ### Even when you include the proxy variable in the regression. ### But with some ground truth and multiple imputation, you can fix it. library(argparser) library(mecor) library(ggplot2) library(data.table) library(filelock) library(arrow) library(Amelia) library(Zelig) library(predictionError) options(amelia.parallel="multicore", amelia.ncpus=40) ## SETUP: ### we want to estimate g -> y and x -> y; g is observed, x is MAR ### we have k -> x; g -> x; g->k; k is used to predict x via the model w. ### we have k -> w; x -> w; w is observed. ### for illustration, g is binary (e.g., gender==male). ### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments ### Whether a comment is "racial harassment" depends on context, like the kind of person (i.e.,) the race of the person making the comment ### e.g., a Black person saying "n-word" is less likely to be racial harassement than if a white person does it. ### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer. ### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into accont. ### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race. ### simulation: #### how much power do we get from the model in the first place? (sweeping N and m) #### logistic <- function(x) {1/(1+exp(-1*x))} simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ set.seed(seed) ## the true value of x g <- rbinom(N, 1, 0.5) k <- rnorm(N, 0, 1) x <- Bkx*k + Bgx * g + rnorm(N,0,1) w.model <- lm(x ~ k) w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1) ## y = B0 + B1x + e y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 df <- data.table(x=x,k=k,y=y,w=w,g=g) if( m < N){ df <- df[sample(nrow(df), m), x.obs := x] } else { df <- df[, x.obs := x] } return(df) } run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ result <- list() df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) result <- append(result, list(N=N, m=m, B0=B0, Bxy=Bxy, Bgy=Bgy, Bkx=Bkx, seed=seed)) correlation <- cor(df$w,df$x) result <- append(result, list(correlation=correlation)) model.true <- lm(y ~ x + g, data=df) true.ci.Bxy <- confint(model.true)['x',] true.ci.Bgy <- confint(model.true)['g',] result <- append(result, list(Bxy.est.true=coef(model.true)['x'], Bgy.est.true=coef(model.true)['g'], Bxy.ci.upper.true = true.ci.Bxy[2], Bxy.ci.lower.true = true.ci.Bxy[1], Bgy.ci.upper.true = true.ci.Bgy[2], Bgy.ci.lower.true = true.ci.Bgy[1])) model.naive <- lm(y~w+g, data=df) naive.ci.Bxy <- confint(model.naive)['w',] naive.ci.Bgy <- confint(model.naive)['g',] result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], Bgy.est.naive=coef(model.naive)['g'], Bxy.ci.upper.naive = naive.ci.Bxy[2], Bxy.ci.lower.naive = naive.ci.Bxy[1], Bgy.ci.upper.naive = naive.ci.Bgy[2], Bgy.ci.lower.naive = naive.ci.Bgy[1])) ## multiple imputation when k is observed amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x')) mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE) coefse <- combine_coef_se(mod.amelia.k, messages=FALSE) est.x.mi <- coefse['x.obs','Estimate'] est.x.se <- coefse['x.obs','Std.Error'] result <- append(result, list(Bxy.est.amelia.full = est.x.mi, Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se )) est.g.mi <- coefse['g','Estimate'] est.g.se <- coefse['g','Std.Error'] result <- append(result, list(Bgy.est.amelia.full = est.g.mi, Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se, Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se )) ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k")) mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE) est.x.mi <- coefse['x.obs','Estimate'] est.x.se <- coefse['x.obs','Std.Error'] result <- append(result, list(Bxy.est.amelia.nok = est.x.mi, Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se )) est.g.mi <- coefse['g','Estimate'] est.g.se <- coefse['g','Std.Error'] result <- append(result, list(Bgy.est.amelia.nok = est.g.mi, Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se )) p <- v <- train <- rep(0,N) M <- m p[(M+1):(N)] <- 1 train[0:(M/2)] <- 1 v[(M/2+1):(M)] <- 1 df <- df[order(x.obs)] y <- df[,y] x <- df[,x.obs] g <- df[,g] w <- df[,w] gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE) result <- append(result, list(Bxy.est.gmm = gmm.res$beta[1,1], Bxy.ci.upper.gmm = gmm.res$confint[1,2], Bxy.ci.lower.gmm = gmm.res$confint[1,1], Bgy.est.gmm = gmm.res$beta[2,1], Bgy.ci.upper.gmm = gmm.res$confint[2,2], Bgy.ci.lower.gmm = gmm.res$confint[2,1])) return(result) } Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500) ms <- c(30, 50, 100, 200, 300, 500) B0 <- 0 Bxy <- 1 Bgy <- 0.3 Bkx <- 3 Bgx <- -4 seeds <- 1:100 rows <- list() for(N in Ns){ print(N) for(m in ms){ if(N>m){ for(seed in seeds){ rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed))) } } } } result <- rbindlist(rows) write_feather(result, "example_2_simulation_continuous.feather")