X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/6057688060b5bf2a94f2b96b65b275a91991c0f3..e41d11afb9a80180feff844666e3ee463d20a7cd:/simulations/mi_simulations/example_2_binary.R diff --git a/simulations/mi_simulations/example_2_binary.R b/simulations/mi_simulations/example_2_binary.R deleted file mode 100644 index 0e0d65c..0000000 --- a/simulations/mi_simulations/example_2_binary.R +++ /dev/null @@ -1,169 +0,0 @@ -### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate -### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. -### Even when you include the proxy variable in the regression. -### But with some ground truth and multiple imputation, you can fix it. - -library(argparser) -library(mecor) -library(ggplot2) -library(data.table) -library(filelock) -library(arrow) -library(Amelia) -library(Zelig) - -options(amelia.parallel="multicore", - amelia.ncpus=40) - -## SETUP: -### we want to estimate g -> y and x -> y; g is observed, x is MAR -### we have k -> x; g -> x; g->k; k is used to predict x via the model w. -### we have k -> w; x -> w; w is observed. -### for illustration, g is binary (e.g., gender==male). -### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments -### Whether a comment is "racial harassment" depends on context, like the kind of person (i.e.,) the race of the person making the comment -### e.g., a Black person saying "n-word" is less likely to be racial harassement than if a white person does it. -### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer. -### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into accont. -### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race. - -### simulation: -#### how much power do we get from the model in the first place? (sweeping N and m) -#### -logistic <- function(x) {1/(1+exp(-1*x))} - -simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ - set.seed(seed) - - ## the true value of x - - g <- rbinom(N, 1, 0.5) - k <- rnorm(N, 0, 1) - xprime <- Bkx*k + Bgx * g - x <- rbinom(N, 1, logistic(xprime - mean(xprime))) - w.model <- glm(x ~ k,family='binomial') - w <- as.integer(predict(w.model,data.frame(k=k),type='response') > 0.5) - ## y = B0 + B1x + e - y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 - df <- data.table(x=x,k=k,y=y,w=w,g=g) - if( m < N){ - df <- df[sample(nrow(df), m), x.obs := x] - } else { - df <- df[, x.obs := x] - } - - return(df) -} - - -run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ - result <- list() - df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) - - result <- append(result, list(N=N, - m=m, - B0=B0, - Bxy=Bxy, - Bgy=Bgy, - Bkx=Bkx, - seed=seed)) - - accuracy <- df[,.(mean(w==x))]$V1 - result <- append(result, list(accuracy=accuracy)) - - model.true <- lm(y ~ x + g, data=df) - true.ci.Bxy <- confint(model.true)['x',] - true.ci.Bgy <- confint(model.true)['g',] - - result <- append(result, list(Bxy.est.true=coef(model.true)['x'], - Bgy.est.true=coef(model.true)['g'], - Bxy.ci.upper.true = true.ci.Bxy[2], - Bxy.ci.lower.true = true.ci.Bxy[1], - Bgy.ci.upper.true = true.ci.Bgy[2], - Bgy.ci.lower.true = true.ci.Bgy[1])) - - - model.naive <- lm(y~w+g, data=df) - - naive.ci.Bxy <- confint(model.naive)['w',] - naive.ci.Bgy <- confint(model.naive)['g',] - - result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], - Bgy.est.naive=coef(model.naive)['g'], - Bxy.ci.upper.naive = naive.ci.Bxy[2], - Bxy.ci.lower.naive = naive.ci.Bxy[1], - Bgy.ci.upper.naive = naive.ci.Bgy[2], - Bgy.ci.lower.naive = naive.ci.Bgy[1])) - - - ## multiple imputation when k is observed - amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'),noms=c("x.obs","w","g")) - mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE) - coefse <- combine_coef_se(mod.amelia.k, messages=FALSE) - - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.full = est.x.mi, - Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se - )) - - est.g.mi <- coefse['g','Estimate'] - est.g.se <- coefse['g','Std.Error'] - - result <- append(result, - list(Bgy.est.amelia.full = est.g.mi, - Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se, - Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se - )) - - ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. - amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs","w",'g')) - mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) - coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE) - - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.nok = est.x.mi, - Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se - )) - - est.g.mi <- coefse['g','Estimate'] - est.g.se <- coefse['g','Std.Error'] - - result <- append(result, - list(Bgy.est.amelia.nok = est.g.mi, - Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, - Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se - )) - - return(result) -} - -Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500) -ms <- c(30, 50, 100, 200, 300, 500) -B0 <- 0 -Bxy <- 1 -Bgy <- 0.3 -Bkx <- 3 -Bgx <- -4 -seeds <- 1:100 - -rows <- list() - -for(N in Ns){ - print(N) - for(m in ms){ - if(N>m){ - for(seed in seeds){ - rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed))) - } - } - } -} - -result <- rbindlist(rows) -write_feather(result, "example_2_simulation.feather")