X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/blobdiff_plain/6057688060b5bf2a94f2b96b65b275a91991c0f3..e41d11afb9a80180feff844666e3ee463d20a7cd:/simulations/mi_simulations/example_2_binary.R

diff --git a/simulations/mi_simulations/example_2_binary.R b/simulations/mi_simulations/example_2_binary.R
deleted file mode 100644
index 0e0d65c..0000000
--- a/simulations/mi_simulations/example_2_binary.R
+++ /dev/null
@@ -1,169 +0,0 @@
-### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate
-### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
-### Even when you include the proxy variable in the regression.
-### But with some ground truth and multiple imputation, you can fix it.
-
-library(argparser)
-library(mecor)
-library(ggplot2)
-library(data.table)
-library(filelock)
-library(arrow)
-library(Amelia)
-library(Zelig)
-
-options(amelia.parallel="multicore",
-        amelia.ncpus=40)
-
-## SETUP:
-### we want to estimate g -> y and x -> y; g is observed, x is MAR
-### we have k -> x; g -> x; g->k; k is used to predict x via the model w.
-### we have k -> w; x -> w; w is observed.
-### for illustration, g is binary (e.g., gender==male).
-### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
-### Whether a comment is "racial harassment" depends on context, like the kind of person (i.e.,) the race of the person making the comment
-### e.g., a Black person saying "n-word" is less likely to be racial harassement than if a white person does it.
-### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
-### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into accont.
-### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race. 
-
-### simulation:
-#### how much power do we get from the model in the first place? (sweeping N and m)
-#### 
-logistic <- function(x) {1/(1+exp(-1*x))}
-
-simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-    set.seed(seed)
-
-    ## the true value of x
-
-    g <- rbinom(N, 1, 0.5)
-    k <- rnorm(N, 0, 1)
-    xprime <- Bkx*k + Bgx * g
-    x <- rbinom(N, 1, logistic(xprime - mean(xprime)))
-    w.model <- glm(x ~ k,family='binomial')
-    w <- as.integer(predict(w.model,data.frame(k=k),type='response') > 0.5)
-    ## y = B0 + B1x + e
-    y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0
-    df <- data.table(x=x,k=k,y=y,w=w,g=g)
-    if( m < N){
-        df <- df[sample(nrow(df), m), x.obs := x]
-    } else {
-        df <- df[, x.obs := x]
-    }
-
-    return(df)
-}
-
-
-run_simulation <-  function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-    result <- list()
-    df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
-
-    result <- append(result, list(N=N,
-                                  m=m,
-                                  B0=B0,
-                                  Bxy=Bxy,
-                                  Bgy=Bgy,
-                                  Bkx=Bkx,
-                                  seed=seed))
-
-    accuracy <- df[,.(mean(w==x))]$V1
-    result <- append(result, list(accuracy=accuracy))
-
-    model.true <- lm(y ~ x + g, data=df)
-    true.ci.Bxy <- confint(model.true)['x',]
-    true.ci.Bgy <- confint(model.true)['g',]
-
-    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
-                                  Bgy.est.true=coef(model.true)['g'],
-                                  Bxy.ci.upper.true = true.ci.Bxy[2],
-                                  Bxy.ci.lower.true = true.ci.Bxy[1],
-                                  Bgy.ci.upper.true = true.ci.Bgy[2],
-                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
-                                  
-
-    model.naive <- lm(y~w+g, data=df)
-    
-    naive.ci.Bxy <- confint(model.naive)['w',]
-    naive.ci.Bgy <- confint(model.naive)['g',]
-
-    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
-                                  Bgy.est.naive=coef(model.naive)['g'],
-                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
-                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
-                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
-                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
-                                  
-
-    ## multiple imputation when k is observed
-    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'),noms=c("x.obs","w","g"))
-    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
-    coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
-
-    est.x.mi <- coefse['x.obs','Estimate']
-    est.x.se <- coefse['x.obs','Std.Error']
-    result <- append(result,
-                     list(Bxy.est.amelia.full = est.x.mi,
-                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
-                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
-                          ))
-
-    est.g.mi <- coefse['g','Estimate']
-    est.g.se <- coefse['g','Std.Error']
-
-    result <- append(result,
-                     list(Bgy.est.amelia.full = est.g.mi,
-                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
-                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
-                          ))
-
-    ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
-    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs","w",'g'))
-    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
-    coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
-
-    est.x.mi <- coefse['x.obs','Estimate']
-    est.x.se <- coefse['x.obs','Std.Error']
-    result <- append(result,
-                     list(Bxy.est.amelia.nok = est.x.mi,
-                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
-                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
-                          ))
-
-    est.g.mi <- coefse['g','Estimate']
-    est.g.se <- coefse['g','Std.Error']
-
-    result <- append(result,
-                     list(Bgy.est.amelia.nok = est.g.mi,
-                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
-                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
-                          ))
-
-    return(result)
-}
-
-Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
-ms <- c(30, 50, 100, 200, 300, 500)
-B0 <- 0
-Bxy <- 1
-Bgy <- 0.3
-Bkx <- 3
-Bgx <- -4
-seeds <- 1:100
-
-rows <- list()
-
-for(N in Ns){
-    print(N)
-    for(m in ms){
-        if(N>m){
-            for(seed in seeds){
-                rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
-            }
-        }
-    }
-}
-
-result <- rbindlist(rows)
-write_feather(result, "example_2_simulation.feather")