simulations/example_2_continuous.R

   1 ### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate
   2 ### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
   3 ### Even when you include the proxy variable in the regression.
   4 ### But with some ground truth and multiple imputation, you can fix it.
   5
   6 library(argparser)
   7 library(mecor)
   8 library(ggplot2)
   9 library(data.table)
  10 library(filelock)
  11 library(arrow)
  12 library(Amelia)
  13 library(Zelig)
  14 library(predictionError)
  15 options(amelia.parallel="multicore",
  16         amelia.ncpus=40)
  17
  18 ## SETUP:
  19 ### we want to estimate g -> y and x -> y; g is observed, x is MAR
  20 ### we have k -> x; g -> x; g->k; k is used to predict x via the model w.
  21 ### we have k -> w; x -> w; w is observed.
  22 ### for illustration, g is binary (e.g., gender==male).
  23 ### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
  24 ### Whether a comment is "racial harassment" depends on context, like the kind of person (i.e.,) the race of the person making the comment
  25 ### e.g., a Black person saying "n-word" is less likely to be racial harassement than if a white person does it.
  26 ### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
  27 ### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into accont.
  28 ### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race.
  29
  30 ### simulation:
  31 #### how much power do we get from the model in the first place? (sweeping N and m)
  32 ####
  33 logistic <- function(x) {1/(1+exp(-1*x))}
  34
  35 simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
  36     set.seed(seed)
  37
  38     ## the true value of x
  39
  40     g <- rbinom(N, 1, 0.5)
  41     k <- rnorm(N, 0, 1)
  42     x <- Bkx*k + Bgx * g + rnorm(N,0,1)
  43     w.model <- lm(x ~ k)
  44     w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
  45     ## y = B0 + B1x + e
  46     y <-  Bxy * x  + Bgy * g + rnorm(N, 0, 1) + B0
  47     df <- data.table(x=x,k=k,y=y,w=w,g=g)
  48     if( m < N){
  49         df <- df[sample(nrow(df), m), x.obs := x]
  50     } else {
  51         df <- df[, x.obs := x]
  52     }
  53
  54     return(df)
  55 }
  56
  57
  58 run_simulation <-  function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
  59     result <- list()
  60     df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
  61
  62     result <- append(result, list(N=N,
  63                                   m=m,
  64                                   B0=B0,
  65                                   Bxy=Bxy,
  66                                   Bgy=Bgy,
  67                                   Bkx=Bkx,
  68                                   seed=seed))
  69
  70     correlation <- cor(df$w,df$x)
  71     result <- append(result, list(correlation=correlation))
  72
  73     model.true <- lm(y ~ x + g, data=df)
  74     true.ci.Bxy <- confint(model.true)['x',]
  75     true.ci.Bgy <- confint(model.true)['g',]
  76
  77     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
  78                                   Bgy.est.true=coef(model.true)['g'],
  79                                   Bxy.ci.upper.true = true.ci.Bxy[2],
  80                                   Bxy.ci.lower.true = true.ci.Bxy[1],
  81                                   Bgy.ci.upper.true = true.ci.Bgy[2],
  82                                   Bgy.ci.lower.true = true.ci.Bgy[1]))
  83
  84
  85     model.naive <- lm(y~w+g, data=df)
  86
  87     naive.ci.Bxy <- confint(model.naive)['w',]
  88     naive.ci.Bgy <- confint(model.naive)['g',]
  89
  90     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
  91                                   Bgy.est.naive=coef(model.naive)['g'],
  92                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
  93                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
  94                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
  95                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))
  96
  97
  98     ## multiple imputation when k is observed
  99     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
 100     mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
 101     coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
 102
 103     est.x.mi <- coefse['x.obs','Estimate']
 104     est.x.se <- coefse['x.obs','Std.Error']
 105     result <- append(result,
 106                      list(Bxy.est.amelia.full = est.x.mi,
 107                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
 108                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
 109                           ))
 110
 111     est.g.mi <- coefse['g','Estimate']
 112     est.g.se <- coefse['g','Std.Error']
 113
 114     result <- append(result,
 115                      list(Bgy.est.amelia.full = est.g.mi,
 116                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
 117                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
 118                           ))
 119
 120     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
 121     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
 122     mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
 123     coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
 124
 125     est.x.mi <- coefse['x.obs','Estimate']
 126     est.x.se <- coefse['x.obs','Std.Error']
 127     result <- append(result,
 128                      list(Bxy.est.amelia.nok = est.x.mi,
 129                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
 130                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
 131                           ))
 132
 133     est.g.mi <- coefse['g','Estimate']
 134     est.g.se <- coefse['g','Std.Error']
 135
 136     result <- append(result,
 137                      list(Bgy.est.amelia.nok = est.g.mi,
 138                           Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
 139                           Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
 140                           ))
 141
 142     p <- v <- train <- rep(0,N)
 143     M <- m
 144     p[(M+1):(N)] <- 1
 145     train[0:(M/2)] <- 1
 146     v[(M/2+1):(M)] <- 1
 147     df <- df[order(x.obs)]
 148     y <- df[,y]
 149     x <- df[,x.obs]
 150     g <- df[,g]
 151     w <- df[,w]
 152     gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
 153     result <- append(result,
 154                      list(Bxy.est.gmm = gmm.res$beta[1,1],
 155                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
 156                           Bxy.ci.lower.gmm = gmm.res$confint[1,1],
 157                           Bgy.est.gmm = gmm.res$beta[2,1],
 158                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
 159                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
 160
 161     return(result)
 162 }
 163
 164 Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
 165 ms <- c(30, 50, 100, 200, 300, 500)
 166 B0 <- 0
 167 Bxy <- 1
 168 Bgy <- 0.3
 169 Bkx <- 3
 170 Bgx <- -4
 171 seeds <- 1:100
 172
 173 rows <- list()
 174
 175 for(N in Ns){
 176     print(N)
 177     for(m in ms){
 178         if(N>m){
 179             for(seed in seeds){
 180                 rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
 181             }
 182         }
 183     }
 184 }
 185
 186 result <- rbindlist(rows)
 187 write_feather(result, "example_2_simulation_continuous.feather")