1 ### EXAMPLE 2: demonstrates how measurement error can lead to a Type S (sign) error in a covariate
2 ### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
3 ### Even when you include the proxy variable in the regression.
4 ### But with some ground truth and multiple imputation, you can fix it.
14 library(predictionError)
15 options(amelia.parallel="multicore",
19 ### we want to estimate g -> y and x -> y; g is observed, x is MAR
20 ### we have k -> x; g -> x; g->k; k is used to predict x via the model w.
21 ### we have k -> w; x -> w; w is observed.
22 ### for illustration, g is binary (e.g., gender==male).
23 ### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
24 ### Whether a comment is "racial harassment" depends on context, such as who is speaking (e.g., the race of the person making the comment)
25 ### e.g., a Black person saying "n-word" is less likely to be racial harassment than a white person saying it.
26 ### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
27 ### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into account.
28 ### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racial harassing comment and of their race.
31 #### how much power do we get from the model in the first place? (sweeping N and m)
## Standard logistic (inverse-logit) transform, applied elementwise:
## maps any real-valued input into the open interval (0, 1).
logistic <- function(x) {
  denominator <- 1 + exp(-1 * x)
  1 / denominator
}
35 simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
38 ## the true value of x
40 g <- rbinom(N, 1, 0.5)
42 x <- Bkx*k + Bgx * g + rnorm(N,0,1)
44 w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
46 y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
47 df <- data.table(x=x,k=k,y=y,w=w,g=g)
49 df <- df[sample(nrow(df), m), x.obs := x]
51 df <- df[, x.obs := x]
58 run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
60 df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
62 result <- append(result, list(N=N,
70 correlation <- cor(df$w,df$x)
71 result <- append(result, list(correlation=correlation))
73 model.true <- lm(y ~ x + g, data=df)
74 true.ci.Bxy <- confint(model.true)['x',]
75 true.ci.Bgy <- confint(model.true)['g',]
77 result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
78 Bgy.est.true=coef(model.true)['g'],
79 Bxy.ci.upper.true = true.ci.Bxy[2],
80 Bxy.ci.lower.true = true.ci.Bxy[1],
81 Bgy.ci.upper.true = true.ci.Bgy[2],
82 Bgy.ci.lower.true = true.ci.Bgy[1]))
85 model.naive <- lm(y~w+g, data=df)
87 naive.ci.Bxy <- confint(model.naive)['w',]
88 naive.ci.Bgy <- confint(model.naive)['g',]
90 result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
91 Bgy.est.naive=coef(model.naive)['g'],
92 Bxy.ci.upper.naive = naive.ci.Bxy[2],
93 Bxy.ci.lower.naive = naive.ci.Bxy[1],
94 Bgy.ci.upper.naive = naive.ci.Bgy[2],
95 Bgy.ci.lower.naive = naive.ci.Bgy[1]))
98 ## multiple imputation when k is observed
99 amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
100 mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
101 coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
103 est.x.mi <- coefse['x.obs','Estimate']
104 est.x.se <- coefse['x.obs','Std.Error']
105 result <- append(result,
106 list(Bxy.est.amelia.full = est.x.mi,
107 Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
108 Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
111 est.g.mi <- coefse['g','Estimate']
112 est.g.se <- coefse['g','Std.Error']
114 result <- append(result,
115 list(Bgy.est.amelia.full = est.g.mi,
116 Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
117 Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
120 ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
121 amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
122 mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
123 coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
125 est.x.mi <- coefse['x.obs','Estimate']
126 est.x.se <- coefse['x.obs','Std.Error']
127 result <- append(result,
128 list(Bxy.est.amelia.nok = est.x.mi,
129 Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
130 Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
133 est.g.mi <- coefse['g','Estimate']
134 est.g.se <- coefse['g','Std.Error']
136 result <- append(result,
137 list(Bgy.est.amelia.nok = est.g.mi,
138 Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
139 Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
142 p <- v <- train <- rep(0,N)
147 df <- df[order(x.obs)]
152 gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
153 result <- append(result,
154 list(Bxy.est.gmm = gmm.res$beta[1,1],
155 Bxy.ci.upper.gmm = gmm.res$confint[1,2],
156 Bxy.ci.lower.gmm = gmm.res$confint[1,1],
157 Bgy.est.gmm = gmm.res$beta[2,1],
158 Bgy.ci.upper.gmm = gmm.res$confint[2,2],
159 Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
164 Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
165 ms <- c(30, 50, 100, 200, 300, 500)
180 rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
186 result <- rbindlist(rows)
187 write_feather(result, "example_2_simulation_continuous.feather")