From f8f58301e0285118f7b669a96ed9367a9914ba02 Mon Sep 17 00:00:00 2001
From: Nathan TeBlunthuis
Date: Mon, 27 Jun 2022 22:48:20 -0700
Subject: [PATCH 1/1] make first simulation with precise accuracies and explained variances

---
 simulations/01_two_covariates.R               |  85 ++++++
 .../{example_3.R => 02_indep_differential.R}  |  47 +--
 simulations/example_2_B.R                     | 276 ------------------
 simulations/simulation_base.R                 |  44 +--
 4 files changed, 112 insertions(+), 340 deletions(-)
 create mode 100644 simulations/01_two_covariates.R
 rename simulations/{example_3.R => 02_indep_differential.R} (70%)
 delete mode 100644 simulations/example_2_B.R

diff --git a/simulations/01_two_covariates.R b/simulations/01_two_covariates.R
new file mode 100644
index 0000000..c52a3dc
--- /dev/null
+++ b/simulations/01_two_covariates.R
@@ -0,0 +1,85 @@
+### EXAMPLE 2_b: demonstrates how measurement error can lead to a sign error in a covariate
+### Here the model's prediction is a direct, imperfect proxy for the binary x; there is no latent k.
+### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign,
+### even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is MAR
+### we have a model w_pred that predicts the binary x with a fixed accuracy
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
+### The labels x are binary, and here the model also provides a binary prediction
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+
+simulate_data <- function(N, m, B0=0, Bxy=0.2, Bgy=-0.2, Bgx=0.2, y_explained_variance=0.025, gx_explained_variance=0.15, prediction_accuracy=0.73, seed=1){
+    set.seed(seed)
+    g <- rbinom(N, 1, 0.5)
+
+    x.var.epsilon <- var(Bgx * g) * ((1-gx_explained_variance)/gx_explained_variance)
+    x.epsilon <- rnorm(N, sd=sqrt(x.var.epsilon))
+    xprime <- Bgx * g + x.epsilon
+    x <- as.integer(logistic(scale(xprime)) > 0.5)
+
+    y.var.epsilon <- (var(Bgy * g) + var(Bxy * x) + 2*cov(Bxy*x, Bgy*g)) * ((1-y_explained_variance)/y_explained_variance)
+    y.epsilon <- rnorm(N, sd = sqrt(y.var.epsilon))
+    y <- B0 + Bgy * g + Bxy * x + y.epsilon
+
+    df <- data.table(x=x, xprime=xprime, y=y, g=g)
+
+    if(m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    df <- df[, w_pred := x]
+
+    df <- df[sample(1:N, (1-prediction_accuracy)*N), w_pred := (w_pred-1)**2] ## flip a (1-accuracy) share of labels: (0-1)^2=1, (1-1)^2=0
+    df <- df[, w := w_pred] ## w is the model's (binary) prediction
+    return(df)
+}
+
+parser <- arg_parser("Simulate data and fit corrected models")
+parser <- add_argument(parser, "--N", default=500, help="number of observations of w")
+parser <- add_argument(parser, "--m", default=100, help="the number of ground truth observations")
+parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
+parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
+args <- parse_args(parser)
+
+B0 <- 0
+Bxy <- 0.2
+Bgy <- -0.2
+Bgx <- 0.5
+
+df <- simulate_data(args$N, args$m, B0, Bxy, Bgy, Bgx, seed=args$seed, y_explained_variance = 0.025, gx_explained_variance = 0.15)
+result <- list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bgx'=Bgx, 'seed'=args$seed)
+outline <- run_simulation(df, result)
+
+outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
+if(file.exists(args$outfile)){
+    logdata <- read_feather(args$outfile)
+    logdata <- rbind(logdata,as.data.table(outline))
+} else {
+    logdata <- as.data.table(outline)
+}
+
+print(outline)
+write_feather(logdata, args$outfile)
+unlock(outfile_lock)
diff --git a/simulations/example_3.R b/simulations/02_indep_differential.R
similarity index 70%
rename from simulations/example_3.R
rename to simulations/02_indep_differential.R
index 389dba1..5a7784b 100644
--- a/simulations/example_3.R
+++ b/simulations/02_indep_differential.R
@@ -1,6 +1,5 @@
-### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate
+### EXAMPLE 1: demonstrates how measurement error can lead to a sign error in a covariate
 ### What kind of data invalidates fong + tyler?
-### This is the same as example 2, only instead of x->k we have k->x.
 ### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
 ### Even when you include the proxy variable in the regression.
 ### But with some ground truth and multiple imputation, you can fix it.
@@ -32,7 +31,7 @@ source("simulation_base.R")
 ## one way to do it is by adding correlation to x.obs and y that isn't in w.
 ## in other words, the model is missing an important feature of x.obs that's related to y.
-simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+simulate_data <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed, xy.explained.variance=0.01, u.explained.variance=0.1){
     set.seed(seed)
 
     ## the true value of x
 
@@ -40,7 +39,7 @@ simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
     g <- rbinom(N, 1, 0.5)
 
     # make w and y dependent
-    u <- rnorm(N,0,Bxy)
+    u <- rnorm(N,0,1)
 
     xprime <- Bgx * g + rnorm(N,0,1)
 
@@ -48,7 +47,7 @@ simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
 
     x <- as.integer(logistic(scale(xprime)) > 0.5)
 
-    y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
+    y <- Bxy * x + Bgy * g + B0 + u + rnorm(N, 0, 1)
 
     df <- data.table(x=x,k=k,y=y,g=g)
 
@@ -69,42 +68,6 @@ simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
     return(df)
 }
 
-## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-##     set.seed(seed)
-
-##     ## the true value of x
-
-##     g <- rbinom(N, 1, 0.5)
-
-##     # make w and y dependent
-##     u <- rnorm(N,0,5)
-
-##     xprime <- Bgx * g + rnorm(N,0,1)
-
-##     k <- Bkx*xprime + rnorm(N,0,3)
-
-##     x <- as.integer(logistic(scale(xprime+0.3)) > 0.5)
-
-##     y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u
-
-##     df <- data.table(x=x,k=k,y=y,g=g)
-
-##     w.model <- glm(x ~ k, df, family=binomial(link='logit'))
-
-##     if( m < N){
-##         df <- df[sample(nrow(df), m), x.obs := x]
-##     } else {
-##         df <- df[, x.obs := x]
-##     }
-
-##     w <- predict(w.model,data.frame(k=k)) + u
-##     ## y = B0 + B1x + e
-
-##     df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)]
-##     return(df)
-## }
-
-
 schennach <- function(df){
 
     fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
@@ -128,7 +91,7 @@ Bkx <- 2
 Bgx <- 0
 
-outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
+outline <- run_simulation(simulate_data(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
                          ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
 
 outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
diff --git a/simulations/example_2_B.R b/simulations/example_2_B.R
deleted file mode 100644
index 91623a1..0000000
--- a/simulations/example_2_B.R
+++ /dev/null
@@ -1,276 +0,0 @@
-### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate
-### This is the same as example 2, only instead of x->k we have k->x.
-### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
-### Even when you include the proxy variable in the regression.
-### But with some ground truth and multiple imputation, you can fix it.
-
-library(argparser)
-library(mecor)
-library(ggplot2)
-library(data.table)
-library(filelock)
-library(arrow)
-library(Amelia)
-library(Zelig)
-library(predictionError)
-options(amelia.parallel="no",
-        amelia.ncpus=1)
-
-source("simulation_base.R")
-
-## SETUP:
-### we want to estimate x -> y; x is MAR
-### we have x -> k; k -> w; x -> w is used to predict x via the model w.
-### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
-### The labels x are binary, but the model provides a continuous predictor
-
-### simulation:
-#### how much power do we get from the model in the first place? (sweeping N and m)
-####
-logistic <- function(x) {1/(1+exp(-1*x))}
-
-simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-    set.seed(seed)
-
-    ## the true value of x
-
-    g <- rbinom(N, 1, 0.5)
-    xprime <- Bgx * g + rnorm(N,0,1)
-
-    k <- Bkx*xprime + rnorm(N,0,3)
-
-    x <- as.integer(logistic(scale(xprime)) > 0.5)
-
-    y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0
-    df <- data.table(x=x,k=k,y=y,g=g)
-
-    if( m < N){
-        df <- df[sample(nrow(df), m), x.obs := x]
-    } else {
-        df <- df[, x.obs := x]
-    }
-
-    w.model <- glm(x ~ k,df, family=binomial(link='logit'))
-    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
-    ## y = B0 + B1x + e
-
-    df[,':='(w=w, w_pred = as.integer(w>0.5))]
-    return(df)
-}
-
-schennach <- function(df){
-
-    fwx <- glm(x.obs~w, df, family=binomial(link='logit'))
-    df[,xstar_pred := predict(fwx, df)]
-    gxt <- lm(y ~ xstar_pred+g, df)
-
-}
-
-parser <- arg_parser("Simulate data and fit corrected models")
-parser <- add_argument(parser, "--N", default=100, help="number of observations of w")
-parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations")
-parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng')
-parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather')
-args <- parse_args(parser)
-
-rows <- list()
-
-B0 <- 0
-Bxy <- 0.2
-Bgy <- -0.2
-Bkx <- 1
-Bgx <- 3
-
-outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed)
-                         ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed))
-
-outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE)
-if(file.exists(args$outfile)){
-    logdata <- read_feather(args$outfile)
-    logdata <- rbind(logdata,as.data.table(outline))
-} else {
-    logdata <- as.data.table(outline)
-}
-
-print(outline)
-write_feather(logdata, args$outfile)
-unlock(outfile_lock)
-
-## Ns <- c(1e6, 5e4, 1000)
-## ms <- c(100, 250, 500, 1000)
-## seeds <- 1:500
-
-## rowssets <- list()
-## library(doParallel)
-## options(mc.cores = parallel::detectCores())
-## cl <- makeCluster(20)
-## registerDoParallel(cl)
-
-## ## library(future)
-
-## ## plan(multiprocess,workers=40,gc=TRUE)
-
-## for(N in Ns){
-##     print(N)
-##     for(m in ms){
-##         if(N>m){
-##             new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'),
-##                                 .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar%
-
-##                 {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter)
-##                                 ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))}
-##             rowsets <- append(rowssets, list(data.table(new.rows)))
-##         }
-
-##     }
-##     ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
-##     ##                                                 ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))},
-##     ##                                                 packages=c('mecor','Amelia','Zelig','predictionError'),
-##     ##                                                 seed=TRUE)))
-## }
-
-## df <- rbindlist(rowsets)
-
-## write_feather(df,"example_2B_simulation.feather")
-
-## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
-##     result <- list()
-##     df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
-
-##     result <- append(result, list(N=N,
-##                                   m=m,
-##                                   B0=B0,
-##                                   Bxy=Bxy,
-##                                   Bgy=Bgy,
-##                                   Bkx=Bkx,
-##                                   seed=seed))
-
-##     (accuracy <- df[,.(mean(w_pred==x))])
-##     result <- append(result, list(accuracy=accuracy))
-##     (model.true <- lm(y ~ x + g, data=df))
-##     true.ci.Bxy <- confint(model.true)['x',]
-##     true.ci.Bgy <- confint(model.true)['g',]
-
-##     result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
-##                                   Bgy.est.true=coef(model.true)['g'],
-##                                   Bxy.ci.upper.true = true.ci.Bxy[2],
-##                                   Bxy.ci.lower.true = true.ci.Bxy[1],
-##                                   Bgy.ci.upper.true = true.ci.Bgy[2],
-##                                   Bgy.ci.lower.true = true.ci.Bgy[1]))
-
-##     (model.feasible <- lm(y~x.obs+g,data=df))
-
-##     feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
-##     result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
-##                                   Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
-##                                   Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
-
-##     feasible.ci.Bgy <- confint(model.feasible)['g',]
-##     result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
-##                                   Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
-##                                   Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
-
-##     (model.naive <- lm(y~w+g, data=df))
-
-##     naive.ci.Bxy <- confint(model.naive)['w',]
-##     naive.ci.Bgy <- confint(model.naive)['g',]
-
-##     result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
-##                                   Bgy.est.naive=coef(model.naive)['g'],
-##                                   Bxy.ci.upper.naive = naive.ci.Bxy[2],
-##                                   Bxy.ci.lower.naive = naive.ci.Bxy[1],
-##                                   Bgy.ci.upper.naive = naive.ci.Bgy[2],
-##                                   Bgy.ci.lower.naive = naive.ci.Bgy[1]))
-
-
-##     ## multiple imputation when k is observed
-##     ## amelia does great at this one.
-##     amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w'))
-##     mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
-##     (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
-
-##     est.x.mi <- coefse['x.obs','Estimate']
-##     est.x.se <- coefse['x.obs','Std.Error']
-##     result <- append(result,
-##                      list(Bxy.est.amelia.full = est.x.mi,
-##                           Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
-##                           Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
-##                           ))
-
-##     est.g.mi <- coefse['g','Estimate']
-##     est.g.se <- coefse['g','Std.Error']
-
-##     result <- append(result,
-##                      list(Bgy.est.amelia.full = est.g.mi,
-##                           Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
-##                           Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
-##                           ))
-
-##     ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
-##     amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w"))
-##     mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
-##     (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
-
-##     est.x.mi <- coefse['x.obs','Estimate']
-##     est.x.se <- coefse['x.obs','Std.Error']
-##     result <- append(result,
-##                      list(Bxy.est.amelia.nok = est.x.mi,
-##                           Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
-##                           Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
-##                           ))
-
-##     est.g.mi <- coefse['g','Estimate']
-##     est.g.se <- coefse['g','Std.Error']
-
-##     result <- append(result,
-##                      list(Bgy.est.amelia.nok = est.g.mi,
-##                           Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
-##                           Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
-##                           ))
-
-##     p <- v <- train <- rep(0,N)
-##     M <- m
-##     p[(M+1):(N)] <- 1
-##     v[1:(M)] <- 1
-##     df <- df[order(x.obs)]
-##     y <- df[,y]
-##     x <- df[,x.obs]
-##     g <- df[,g]
-##     w <- df[,w]
-##     # gmm gets pretty close
-##     (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE))
-
-##     result <- append(result,
-##                      list(Bxy.est.gmm = gmm.res$beta[1,1],
-##                           Bxy.ci.upper.gmm = gmm.res$confint[1,2],
-##                           Bxy.ci.lower.gmm = gmm.res$confint[1,1]))
-
-##     result <- append(result,
-##                      list(Bgy.est.gmm = gmm.res$beta[2,1],
-##                           Bgy.ci.upper.gmm = gmm.res$confint[2,2],
-##                           Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
-
-
-##     mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
-##     (mod.calibrated.mle)
-##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
-##     result <- append(result, list(
-##         Bxy.est.mecor = mecor.ci['Estimate'],
-##         Bxy.upper.mecor = mecor.ci['UCI'],
-##         Bxy.lower.mecor = mecor.ci['LCI'])
-##     )
-
-##     (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
-
-##     result <- append(result, list(
-##         Bxy.est.mecor = mecor.ci['Estimate'],
-##         Bxy.upper.mecor = mecor.ci['UCI'],
-##         Bxy.lower.mecor = mecor.ci['LCI'])
-##     )
-
-
-##     return(result)
-## }
-
diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R
index 26916ab..345d14e 100644
--- a/simulations/simulation_base.R
+++ b/simulations/simulation_base.R
@@ -82,26 +82,26 @@ run_simulation <- function(df, result){
                           ))
 
    ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model.
- amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"), noms=noms) - mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) - (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)) - - est.x.mi <- coefse['x.obs','Estimate'] - est.x.se <- coefse['x.obs','Std.Error'] - result <- append(result, - list(Bxy.est.amelia.nok = est.x.mi, - Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, - Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se - )) - - est.g.mi <- coefse['g','Estimate'] - est.g.se <- coefse['g','Std.Error'] - - result <- append(result, - list(Bgy.est.amelia.nok = est.g.mi, - Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, - Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se - )) + ## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","w_pred"), noms=noms) + ## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) + ## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)) + + ## est.x.mi <- coefse['x.obs','Estimate'] + ## est.x.se <- coefse['x.obs','Std.Error'] + ## result <- append(result, + ## list(Bxy.est.amelia.nok = est.x.mi, + ## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, + ## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se + ## )) + + ## est.g.mi <- coefse['g','Estimate'] + ## est.g.se <- coefse['g','Std.Error'] + + ## result <- append(result, + ## list(Bgy.est.amelia.nok = est.g.mi, + ## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, + ## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se + ## )) N <- nrow(df) m <- nrow(df[!is.na(x.obs)]) @@ -148,8 +148,8 @@ run_simulation <- function(df, result){ ) ## clean up memory - rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible")) +## rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok", "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok", "model.true","model.naive","model.feasible")) - gc() +## gc() return(result) } -- 2.39.5
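Note on the mechanism in 01_two_covariates.R: w_pred is the true binary x with labels flipped for a random (1 - prediction_accuracy) share of rows, so the measurement error is nondifferential (independent of g and y). Below is a minimal, self-contained sketch of why the naive regression that substitutes w_pred for x attenuates the estimate of Bxy; it reuses the script's coefficient values but skips the explained-variance scaling, and the seed and N are arbitrary illustrative choices, not values from the patch.

logistic <- function(x) {1/(1+exp(-1*x))}

set.seed(1)
N <- 5000; Bxy <- 0.2; Bgy <- -0.2; Bgx <- 0.5; accuracy <- 0.73

g <- rbinom(N, 1, 0.5)
xprime <- Bgx * g + rnorm(N)                    ## latent score; g shifts it
x <- as.integer(logistic(scale(xprime)) > 0.5)  ## true binary label
y <- Bxy * x + Bgy * g + rnorm(N)

## flip the labels of a random (1 - accuracy) share of rows: (0-1)^2 = 1, (1-1)^2 = 0
w_pred <- x
flip <- sample(1:N, round((1 - accuracy) * N))
w_pred[flip] <- (w_pred[flip] - 1)^2

coef(lm(y ~ x + g))       ## 'true' model: Bxy estimate near 0.2
coef(lm(y ~ w_pred + g))  ## naive model: Bxy estimate attenuated toward 0

Because x and g are correlated, the attenuated x coefficient also shifts the naive estimate of Bgy, which is the sign-error risk the script's header describes.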
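For the corrections, run_simulation fits the 'feasible' model on just the m ground-truth rows and a multiple-imputation model (Amelia plus Zelig's combine_coef_se). Continuing from the sketch above, the fragment below shows both estimators with the Zelig pooling replaced by hand-rolled Rubin's rules; the amelia() arguments are simplified relative to the patch (10 imputations instead of 200, no idvars/lgstc), so treat this as an assumption-laden sketch rather than the patch's exact procedure.

library(Amelia)

## observe ground truth for only m rows, missing at random (as in simulate_data)
m <- 500
x.obs <- rep(NA_real_, N)
labeled <- sample(1:N, m)
x.obs[labeled] <- x[labeled]
df <- data.frame(y = y, g = g, x.obs = x.obs, w_pred = w_pred)

## 'feasible' baseline: unbiased but uses only the m labeled rows
coef(lm(y ~ x.obs + g, data = df))

## multiple imputation: impute the missing x.obs from y, g, and w_pred,
## then pool the per-imputation estimates with Rubin's rules
imp <- amelia(df, m = 10, p2s = 0, noms = c("x.obs", "g"))
fits <- lapply(imp$imputations, function(d) lm(y ~ x.obs + g, data = d))
ests <- sapply(fits, function(f) coef(f)["x.obs"])
ses <- sapply(fits, function(f) sqrt(vcov(f)["x.obs", "x.obs"]))
c(Bxy.est.mi = mean(ests),
  Bxy.se.mi = sqrt(mean(ses^2) + (1 + 1/length(ests)) * var(ests)))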