From: Nathan TeBlunthuis
Date: Wed, 15 Jun 2022 03:35:50 +0000 (-0700)
Subject: add missing simulation code
X-Git-Url: https://code.communitydata.science/ml_measurement_error_public.git/commitdiff_plain/e41d11afb9a80180feff844666e3ee463d20a7cd?ds=inline;hp=6057688060b5bf2a94f2b96b65b275a91991c0f3

add missing simulation code
---

diff --git a/simulations/Makefile b/simulations/Makefile
new file mode 100644
index 0000000..264b408
--- /dev/null
+++ b/simulations/Makefile
@@ -0,0 +1,61 @@
+
+SHELL=bash
+
+Ns=[1000,10000,25000]
+ms=[50, 100, 250, 500]
+seeds=[$(shell seq -s, 1 250)]
+all:remembr.RDS
+
+srun=srun -A comdata -p compute-bigmem --time=10:00:00 --mem 4G -c 1
+
+example_1_jobs: example_1.R
+	grid_sweep.py --command "Rscript example_1.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_1.feather"]}' --outfile example_1_jobs
+
+example_1.feather: example_1_jobs
+	rm -f example_1.feather
+	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_1_jobs
+#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_1_jobs
+
+example_2_jobs: example_2.R
+	grid_sweep.py --command "Rscript example_2.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2.feather"]}' --outfile example_2_jobs
+
+example_2.feather: example_2_jobs
+	rm -f example_2.feather
+	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_jobs
+#	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_jobs
+
+example_2_B_jobs: example_2_B.R
+	grid_sweep.py --command "Rscript example_2_B.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B.feather"]}' --outfile example_2_B_jobs
+
+example_2_B.feather: example_2_B_jobs
+	rm -f example_2_B.feather
+	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_jobs
+
+example_3_jobs: example_3.R
+	grid_sweep.py --command "Rscript example_3.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_3.feather"]}' --outfile example_3_jobs
+
+example_3.feather: example_3_jobs
+	rm -f example_3.feather
+	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_3_jobs
+
+
+remembr.RDS:example_2_B.feather example_1.feather example_2.feather example_3.feather
+	${srun} Rscript plot_example.R --infile example_1.feather --name "plot.df.example.1"
+	${srun} Rscript plot_example.R --infile example_2.feather --name "plot.df.example.2"
+	${srun} Rscript plot_example.R --infile example_2_B.feather --name "plot.df.example.2B"
+	${srun} Rscript plot_example.R --infile example_3.feather --name "plot.df.example.3"
+
+
+
+# sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_jobs
+
+# example_2_B_mecor_jobs:
+# 	grid_sweep.py --command "Rscript example_2_B_mecor.R" --arg_dict '{"N":${Ns},"m":${ms}, "seed":${seeds}, "outfile":["example_2_B_mecor.feather"]}' --outfile example_2_B_mecor_jobs
+
+# example_2_B_mecor.feather:example_2_B_mecor.R example_2_B_mecor_jobs
+# 	rm -f example_2_B_mecor.feather
+# 	sbatch --wait --verbose --array=1-3000 run_simulation.sbatch 0 example_2_B_mecor_jobs
+# 	sbatch --wait --verbose --array=3001-6001 run_simulation.sbatch 0 example_2_B_mecor_jobs
+
+
+
diff --git a/simulations/example_1.R b/simulations/example_1.R
new file mode 100644
index 0000000..40cdd85
--- /dev/null
+++ b/simulations/example_1.R
@@ -0,0 +1,203 @@
+### EXAMPLE 1: demonstrates how measurement error can lead to a sign error in a covariate
+### Even when you have a good predictor, if it's biased
against a covariate you can get the wrong sign. +### Even when you include the proxy variable in the regression. +### But with some ground truth and multiple imputation, you can fix it. + +library(argparser) +library(mecor) +library(ggplot2) +library(data.table) +library(filelock) +library(arrow) +library(Amelia) +library(Zelig) +library(predictionError) + +source("simulation_base.R") + +## SETUP: +### we want to estimate x -> y; x is MAR +### we have x -> k; k -> w; x -> w is used to predict x via the model w. +### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments +### The labels x are binary, but the model provides a continuous predictor + +### simulation: +#### how much power do we get from the model in the first place? (sweeping N and m) +#### +logistic <- function(x) {1/(1+exp(-1*x))} + +simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ + set.seed(seed) + + ## the true value of x + + g <- rbinom(N, 1, 0.5) + xprime <- Bgx * g + rnorm(N,0,1) + + k <- Bkx*xprime + rnorm(N,0,3) + + x <- as.integer(logistic(scale(xprime)) > 0.5) + + y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0 + df <- data.table(x=x,k=k,y=y,g=g) + + if( m < N){ + df <- df[sample(nrow(df), m), x.obs := x] + } else { + df <- df[, x.obs := x] + } + + w.model <- glm(x ~ k,df, family=binomial(link='logit')) + w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1) + ## y = B0 + B1x + e + + df[,':='(w=w, w_pred = as.integer(w>0.5))] + return(df) +} + + +parser <- arg_parser("Simulate data and fit corrected models") +parser <- add_argument(parser, "--N", default=1000, help="number of observations of w") +parser <- add_argument(parser, "--m", default=100, help="m the number of ground truth observations") +parser <- add_argument(parser, "--seed", default=432, help='seed for the rng') +parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather') +args <- parse_args(parser) + + +B0 <- 0 +Bxy <- 0.2 +Bgy <- 0 +Bkx <- 3.2 +Bgx <- 0 + +df <- simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) + +outline <- run_simulation(df + ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=0, 'Bkx'=Bkx, 'Bgx'=0, 'seed'=args$seed)) + +outfile_lock <- lock(paste0(args$outfile, '_lock')) +if(file.exists(args$outfile)){ + logdata <- read_feather(args$outfile) + logdata <- rbind(logdata,as.data.table(outline)) +} else { + logdata <- as.data.table(outline) +} + +print(outline) +write_feather(logdata, args$outfile) +unlock(outfile_lock) + +## for(N in Ns){ +## print(N) +## for(m in ms){ +## if(N>m){ +## for(seed in seeds){ +## rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bkx, seed))) +## } +## } +## } +## } + + +## run_simulation <- function(N, m, B0, Bxy, Bkx, seed){ +## result <- list() +## df <- simulate_latent_cocause(N, m, B0, Bxy, Bkx, seed) + +## result <- append(result, list(N=N, +## m=m, +## B0=B0, +## Bxy=Bxy, +## Bkx=Bkx, +## seed=seed)) + +## (correlation <- cor(df$w,df$x,method='spearman')) +## result <- append(result, list(correlation=correlation)) + +## (accuracy <- mean(df$x == df$w_pred)) + +## result <- append(result, list(accuracy=accuracy)) + +## (model.true <- lm(y ~ x, data=df)) + +## (cor(resid(model.true),df$w)) + +## true.ci.Bxy <- confint(model.true)['x',] + +## result <- append(result, list(Bxy.est.true=coef(model.true)['x'], +## Bxy.ci.upper.true = true.ci.Bxy[2], +## Bxy.ci.lower.true = true.ci.Bxy[1])) + +## (model.naive <- lm(y~w, data=df)) + +## (model.feasible <- 
lm(y~x.obs,data=df)) + +## feasible.ci.Bxy <- confint(model.feasible)['x.obs',] +## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'], +## Bxy.ci.upper.feasible = feasible.ci.Bxy[2], +## Bxy.ci.lower.feasible = feasible.ci.Bxy[1])) + + +## naive.ci.Bxy <- confint(model.naive)['w',] + +## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], +## Bxy.ci.upper.naive = naive.ci.Bxy[2], +## Bxy.ci.lower.naive = naive.ci.Bxy[1])) + + +## ## multiple imputation when k is observed + +## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=c("x.obs"),lgstc=c('w')) +## mod.amelia.k <- zelig(y~x.obs, model='ls', data=amelia.out.k$imputations, cite=FALSE) +## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + +## est.x.mi <- coefse['x.obs','Estimate'] +## est.x.se <- coefse['x.obs','Std.Error'] +## result <- append(result, +## list(Bxy.est.amelia.full = est.x.mi, +## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, +## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se +## )) + + +## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. +## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"),noms=c("x.obs"),lgstc=c('w')) +## mod.amelia.nok <- zelig(y~x.obs, model='ls', data=amelia.out.nok$imputations, cite=FALSE) +## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)) + +## est.x.mi <- coefse['x.obs','Estimate'] +## est.x.se <- coefse['x.obs','Std.Error'] +## result <- append(result, +## list(Bxy.est.amelia.nok = est.x.mi, +## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, +## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se +## )) + +## p <- v <- train <- rep(0,N) +## M <- m +## p[(M+1):(N)] <- 1 +## v[1:(M)] <- 1 +## df <- df[order(x.obs)] +## y <- df[,y] +## x <- df[,x.obs] +## w <- df[,w] +## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)) + +## result <- append(result, +## list(Bxy.est.gmm = gmm.res$beta[1,1], +## Bxy.ci.upper.gmm = gmm.res$confint[1,2], +## Bxy.ci.lower.gmm = gmm.res$confint[1,1])) + +## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs), df, B=400, method='efficient') + +## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) + +## result <- append(result, list( +## Bxy.est.mecor = mecor.ci['Estimate'], +## Bxy.upper.mecor = mecor.ci['UCI'], +## Bxy.lower.mecor = mecor.ci['LCI']) +## ) + + +## return(result) +## } + diff --git a/simulations/example_1.feather b/simulations/example_1.feather new file mode 100644 index 0000000..b8efa3f Binary files /dev/null and b/simulations/example_1.feather differ diff --git a/simulations/example_2.R b/simulations/example_2.R new file mode 100644 index 0000000..2ec23f7 --- /dev/null +++ b/simulations/example_2.R @@ -0,0 +1,87 @@ +### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate +### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. +### Even when you include the proxy variable in the regression. +### But with some ground truth and multiple imputation, you can fix it. + +library(argparser) +library(mecor) +library(ggplot2) +library(data.table) +library(filelock) +library(arrow) +library(Amelia) +library(Zelig) +library(predictionError) +options(amelia.parallel="no", + amelia.ncpus=1) + +## SETUP: +### we want to estimate x -> y; x is MAR +### we have x -> k; k -> w; x -> w is used to predict x via the model w. 
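+### A minimal illustration of why regressing y on the proxy w is biased (an
+### editorial sketch; the demo.* names are illustrative only and are not used
+### by anything else in this script):
+demo.x <- rnorm(5000)
+demo.w <- demo.x + rnorm(5000, 0, 1)           # proxy = truth + measurement error
+demo.y <- 0.2 * demo.x + rnorm(5000, 0, 1)
+coef(lm(demo.y ~ demo.x))['demo.x']            # approximately 0.2
+coef(lm(demo.y ~ demo.w))['demo.w']            # attenuated: approx 0.2 * var(x)/var(w) = 0.1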
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments +### The labels x are binary, but the model provides a continuous predictor + +### simulation: +#### how much power do we get from the model in the first place? (sweeping N and m) +#### +source("simulation_base.R") + +simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ + set.seed(seed) + + ## the true value of x + + g <- rbinom(N, 1, 0.5) + k <- rnorm(N, 0, 1) + xprime <- Bkx*k + Bgx * g + rnorm(N,0,1) + xvec <- scale(xprime) + + y <- Bxy * xvec + Bgy * g + rnorm(N, 0, 1) + B0 + + df <- data.table(x=xvec,k=k,y=y,g=g) + names(df) <- c('x','k','y','g') + if( m < N){ + df <- df[sample(nrow(df), m), x.obs := x] + } else { + df <- df[, x.obs := x] + } + + w.model <- lm(x ~ k,df) + w <- predict(w.model,data.frame(k=k)) + w <- logistic(w + rnorm(N,0,sd(w)*0.1)) + ## y = B0 + B1x + e + + df[,':='(w=w, w_pred = as.integer(w>0.5))] + return(df) +} + + +parser <- arg_parser("Simulate data and fit corrected models") +parser <- add_argument(parser, "--N", default=1000, help="number of observations of w") +parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations") +parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng') +parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather') +args <- parse_args(parser) + +Ns <- c(1000, 10000, 1e6) +ms <- c(100, 250, 500, 1000) +B0 <- 0 +Bxy <- 0.2 +Bgy <- -0.2 +Bkx <- 2 +Bgx <- 3 + +outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) + ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed)) + +outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE) +if(file.exists(args$outfile)){ + logdata <- read_feather(args$outfile) + logdata <- rbind(logdata,as.data.table(outline)) +} else { + logdata <- as.data.table(outline) +} + +print(outline) +write_feather(logdata, args$outfile) +unlock(outfile_lock) diff --git a/simulations/example_2_B.R b/simulations/example_2_B.R new file mode 100644 index 0000000..91623a1 --- /dev/null +++ b/simulations/example_2_B.R @@ -0,0 +1,276 @@ +### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate +### This is the same as example 2, only instead of x->k we have k->x. +### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. +### Even when you include the proxy variable in the regression. +### But with some ground truth and multiple imputation, you can fix it. + +library(argparser) +library(mecor) +library(ggplot2) +library(data.table) +library(filelock) +library(arrow) +library(Amelia) +library(Zelig) +library(predictionError) +options(amelia.parallel="no", + amelia.ncpus=1) + +source("simulation_base.R") + +## SETUP: +### we want to estimate x -> y; x is MAR +### we have x -> k; k -> w; x -> w is used to predict x via the model w. +### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments +### The labels x are binary, but the model provides a continuous predictor + +### simulation: +#### how much power do we get from the model in the first place? 
(sweeping N and m) +#### +logistic <- function(x) {1/(1+exp(-1*x))} + +simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ + set.seed(seed) + + ## the true value of x + + g <- rbinom(N, 1, 0.5) + xprime <- Bgx * g + rnorm(N,0,1) + + k <- Bkx*xprime + rnorm(N,0,3) + + x <- as.integer(logistic(scale(xprime)) > 0.5) + + y <- Bxy * x + Bgy * g + rnorm(N, 0, 2) + B0 + df <- data.table(x=x,k=k,y=y,g=g) + + if( m < N){ + df <- df[sample(nrow(df), m), x.obs := x] + } else { + df <- df[, x.obs := x] + } + + w.model <- glm(x ~ k,df, family=binomial(link='logit')) + w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1) + ## y = B0 + B1x + e + + df[,':='(w=w, w_pred = as.integer(w>0.5))] + return(df) +} + +schennach <- function(df){ + + fwx <- glm(x.obs~w, df, family=binomial(link='logit')) + df[,xstar_pred := predict(fwx, df)] + gxt <- lm(y ~ xstar_pred+g, df) + +} + +parser <- arg_parser("Simulate data and fit corrected models") +parser <- add_argument(parser, "--N", default=100, help="number of observations of w") +parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations") +parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng') +parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather') +args <- parse_args(parser) + +rows <- list() + +B0 <- 0 +Bxy <- 0.2 +Bgy <- -0.2 +Bkx <- 1 +Bgx <- 3 + +outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) + ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed)) + +outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE) +if(file.exists(args$outfile)){ + logdata <- read_feather(args$outfile) + logdata <- rbind(logdata,as.data.table(outline)) +} else { + logdata <- as.data.table(outline) +} + +print(outline) +write_feather(logdata, args$outfile) +unlock(outfile_lock) + +## Ns <- c(1e6, 5e4, 1000) +## ms <- c(100, 250, 500, 1000) +## seeds <- 1:500 + +## rowssets <- list() +## library(doParallel) +## options(mc.cores = parallel::detectCores()) +## cl <- makeCluster(20) +## registerDoParallel(cl) + +## ## library(future) + +## ## plan(multiprocess,workers=40,gc=TRUE) + +## for(N in Ns){ +## print(N) +## for(m in ms){ +## if(N>m){ +## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'), +## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar% + +## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter) +## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))} +## rowsets <- append(rowssets, list(data.table(new.rows))) +## } + +## } +## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) +## } + ## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w}, + ## packages=c('mecor','Amelia','Zelig','predictionError'), + ## seed=TRUE))) + + +## df <- rbindlist(rowsets) + +## write_feather(df,"example_2B_simulation.feather") + +## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ +## result <- list() +## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) + +## result <- append(result, list(N=N, +## m=m, +## B0=B0, +## Bxy=Bxy, +## Bgy=Bgy, +## Bkx=Bkx, +## seed=seed)) + +## (accuracy <- df[,.(mean(w_pred==x))]) +## result <- append(result, list(accuracy=accuracy)) + +## 
(model.true <- lm(y ~ x + g, data=df)) +## true.ci.Bxy <- confint(model.true)['x',] +## true.ci.Bgy <- confint(model.true)['g',] + +## result <- append(result, list(Bxy.est.true=coef(model.true)['x'], +## Bgy.est.true=coef(model.true)['g'], +## Bxy.ci.upper.true = true.ci.Bxy[2], +## Bxy.ci.lower.true = true.ci.Bxy[1], +## Bgy.ci.upper.true = true.ci.Bgy[2], +## Bgy.ci.lower.true = true.ci.Bgy[1])) + +## (model.feasible <- lm(y~x.obs+g,data=df)) + +## feasible.ci.Bxy <- confint(model.feasible)['x.obs',] +## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'], +## Bxy.ci.upper.feasible = feasible.ci.Bxy[2], +## Bxy.ci.lower.feasible = feasible.ci.Bxy[1])) + +## feasible.ci.Bgy <- confint(model.feasible)['g',] +## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'], +## Bgy.ci.upper.feasible = feasible.ci.Bgy[2], +## Bgy.ci.lower.feasible = feasible.ci.Bgy[1])) + +## (model.naive <- lm(y~w+g, data=df)) + +## naive.ci.Bxy <- confint(model.naive)['w',] +## naive.ci.Bgy <- confint(model.naive)['g',] + +## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], +## Bgy.est.naive=coef(model.naive)['g'], +## Bxy.ci.upper.naive = naive.ci.Bxy[2], +## Bxy.ci.lower.naive = naive.ci.Bxy[1], +## Bgy.ci.upper.naive = naive.ci.Bgy[2], +## Bgy.ci.lower.naive = naive.ci.Bgy[1])) + + +## ## multiple imputation when k is observed +## ## amelia does great at this one. +## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w')) +## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE) +## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + +## est.x.mi <- coefse['x.obs','Estimate'] +## est.x.se <- coefse['x.obs','Std.Error'] +## result <- append(result, +## list(Bxy.est.amelia.full = est.x.mi, +## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, +## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se +## )) + +## est.g.mi <- coefse['g','Estimate'] +## est.g.se <- coefse['g','Std.Error'] + +## result <- append(result, +## list(Bgy.est.amelia.full = est.g.mi, +## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se, +## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se +## )) + +## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. 
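+## Editorial note on the amelia() call below (a hedged reading of its
+## arguments): idvars are carried through to the imputed datasets but excluded
+## from the imputation model, noms marks nominal variables (the partially
+## observed x.obs and the binary g), and lgstc logistic-transforms the bounded
+## w before imputation. For example:
+## amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs","g"), lgstc=c("w"))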
+## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w")) +## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) +## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)) + +## est.x.mi <- coefse['x.obs','Estimate'] +## est.x.se <- coefse['x.obs','Std.Error'] +## result <- append(result, +## list(Bxy.est.amelia.nok = est.x.mi, +## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, +## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se +## )) + +## est.g.mi <- coefse['g','Estimate'] +## est.g.se <- coefse['g','Std.Error'] + +## result <- append(result, +## list(Bgy.est.amelia.nok = est.g.mi, +## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, +## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se +## )) + +## p <- v <- train <- rep(0,N) +## M <- m +## p[(M+1):(N)] <- 1 +## v[1:(M)] <- 1 +## df <- df[order(x.obs)] +## y <- df[,y] +## x <- df[,x.obs] +## g <- df[,g] +## w <- df[,w] +## # gmm gets pretty close +## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)) + +## result <- append(result, +## list(Bxy.est.gmm = gmm.res$beta[1,1], +## Bxy.ci.upper.gmm = gmm.res$confint[1,2], +## Bxy.ci.lower.gmm = gmm.res$confint[1,1])) + +## result <- append(result, +## list(Bgy.est.gmm = gmm.res$beta[2,1], +## Bgy.ci.upper.gmm = gmm.res$confint[2,2], +## Bgy.ci.lower.gmm = gmm.res$confint[2,1])) + + +## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient') +## (mod.calibrated.mle) +## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) +## result <- append(result, list( +## Bxy.est.mecor = mecor.ci['Estimate'], +## Bxy.upper.mecor = mecor.ci['UCI'], +## Bxy.lower.mecor = mecor.ci['LCI']) +## ) + +## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',]) + +## result <- append(result, list( +## Bxy.est.mecor = mecor.ci['Estimate'], +## Bxy.upper.mecor = mecor.ci['UCI'], +## Bxy.lower.mecor = mecor.ci['LCI']) +## ) + + +## return(result) +## } + diff --git a/simulations/example_2_B.feather b/simulations/example_2_B.feather new file mode 100644 index 0000000..8d0ecbd Binary files /dev/null and b/simulations/example_2_B.feather differ diff --git a/simulations/example_2_B_mecor.R b/simulations/example_2_B_mecor.R new file mode 100644 index 0000000..74ad695 --- /dev/null +++ b/simulations/example_2_B_mecor.R @@ -0,0 +1,281 @@ +### EXAMPLE 2_b: demonstrates how measurement error can lead to a type sign error in a covariate +### This is the same as example 2, only instead of x->k we have k->x. +### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. +### Even when you include the proxy variable in the regression. +### But with some ground truth and multiple imputation, you can fix it. + +library(argparser) +library(mecor) +library(ggplot2) +library(data.table) +library(filelock) +library(arrow) +library(Amelia) +library(Zelig) +library(predictionError) +options(amelia.parallel="no", + amelia.ncpus=1) + +source("simulation_base_mecor.R") + +## SETUP: +### we want to estimate x -> y; x is MAR +### we have x -> k; k -> w; x -> w is used to predict x via the model w. +### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments +### The labels x are binary, but the model provides a continuous predictor + +### simulation: +#### how much power do we get from the model in the first place? 
(sweeping N and m) +#### +logistic <- function(x) {1/(1+exp(-1*x))} + +simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ + set.seed(seed) + + ## the true value of x + + g <- rbinom(N, 1, 0.5) + xprime <- Bgx * g + rnorm(N,0,1) + + k <- Bkx*xprime + rnorm(N,0,3) + + x <- as.integer(logistic(scale(xprime)) > 0.5) + + y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + df <- data.table(x=x,k=k,y=y,g=g) + + if( m < N){ + df <- df[sample(nrow(df), m), x.obs := x] + } else { + df <- df[, x.obs := x] + } + + w.model <- glm(x ~ k,df, family=binomial(link='logit')) + w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1) + ## y = B0 + B1x + e + + df[,':='(w=w, w_pred = as.integer(w>0.5))] + return(df) +} + +schennach <- function(df){ + + fwx <- glm(x.obs~w, df, family=binomial(link='logit')) + df[,xstar_pred := predict(fwx, df)] + gxt <- lm(y ~ xstar_pred+g, df) + +} + +parser <- arg_parser("Simulate data and fit corrected models") +parser <- add_argument(parser, "--N", default=100, help="number of observations of w") +parser <- add_argument(parser, "--m", default=20, help="m the number of ground truth observations") +parser <- add_argument(parser, "--seed", default=4321, help='seed for the rng') +parser <- add_argument(parser, "--outfile", help='output file', default='example_2_B.feather') +args <- parse_args(parser) + +B0 <- 0 +Bxy <- 0.2 + +Bkx <- 0.5 + +rows <- list() + +B0 <- 0 +Bxy <- 0.4 +Bgy <- 0.25 +Bkx <- 3.2 +Bgx <- -0.5 + +outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) + ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed)) + +outfile_lock <- lock(paste0(args$outfile, '_lock')) +if(file.exists(args$outfile)){ + logdata <- read_feather(args$outfile) + logdata <- rbind(logdata,as.data.table(outline)) +} else { + logdata <- as.data.table(outline) +} + +print(outline) +write_feather(logdata, args$outfile) +unlock(outfile_lock) + +## Ns <- c(1e6, 5e4, 1000) +## ms <- c(100, 250, 500, 1000) +## seeds <- 1:500 + +## rowssets <- list() +## library(doParallel) +## options(mc.cores = parallel::detectCores()) +## cl <- makeCluster(20) +## registerDoParallel(cl) + +## ## library(future) + +## ## plan(multiprocess,workers=40,gc=TRUE) + +## for(N in Ns){ +## print(N) +## for(m in ms){ +## if(N>m){ +## new.rows <- foreach(iter=seeds, .combine=rbind, .packages = c('mecor','Amelia','Zelig','predictionError','data.table'), +## .export = c("run_simulation","simulate_latent_cocause","logistic","N","m","B0","Bxy","Bgy","Bkx","Bgx")) %dopar% + +## {run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, iter) +## ,list('N'=N,'m'=m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=iter))} +## rowsets <- append(rowssets, list(data.table(new.rows))) +## } + +## } +## ## rows <- append(rows, list(future({run_simulation(simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) +## } + ## ,list(N=N,m=m,B0=B0,Bxy=Bxy,Bgy=Bgy, Bkx=Bkx, Bgx=Bgx, seed=seed))w}, + ## packages=c('mecor','Amelia','Zelig','predictionError'), + ## seed=TRUE))) + + +## df <- rbindlist(rowsets) + +## write_feather(df,"example_2B_simulation.feather") + +## run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ +## result <- list() +## df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed) + +## result <- append(result, list(N=N, +## m=m, +## B0=B0, +## Bxy=Bxy, +## Bgy=Bgy, +## Bkx=Bkx, +## seed=seed)) + +## (accuracy <- df[,.(mean(w_pred==x))]) +## result <- append(result, 
list(accuracy=accuracy)) + +## (model.true <- lm(y ~ x + g, data=df)) +## true.ci.Bxy <- confint(model.true)['x',] +## true.ci.Bgy <- confint(model.true)['g',] + +## result <- append(result, list(Bxy.est.true=coef(model.true)['x'], +## Bgy.est.true=coef(model.true)['g'], +## Bxy.ci.upper.true = true.ci.Bxy[2], +## Bxy.ci.lower.true = true.ci.Bxy[1], +## Bgy.ci.upper.true = true.ci.Bgy[2], +## Bgy.ci.lower.true = true.ci.Bgy[1])) + +## (model.feasible <- lm(y~x.obs+g,data=df)) + +## feasible.ci.Bxy <- confint(model.feasible)['x.obs',] +## result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'], +## Bxy.ci.upper.feasible = feasible.ci.Bxy[2], +## Bxy.ci.lower.feasible = feasible.ci.Bxy[1])) + +## feasible.ci.Bgy <- confint(model.feasible)['g',] +## result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'], +## Bgy.ci.upper.feasible = feasible.ci.Bgy[2], +## Bgy.ci.lower.feasible = feasible.ci.Bgy[1])) + +## (model.naive <- lm(y~w+g, data=df)) + +## naive.ci.Bxy <- confint(model.naive)['w',] +## naive.ci.Bgy <- confint(model.naive)['g',] + +## result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'], +## Bgy.est.naive=coef(model.naive)['g'], +## Bxy.ci.upper.naive = naive.ci.Bxy[2], +## Bxy.ci.lower.naive = naive.ci.Bxy[1], +## Bgy.ci.upper.naive = naive.ci.Bgy[2], +## Bgy.ci.lower.naive = naive.ci.Bgy[1])) + + +## ## multiple imputation when k is observed +## ## amelia does great at this one. +## amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'),noms=c("x.obs","g"),lgstc=c('w')) +## mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE) +## (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)) + +## est.x.mi <- coefse['x.obs','Estimate'] +## est.x.se <- coefse['x.obs','Std.Error'] +## result <- append(result, +## list(Bxy.est.amelia.full = est.x.mi, +## Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se, +## Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se +## )) + +## est.g.mi <- coefse['g','Estimate'] +## est.g.se <- coefse['g','Std.Error'] + +## result <- append(result, +## list(Bgy.est.amelia.full = est.g.mi, +## Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se, +## Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se +## )) + +## ## What if we can't observe k -- most realistic scenario. We can't include all the ML features in a model. 
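+## Editorial note: combine_coef_se() pools estimates across the imputed
+## datasets; the standard combination is Rubin's rules. A hedged,
+## self-contained sketch of that computation, assuming `ests` and `ses` are
+## vectors of per-imputation estimates and standard errors:
+## pool.rubin <- function(ests, ses){
+##     M <- length(ests)
+##     qbar <- mean(ests)               # pooled point estimate
+##     wvar <- mean(ses^2)              # within-imputation variance
+##     bvar <- var(ests)                # between-imputation variance
+##     c(est = qbar, se = sqrt(wvar + (1 + 1/M) * bvar))
+## }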
+## amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"), noms=c("x.obs",'g'),lgstc = c("w")) +## mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE) +## (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)) + +## est.x.mi <- coefse['x.obs','Estimate'] +## est.x.se <- coefse['x.obs','Std.Error'] +## result <- append(result, +## list(Bxy.est.amelia.nok = est.x.mi, +## Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se, +## Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se +## )) + +## est.g.mi <- coefse['g','Estimate'] +## est.g.se <- coefse['g','Std.Error'] + +## result <- append(result, +## list(Bgy.est.amelia.nok = est.g.mi, +## Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se, +## Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se +## )) + +## p <- v <- train <- rep(0,N) +## M <- m +## p[(M+1):(N)] <- 1 +## v[1:(M)] <- 1 +## df <- df[order(x.obs)] +## y <- df[,y] +## x <- df[,x.obs] +## g <- df[,g] +## w <- df[,w] +## # gmm gets pretty close +## (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)) + +## result <- append(result, +## list(Bxy.est.gmm = gmm.res$beta[1,1], +## Bxy.ci.upper.gmm = gmm.res$confint[1,2], +## Bxy.ci.lower.gmm = gmm.res$confint[1,1])) + +## result <- append(result, +## list(Bgy.est.gmm = gmm.res$beta[2,1], +## Bgy.ci.upper.gmm = gmm.res$confint[2,2], +## Bgy.ci.lower.gmm = gmm.res$confint[2,1])) + + +## mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient') +## (mod.calibrated.mle) +## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',]) +## result <- append(result, list( +## Bxy.est.mecor = mecor.ci['Estimate'], +## Bxy.upper.mecor = mecor.ci['UCI'], +## Bxy.lower.mecor = mecor.ci['LCI']) +## ) + +## (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',]) + +## result <- append(result, list( +## Bxy.est.mecor = mecor.ci['Estimate'], +## Bxy.upper.mecor = mecor.ci['UCI'], +## Bxy.lower.mecor = mecor.ci['LCI']) +## ) + + +## return(result) +## } + diff --git a/simulations/mi_simulations/example_2_binary.R b/simulations/example_2_binary.R similarity index 100% rename from simulations/mi_simulations/example_2_binary.R rename to simulations/example_2_binary.R diff --git a/simulations/example_2_continuous.R b/simulations/example_2_continuous.R new file mode 100644 index 0000000..3ab63b5 --- /dev/null +++ b/simulations/example_2_continuous.R @@ -0,0 +1,187 @@ +### EXAMPLE 2: demonstrates how measurement error can lead to a type sign error in a covariate +### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign. +### Even when you include the proxy variable in the regression. +### But with some ground truth and multiple imputation, you can fix it. + +library(argparser) +library(mecor) +library(ggplot2) +library(data.table) +library(filelock) +library(arrow) +library(Amelia) +library(Zelig) +library(predictionError) +options(amelia.parallel="multicore", + amelia.ncpus=40) + +## SETUP: +### we want to estimate g -> y and x -> y; g is observed, x is MAR +### we have k -> x; g -> x; g->k; k is used to predict x via the model w. +### we have k -> w; x -> w; w is observed. +### for illustration, g is binary (e.g., gender==male). 
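+### The graph just described, written out explicitly (an editorial sketch
+### using the dagitty package, which nothing else in this patch depends on):
+## library(dagitty)
+## dag <- dagitty("dag { g -> x ; k -> x ; g -> k ; x -> w ; k -> w ; x -> y ; g -> y }")
+## plot(graphLayout(dag))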
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
+### Whether a comment is "racial harassment" depends on context, such as the identity (e.g., the race) of the person making the comment
+### e.g., a Black person saying "n-word" is less likely to be racial harassment than if a white person does it.
+### Say we have a language model that predicts "racial harassment," but it doesn't know the race of the writer.
+### Our content analyzers can see signals of the writer's race (e.g., a profile or avatar). So our "ground truth" takes this into account.
+### Our goal is to predict an outcome (say that someone gets banned from the platform) as a function of whether they made a racially harassing comment and of their race.
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    set.seed(seed)
+
+    ## the true value of x
+
+    g <- rbinom(N, 1, 0.5)
+    k <- rnorm(N, 0, 1)
+    x <- Bkx*k + Bgx * g + rnorm(N,0,1)
+    w.model <- lm(x ~ k)
+    w <- predict(w.model,data.frame(k=k)) + rnorm(N,0,1)
+    ## y = B0 + B1x + e
+    y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0
+    df <- data.table(x=x,k=k,y=y,w=w,g=g)
+    if( m < N){
+        df <- df[sample(nrow(df), m), x.obs := x]
+    } else {
+        df <- df[, x.obs := x]
+    }
+
+    return(df)
+}
+
+
+run_simulation <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){
+    result <- list()
+    df <- simulate_latent_cocause(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)
+
+    result <- append(result, list(N=N,
+                                  m=m,
+                                  B0=B0,
+                                  Bxy=Bxy,
+                                  Bgy=Bgy,
+                                  Bkx=Bkx,
+                                  seed=seed))
+
+    correlation <- cor(df$w,df$x)
+    result <- append(result, list(correlation=correlation))
+
+    model.true <- lm(y ~ x + g, data=df)
+    true.ci.Bxy <- confint(model.true)['x',]
+    true.ci.Bgy <- confint(model.true)['g',]
+
+    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bxy.ci.upper.true = true.ci.Bxy[2],
+                                  Bxy.ci.lower.true = true.ci.Bxy[1],
+                                  Bgy.ci.upper.true = true.ci.Bgy[2],
+                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+
+
+    model.naive <- lm(y~w+g, data=df)
+
+    naive.ci.Bxy <- confint(model.naive)['w',]
+    naive.ci.Bgy <- confint(model.naive)['g',]
+
+    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+                                  Bgy.est.naive=coef(model.naive)['g'],
+                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
+                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
+                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
+                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+
+
+    ## multiple imputation when k is observed
+    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x'))
+    mod.amelia.k <- zelig(y~x.obs+g+k, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+    coefse <- combine_coef_se(mod.amelia.k, messages=FALSE)
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.full = est.x.mi,
+                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.full = est.g.mi,
+                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## What if we can't observe k? This is the most realistic scenario: we can't include all of the ML model's features in the imputation model.
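+    ## Editorial note: the intervals assembled in this function use a normal
+    ## approximation, pooled estimate +/- 1.96 * pooled SE. An equivalent
+    ## helper (hedged; a local convenience, not a package function):
+    normal.ci <- function(est, se) c(lower = est - 1.96*se, upper = est + 1.96*se)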
+    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k"))
+    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+    coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE)
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.nok = est.x.mi,
+                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.nok = est.g.mi,
+                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## indicator vectors for predicted_covariates: df is sorted below so the m
+    ## ground-truth rows come first; p marks prediction-only rows, and train/v
+    ## split the ground-truth rows into training and validation halves
+    p <- v <- train <- rep(0,N)
+    M <- m
+    p[(M+1):(N)] <- 1
+    train[1:(M/2)] <- 1
+    v[(M/2+1):(M)] <- 1
+    df <- df[order(x.obs)]
+    y <- df[,y]
+    x <- df[,x.obs]
+    g <- df[,g]
+    w <- df[,w]
+    gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=FALSE)
+    result <- append(result,
+                     list(Bxy.est.gmm = gmm.res$beta[1,1],
+                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
+                          Bgy.est.gmm = gmm.res$beta[2,1],
+                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+    return(result)
+}
+
+Ns <- c(100, 200, 300, 400, 500, 1000, 2500, 5000, 7500)
+ms <- c(30, 50, 100, 200, 300, 500)
+B0 <- 0
+Bxy <- 1
+Bgy <- 0.3
+Bkx <- 3
+Bgx <- -4
+seeds <- 1:100
+
+rows <- list()
+
+for(N in Ns){
+    print(N)
+    for(m in ms){
+        if(N>m){
+            for(seed in seeds){
+                rows <- append(rows, list(run_simulation(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed)))
+            }
+        }
+    }
+}
+
+result <- rbindlist(rows)
+write_feather(result, "example_2_simulation_continuous.feather")
diff --git a/simulations/example_3.R b/simulations/example_3.R
new file mode 100644
index 0000000..389dba1
--- /dev/null
+++ b/simulations/example_3.R
@@ -0,0 +1,144 @@
+### EXAMPLE 3: demonstrates how measurement error can lead to a sign error in a covariate
+### What kind of data invalidates Fong and Tyler's approach?
+### This is similar to example 2_B (k->x instead of x->k), but adds a latent term u shared by k and y.
+### Even when you have a good predictor, if it's biased against a covariate you can get the wrong sign.
+### Even when you include the proxy variable in the regression.
+### But with some ground truth and multiple imputation, you can fix it.
+
+library(argparser)
+library(mecor)
+library(ggplot2)
+library(data.table)
+library(filelock)
+library(arrow)
+library(Amelia)
+library(Zelig)
+library(predictionError)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+setDTthreads(40)
+
+source("simulation_base.R")
+
+## SETUP:
+### we want to estimate x -> y; x is MAR
+### we have x -> k; k -> w; x -> w is used to predict x via the model w.
+### A realistic scenario is that we have an NLP model predicting something like "racial harassment" in social media comments
+### The labels x are binary, but the model provides a continuous predictor
+
+### simulation:
+#### how much power do we get from the model in the first place? (sweeping N and m)
+####
+
+## one way to do it is by adding correlation to x.obs and y that isn't in w.
+## in other words, the model is missing an important feature of x.obs that's related to y.
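+## A quick check of that property on one simulated draw (an editorial,
+## hedged sketch; it reuses the resid/w correlation diagnostic commented out
+## in example_1.R and calls simulate_latent_cocause() as defined just below):
+## df.chk <- simulate_latent_cocause(N=10000, m=1000, B0=0, Bxy=0.2, Bgy=0, Bkx=2, Bgx=0, seed=1)
+## cor(resid(lm(y ~ x, df.chk)), df.chk$w)  # clearly nonzero: w is correlated with the error in y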
+simulate_latent_cocause <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ + set.seed(seed) + + ## the true value of x + + g <- rbinom(N, 1, 0.5) + + # make w and y dependent + u <- rnorm(N,0,Bxy) + + xprime <- Bgx * g + rnorm(N,0,1) + + k <- Bkx*xprime + rnorm(N,0,1.5) + 1.1*Bkx*u + + x <- as.integer(logistic(scale(xprime)) > 0.5) + + y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u + + df <- data.table(x=x,k=k,y=y,g=g) + + w.model <- glm(x ~ k,df, family=binomial(link='logit')) + + if( m < N){ + df <- df[sample(nrow(df), m), x.obs := x] + } else { + df <- df[, x.obs := x] + } + + df[, x.obs := x.obs] + + w <- predict(w.model, df) + rnorm(N, 0, 1) + ## y = B0 + B1x + e + + df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)] + return(df) +} + +## simulate_latent_cocause_2 <- function(N, m, B0, Bxy, Bgy, Bkx, Bgx, seed){ +## set.seed(seed) + +## ## the true value of x + +## g <- rbinom(N, 1, 0.5) + +## # make w and y dependent +## u <- rnorm(N,0,5) + +## xprime <- Bgx * g + rnorm(N,0,1) + +## k <- Bkx*xprime + rnorm(N,0,3) + +## x <- as.integer(logistic(scale(xprime+0.3)) > 0.5) + +## y <- Bxy * x + Bgy * g + rnorm(N, 0, 1) + B0 + u + +## df <- data.table(x=x,k=k,y=y,g=g) + +## w.model <- glm(x ~ k, df, family=binomial(link='logit')) + +## if( m < N){ +## df <- df[sample(nrow(df), m), x.obs := x] +## } else { +## df <- df[, x.obs := x] +## } + +## w <- predict(w.model,data.frame(k=k)) + u +## ## y = B0 + B1x + e + +## df[,':='(w=w, w_pred = as.integer(w>0.5),u=u)] +## return(df) +## } + + +schennach <- function(df){ + + fwx <- glm(x.obs~w, df, family=binomial(link='logit')) + df[,xstar_pred := predict(fwx, df)] + gxt <- lm(y ~ xstar_pred+g, df) + +} + + +parser <- arg_parser("Simulate data and fit corrected models") +parser <- add_argument(parser, "--N", default=5000, help="number of observations of w") +parser <- add_argument(parser, "--m", default=200, help="m the number of ground truth observations") +parser <- add_argument(parser, "--seed", default=432, help='seed for the rng') +parser <- add_argument(parser, "--outfile", help='output file', default='example_2.feather') +args <- parse_args(parser) + +B0 <- 0 +Bxy <- 0.2 +Bgy <- 0 +Bkx <- 2 +Bgx <- 0 + + +outline <- run_simulation(simulate_latent_cocause(args$N, args$m, B0, Bxy, Bgy, Bkx, Bgx, args$seed) + ,list('N'=args$N,'m'=args$m,'B0'=B0,'Bxy'=Bxy,'Bgy'=Bgy, 'Bkx'=Bkx, 'Bgx'=Bgx, 'seed'=args$seed)) + +outfile_lock <- lock(paste0(args$outfile, '_lock'),exclusive=TRUE) +if(file.exists(args$outfile)){ + logdata <- read_feather(args$outfile) + logdata <- rbind(logdata,as.data.table(outline)) +} else { + logdata <- as.data.table(outline) +} + +print(outline) +write_feather(logdata, args$outfile) +unlock(outfile_lock) diff --git a/simulations/example_3.feather b/simulations/example_3.feather new file mode 100644 index 0000000..f130915 Binary files /dev/null and b/simulations/example_3.feather differ diff --git a/simulations/plot_example_1.R b/simulations/plot_example_1.R new file mode 100644 index 0000000..d883d24 --- /dev/null +++ b/simulations/plot_example_1.R @@ -0,0 +1,175 @@ +source("RemembR/R/RemembeR.R") +library(arrow) +library(data.table) +library(ggplot2) + +df <- data.table(read_feather("example_1.feather")) + +x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)] +x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)), + zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive), + bias = Bxy - Bxy.est.naive, + sign.correct = 
as.integer(sign(Bxy) == sign(Bxy.est.naive)))] + +x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + mean.estimate=mean(Bxy.est.naive), + var.estimate=var(Bxy.est.naive), + Bxy=mean(Bxy), + variable='x', + method='Naive' + ), + by=c('N','m')] + +x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)] + +x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full), + zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full), + bias = Bxy.est.true - Bxy.est.amelia.full, + sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))] + +x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='x', + method='Multiple imputation' + ), + by=c('N','m')] + + + +g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)] +g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full), + zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full), + bias = Bgy.est.amelia.full - Bgy.est.true, + sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))] + +g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='g', + method='Multiple imputation' + ), + by=c('N','m')] + +x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)] +x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok), + zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok), + bias = Bxy.est.amelia.nok - Bxy.est.true, + sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))] + + x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='x', + method='Multiple imputation (Classifier features unobserved)' + ), + by=c('N','m')] + +g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)] +g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok), + zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok), + bias = Bgy.est.amelia.nok - Bgy.est.true, + sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))] + +g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! 
zero.in.ci))), + variable='g', + method='Multiple imputation (Classifier features unobserved)' + ), + by=c('N','m')] + + +x.mecor <- df[,.(N,m,Bxy.est.true, Bxy.est.mecor,Bxy.lower.mecor, Bxy.upper.mecor)] + +x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor), + zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor), + bias = Bxy.est.mecor - Bxy.est.true, + sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))] + +x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='x', + method='Regression Calibration' + ), + by=c('N','m')] + +g.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.lower.mecor, Bgy.upper.mecor)] + +g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor), + zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor), + bias = Bgy.est.mecor - Bgy.est.true, + sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))] + +g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='g', + method='Regression Calibration' + ), + by=c('N','m')] + +## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)] + +## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor), +## zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor), +## bias = Bgy.est.mecor - Bgy.est.true, +## sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))] + +## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)), +## mean.bias = mean(bias), +## p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), +## variable='g', +## method='Regression Calibration' +## ), +## by=c('N','m')] + + +x.gmm <- df[,.(N,m,Bxy.est.true, Bxy.est.gmm,Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)] +x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm), + zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm), + bias = Bxy.est.gmm - Bxy.est.true, + sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))] + +x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='x', + method='2SLS+gmm' + ), + by=c('N','m')] + +g.gmm <- df[,.(N,m,Bgy.est.true, Bgy.est.gmm,Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)] +g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm), + zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm), + bias = Bgy.est.gmm - Bgy.est.true, + sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))] + +g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! 
zero.in.ci))),
+                     variable='g',
+                     method='2SLS+gmm'
+                     ),
+                  by=c('N','m')]
+
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy - Bgy.est.naive,
+                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot, x.mecor.plot, g.mecor.plot, x.gmm.plot, g.gmm.plot), fill=TRUE)
+
+remember(plot.df,'example.1.plot.df')
+
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ests <- df[,.(Bxy.est.true = mean(Bxy.est.true),
+              Bxy.est.naive = mean(Bxy.est.naive),
+              Bxy.est.feasible = mean(Bxy.est.feasible),
+              Bxy.est.amelia.full = mean(Bxy.est.amelia.full),
+              Bxy.est.amelia.nok = mean(Bxy.est.amelia.nok),
+              Bxy.est.mecor = mean(Bxy.est.mecor),
+              Bxy.est.gmm = mean(Bxy.est.gmm)),
+       by=c("N","m")]
diff --git a/simulations/plot_example_2.R b/simulations/plot_example_2.R
new file mode 100644
index 0000000..d5ca2b6
--- /dev/null
+++ b/simulations/plot_example_2.R
@@ -0,0 +1,102 @@
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_2_simulation.feather"))
+
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                         bias = Bxy - Bxy.est.naive,
+                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='x',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy - Bgy.est.naive,
+                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+
+
+x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                     bias = Bxy.est.true - Bxy.est.amelia.full,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! 
zero.in.ci))), + variable='x', + method='Multiple imputation' + ), + by=c('N','m')] + + + +g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)] +g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full), + zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full), + bias = Bgy.est.amelia.full - Bgy.est.true, + sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))] + +g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='g', + method='Multiple imputation' + ), + by=c('N','m')] + + + + +x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)] +x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok), + zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok), + bias = Bxy.est.amelia.nok - Bxy.est.true, + sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))] + + x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))), + variable='x', + method='Multiple imputation (Classifier features unobserved)' + ), + by=c('N','m')] + +g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)] +g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok), + zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok), + bias = Bgy.est.amelia.nok - Bgy.est.true, + sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))] + +g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)), + mean.bias = mean(bias), + p.sign.correct = mean(as.integer(sign.correct & (! 
zero.in.ci))),
+                                     variable='g',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                  by=c('N','m')]
+
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='C') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
diff --git a/simulations/plot_example_2_B.R b/simulations/plot_example_2_B.R
new file mode 100644
index 0000000..62c6848
--- /dev/null
+++ b/simulations/plot_example_2_B.R
@@ -0,0 +1,277 @@
+source("RemembR/R/RemembeR.R")
+library(arrow)
+library(data.table)
+library(ggplot2)
+library(filelock)
+library(argparser)
+
+parser <- arg_parser("Aggregate simulation results and build plot datasets.")
+parser <- add_argument(parser, "--infile", default="example_2_B.feather", help="name of the file to read.")
+parser <- add_argument(parser, "--name", default="", help="The name to save the data to in the remember file.")
+args <- parse_args(parser)
+
+l <- filelock::lock(paste0(args$infile, "_lock"), exclusive=FALSE)
+df <- data.table(read_feather(args$infile))
+filelock::unlock(l)
+
+build_plot_dataset <- function(df){
+    x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+    x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                             zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                             bias = Bxy - Bxy.est.naive,
+                             Bxy.est.naive = Bxy.est.naive,
+                             sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+    x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                               mean.bias = mean(bias),
+                               mean.est = mean(Bxy.est.naive),
+                               var.est = var(Bxy.est.naive),
+                               N.sims = .N,
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               variable='x',
+                               method='Naive'
+                               ),
+                            by=c('N','m')]
+
+
+    g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+    g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                             zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                             bias = Bgy - Bgy.est.naive,
+                             Bgy.est.naive = Bgy.est.naive,
+                             sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+    g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                               mean.bias = mean(bias),
+                               mean.est = mean(Bgy.est.naive),
+                               var.est = var(Bgy.est.naive),
+                               N.sims = .N,
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               variable='g',
+                               method='Naive'
+                               ),
+                            by=c('N','m')]
+
+
+    x.feasible <- df[,.(N, m, Bxy, Bxy.est.feasible, Bxy.ci.lower.feasible, Bxy.ci.upper.feasible)]
+    x.feasible <- x.feasible[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.feasible) & (Bxy <= Bxy.ci.upper.feasible)),
+                                   zero.in.ci = (0 >= Bxy.ci.lower.feasible) & (0 <= Bxy.ci.upper.feasible),
+                                   bias = Bxy - Bxy.est.feasible,
+                                   Bxy.est.feasible = Bxy.est.feasible,
+                                   sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.feasible)))]
+
+    x.feasible.plot <- x.feasible[,.(p.true.in.ci = mean(true.in.ci),
+                                     mean.bias = mean(bias),
+                                     mean.est = mean(Bxy.est.feasible),
+                                     var.est = var(Bxy.est.feasible),
+                                     N.sims = .N,
+                                     p.sign.correct = mean(as.integer(sign.correct & (! 
+
+    g.feasible <- df[,.(N, m, Bgy, Bgy.est.feasible, Bgy.ci.lower.feasible, Bgy.ci.upper.feasible)]
+    g.feasible <- g.feasible[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.feasible) & (Bgy <= Bgy.ci.upper.feasible)),
+                                   zero.in.ci = (0 >= Bgy.ci.lower.feasible) & (0 <= Bgy.ci.upper.feasible),
+                                   bias = Bgy.est.feasible - Bgy,
+                                   sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.feasible)))]
+
+    g.feasible.plot <- g.feasible[,.(p.true.in.ci = mean(true.in.ci),
+                                     mean.bias = mean(bias),
+                                     mean.est = mean(Bgy.est.feasible),
+                                     var.est = var(Bgy.est.feasible),
+                                     N.sims = .N,
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Feasible'
+                                     ),
+                                  by=c('N','m')]
+
+    ## the corrected estimators are scored against the estimate from the model
+    ## fit on the fully observed x (Bxy.est.true), not the simulation parameter
+    x.amelia.full <- df[,.(N, m, Bxy, Bxy.est.true, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full, Bxy.est.amelia.full)]
+
+    x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                         zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                         bias = Bxy.est.amelia.full - Bxy.est.true,
+                                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+    x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                           mean.bias = mean(bias),
+                                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                           mean.est = mean(Bxy.est.amelia.full),
+                                           var.est = var(Bxy.est.amelia.full),
+                                           N.sims = .N,
+                                           variable='x',
+                                           method='Multiple imputation'
+                                           ),
+                                        by=c('N','m')]
+
+    g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+    g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+                                         zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+                                         bias = Bgy.est.amelia.full - Bgy.est.true,
+                                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+    g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                           mean.bias = mean(bias),
+                                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                           mean.est = mean(Bgy.est.amelia.full),
+                                           var.est = var(Bgy.est.amelia.full),
+                                           N.sims = .N,
+                                           variable='g',
+                                           method='Multiple imputation'
+                                           ),
+                                        by=c('N','m')]
+
+    x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+    x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+                                       zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+                                       bias = Bxy.est.amelia.nok - Bxy.est.true,
+                                       sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+    x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                         mean.bias = mean(bias),
+                                         p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                         mean.est = mean(Bxy.est.amelia.nok),
+                                         var.est = var(Bxy.est.amelia.nok),
+                                         N.sims = .N,
+                                         variable='x',
+                                         method='Multiple imputation (Classifier features unobserved)'
+                                         ),
+                                      by=c('N','m')]
+
+    g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+    g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+                                       zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+                                       bias = Bgy.est.amelia.nok - Bgy.est.true,
+                                       sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+    g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                         mean.bias = mean(bias),
+                                         p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                         mean.est = mean(Bgy.est.amelia.nok),
+                                         var.est = var(Bgy.est.amelia.nok),
+                                         N.sims = .N,
+                                         variable='g',
+                                         method='Multiple imputation (Classifier features unobserved)'
+                                         ),
+                                      by=c('N','m')]
+
+    x.mecor <- df[,.(N, m, Bxy.est.true, Bxy.est.mecor, Bxy.lower.mecor, Bxy.upper.mecor)]
+
+    x.mecor <- x.mecor[,':='(true.in.ci = (Bxy.est.true >= Bxy.lower.mecor) & (Bxy.est.true <= Bxy.upper.mecor),
+                             zero.in.ci = (0 >= Bxy.lower.mecor) & (0 <= Bxy.upper.mecor),
+                             bias = Bxy.est.mecor - Bxy.est.true,
+                             sign.correct = sign(Bxy.est.true) == sign(Bxy.est.mecor))]
+
+    x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                               mean.bias = mean(bias),
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               mean.est = mean(Bxy.est.mecor),
+                               var.est = var(Bxy.est.mecor),
+                               N.sims = .N,
+                               variable='x',
+                               method='Regression Calibration'
+                               ),
+                            by=c('N','m')]
+
+    g.mecor <- df[,.(N, m, Bgy.est.true, Bgy.est.mecor, Bgy.lower.mecor, Bgy.upper.mecor)]
+
+    g.mecor <- g.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.lower.mecor) & (Bgy.est.true <= Bgy.upper.mecor),
+                             zero.in.ci = (0 >= Bgy.lower.mecor) & (0 <= Bgy.upper.mecor),
+                             bias = Bgy.est.mecor - Bgy.est.true,
+                             sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+    g.mecor.plot <- g.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                               mean.bias = mean(bias),
+                               p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                               mean.est = mean(Bgy.est.mecor),
+                               var.est = var(Bgy.est.mecor),
+                               N.sims = .N,
+                               variable='g',
+                               method='Regression Calibration'
+                               ),
+                            by=c('N','m')]
+
+    ## x.mecor <- df[,.(N,m,Bgy.est.true, Bgy.est.mecor,Bgy.ci.lower.mecor, Bgy.ci.upper.mecor)]
+
+    ## x.mecor <- x.mecor[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.mecor) & (Bgy.est.true <= Bgy.ci.upper.mecor),
+    ##                          zero.in.ci = (0 >= Bgy.ci.lower.mecor) & (0 <= Bgy.ci.upper.mecor),
+    ##                          bias = Bgy.est.mecor - Bgy.est.true,
+    ##                          sign.correct = sign(Bgy.est.true) == sign(Bgy.est.mecor))]
+
+    ## x.mecor.plot <- x.mecor[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+    ##                            mean.bias = mean(bias),
+    ##                            p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+    ##                            variable='g',
+    ##                            method='Regression Calibration'
+    ##                            ),
+    ##                         by=c('N','m')]
+
+    x.gmm <- df[,.(N, m, Bxy.est.true, Bxy.est.gmm, Bxy.ci.lower.gmm, Bxy.ci.upper.gmm)]
+    x.gmm <- x.gmm[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.gmm) & (Bxy.est.true <= Bxy.ci.upper.gmm),
+                         zero.in.ci = (0 >= Bxy.ci.lower.gmm) & (0 <= Bxy.ci.upper.gmm),
+                         bias = Bxy.est.gmm - Bxy.est.true,
+                         sign.correct = sign(Bxy.est.true) == sign(Bxy.est.gmm))]
+
+    x.gmm.plot <- x.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           mean.est = mean(Bxy.est.gmm),
+                           var.est = var(Bxy.est.gmm),
+                           N.sims = .N,
+                           variable='x',
+                           method='2SLS+gmm'
+                           ),
+                        by=c('N','m')]
+
+    g.gmm <- df[,.(N, m, Bgy.est.true, Bgy.est.gmm, Bgy.ci.lower.gmm, Bgy.ci.upper.gmm)]
+    g.gmm <- g.gmm[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.gmm) & (Bgy.est.true <= Bgy.ci.upper.gmm),
+                         zero.in.ci = (0 >= Bgy.ci.lower.gmm) & (0 <= Bgy.ci.upper.gmm),
+                         bias = Bgy.est.gmm - Bgy.est.true,
+                         sign.correct = sign(Bgy.est.true) == sign(Bgy.est.gmm))]
+
+    g.gmm.plot <- g.gmm[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           mean.est = mean(Bgy.est.gmm),
+                           var.est = var(Bgy.est.gmm),
+                           N.sims = .N,
+                           variable='g',
+                           method='2SLS+gmm'
+                           ),
+                        by=c('N','m')]
+
+    ## assemble the combined plot dataset and attach the mean classifier accuracy
+    plot.df <- rbindlist(list(x.naive.plot, g.naive.plot,
+                              x.amelia.full.plot, g.amelia.full.plot,
+                              x.amelia.nok.plot, g.amelia.nok.plot,
+                              x.mecor.plot, g.mecor.plot,
+                              x.gmm.plot, g.gmm.plot,
+                              x.feasible.plot, g.feasible.plot),
+                         use.names=TRUE)
+    accuracy <- df[,mean(accuracy)]
+    plot.df[,accuracy := accuracy]
+    return(plot.df)
+}
+
+plot.df <- build_plot_dataset(df)
+remember(plot.df, args$name)
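+
+## quick sanity check (hypothetical values; run interactively after building
+## plot.df): coverage and bias of the naive estimator of Bxy by cell
+## plot.df[method=='Naive' & variable=='x', .(N, m, p.true.in.ci, mean.bias)]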
+
+## df[gmm.ER_pval<0.05]
+
+## # plot.df <- plot.df[,":="(sd.est=sqrt(var.est)/N.sims)]
+
+## ggplot(plot.df[variable=='x'], aes(y=mean.est, ymax=mean.est + var.est/2, ymin=mean.est-var.est/2, x=method)) + geom_pointrange() + facet_grid(-m~N) + scale_x_discrete(labels=label_wrap_gen(10))
+
+## ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+## ggplot(plot.df,aes(y=N,x=m,color=abs(mean.bias))) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c(option='D') + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
diff --git a/simulations/plot_example_2_continuous.R b/simulations/plot_example_2_continuous.R
new file mode 100644
index 0000000..9b41076
--- /dev/null
+++ b/simulations/plot_example_2_continuous.R
@@ -0,0 +1,102 @@
+library(arrow)
+library(data.table)
+library(ggplot2)
+
+df <- data.table(read_feather("example_2_simulation.feather"))
+
+## bias is estimate minus truth throughout
+x.naive <- df[,.(N, m, Bxy, Bxy.est.naive, Bxy.ci.lower.naive, Bxy.ci.upper.naive)]
+x.naive <- x.naive[,':='(true.in.ci = as.integer((Bxy >= Bxy.ci.lower.naive) & (Bxy <= Bxy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bxy.ci.lower.naive) & (0 <= Bxy.ci.upper.naive),
+                         bias = Bxy.est.naive - Bxy,
+                         sign.correct = as.integer(sign(Bxy) == sign(Bxy.est.naive)))]
+
+x.naive.plot <- x.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='x',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+
+g.naive <- df[,.(N, m, Bgy, Bgy.est.naive, Bgy.ci.lower.naive, Bgy.ci.upper.naive)]
+g.naive <- g.naive[,':='(true.in.ci = as.integer((Bgy >= Bgy.ci.lower.naive) & (Bgy <= Bgy.ci.upper.naive)),
+                         zero.in.ci = (0 >= Bgy.ci.lower.naive) & (0 <= Bgy.ci.upper.naive),
+                         bias = Bgy.est.naive - Bgy,
+                         sign.correct = as.integer(sign(Bgy) == sign(Bgy.est.naive)))]
+
+g.naive.plot <- g.naive[,.(p.true.in.ci = mean(true.in.ci),
+                           mean.bias = mean(bias),
+                           p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                           variable='g',
+                           method='Naive'
+                           ),
+                        by=c('N','m')]
+
+x.amelia.full <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.full, Bxy.ci.lower.amelia.full, Bxy.ci.upper.amelia.full)]
+x.amelia.full <- x.amelia.full[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.full) & (Bxy.est.true <= Bxy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bxy.ci.lower.amelia.full) & (0 <= Bxy.ci.upper.amelia.full),
+                                     bias = Bxy.est.amelia.full - Bxy.est.true,
+                                     sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.full))]
+
+x.amelia.full.plot <- x.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='x',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+g.amelia.full <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.full, Bgy.ci.lower.amelia.full, Bgy.ci.upper.amelia.full)]
+g.amelia.full <- g.amelia.full[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.full) & (Bgy.est.true <= Bgy.ci.upper.amelia.full),
+                                     zero.in.ci = (0 >= Bgy.ci.lower.amelia.full) & (0 <= Bgy.ci.upper.amelia.full),
+                                     bias = Bgy.est.amelia.full - Bgy.est.true,
+                                     sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.full))]
+
+g.amelia.full.plot <- g.amelia.full[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                       mean.bias = mean(bias),
+                                       p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                       variable='g',
+                                       method='Multiple imputation'
+                                       ),
+                                    by=c('N','m')]
+
+x.amelia.nok <- df[,.(N, m, Bxy.est.true, Bxy.est.amelia.nok, Bxy.ci.lower.amelia.nok, Bxy.ci.upper.amelia.nok)]
+x.amelia.nok <- x.amelia.nok[,':='(true.in.ci = (Bxy.est.true >= Bxy.ci.lower.amelia.nok) & (Bxy.est.true <= Bxy.ci.upper.amelia.nok),
+                                   zero.in.ci = (0 >= Bxy.ci.lower.amelia.nok) & (0 <= Bxy.ci.upper.amelia.nok),
+                                   bias = Bxy.est.amelia.nok - Bxy.est.true,
+                                   sign.correct = sign(Bxy.est.true) == sign(Bxy.est.amelia.nok))]
+
+x.amelia.nok.plot <- x.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                     mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='x',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                  by=c('N','m')]
+
+g.amelia.nok <- df[,.(N, m, Bgy.est.true, Bgy.est.amelia.nok, Bgy.ci.lower.amelia.nok, Bgy.ci.upper.amelia.nok)]
+g.amelia.nok <- g.amelia.nok[,':='(true.in.ci = (Bgy.est.true >= Bgy.ci.lower.amelia.nok) & (Bgy.est.true <= Bgy.ci.upper.amelia.nok),
+                                   zero.in.ci = (0 >= Bgy.ci.lower.amelia.nok) & (0 <= Bgy.ci.upper.amelia.nok),
+                                   bias = Bgy.est.amelia.nok - Bgy.est.true,
+                                   sign.correct = sign(Bgy.est.true) == sign(Bgy.est.amelia.nok))]
+
+g.amelia.nok.plot <- g.amelia.nok[,.(p.true.in.ci = mean(as.integer(true.in.ci)),
+                                     mean.bias = mean(bias),
+                                     p.sign.correct = mean(as.integer(sign.correct & (! zero.in.ci))),
+                                     variable='g',
+                                     method='Multiple imputation (Classifier features unobserved)'
+                                     ),
+                                  by=c('N','m')]
+
+plot.df <- rbindlist(list(x.naive.plot,g.naive.plot,x.amelia.full.plot,g.amelia.full.plot,x.amelia.nok.plot,g.amelia.nok.plot))
+
+ggplot(plot.df,aes(y=N,x=m,color=p.sign.correct)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
+
+ggplot(plot.df,aes(y=N,x=m,color=mean.bias)) + geom_point() + facet_grid(variable ~ method) + scale_color_viridis_c() + theme_minimal() + xlab("Number of gold standard labels") + ylab("Total sample size")
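+
+## Run via Rscript, the two plots above render to the default device
+## (Rplots.pdf); to keep named files, save them explicitly, e.g. with a
+## hypothetical filename:
+## ggsave("example_2_continuous_bias.pdf")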
diff --git a/simulations/simulation_base.R b/simulations/simulation_base.R
new file mode 100644
index 0000000..26916ab
--- /dev/null
+++ b/simulations/simulation_base.R
@@ -0,0 +1,155 @@
+library(predictionError)
+library(mecor)
+options(amelia.parallel="no",
+        amelia.ncpus=1)
+library(Amelia)
+library(Zelig)
+
+logistic <- function(x) {1/(1+exp(-1*x))}
+
+run_simulation <- function(df, result){
+
+    accuracy <- df[,mean(w_pred==x)]
+    result <- append(result, list(accuracy=accuracy))
+
+    ## true model: fit on the fully observed x
+    (model.true <- lm(y ~ x + g, data=df))
+    true.ci.Bxy <- confint(model.true)['x',]
+    true.ci.Bgy <- confint(model.true)['g',]
+
+    result <- append(result, list(Bxy.est.true=coef(model.true)['x'],
+                                  Bgy.est.true=coef(model.true)['g'],
+                                  Bxy.ci.upper.true = true.ci.Bxy[2],
+                                  Bxy.ci.lower.true = true.ci.Bxy[1],
+                                  Bgy.ci.upper.true = true.ci.Bgy[2],
+                                  Bgy.ci.lower.true = true.ci.Bgy[1]))
+
+    ## feasible model: use only the m gold-standard labels
+    (model.feasible <- lm(y~x.obs+g,data=df))
+
+    feasible.ci.Bxy <- confint(model.feasible)['x.obs',]
+    result <- append(result, list(Bxy.est.feasible=coef(model.feasible)['x.obs'],
+                                  Bxy.ci.upper.feasible = feasible.ci.Bxy[2],
+                                  Bxy.ci.lower.feasible = feasible.ci.Bxy[1]))
+
+    feasible.ci.Bgy <- confint(model.feasible)['g',]
+    result <- append(result, list(Bgy.est.feasible=coef(model.feasible)['g'],
+                                  Bgy.ci.upper.feasible = feasible.ci.Bgy[2],
+                                  Bgy.ci.lower.feasible = feasible.ci.Bgy[1]))
+
+    ## naive model: substitute the classifier score w for x. Under
+    ## (approximately) classical measurement error the coefficient on w is
+    ## attenuated toward zero, and the bias can spill over onto g.
+    (model.naive <- lm(y~w+g, data=df))
+
+    naive.ci.Bxy <- confint(model.naive)['w',]
+    naive.ci.Bgy <- confint(model.naive)['g',]
+
+    result <- append(result, list(Bxy.est.naive=coef(model.naive)['w'],
+                                  Bgy.est.naive=coef(model.naive)['g'],
+                                  Bxy.ci.upper.naive = naive.ci.Bxy[2],
+                                  Bxy.ci.lower.naive = naive.ci.Bxy[1],
+                                  Bgy.ci.upper.naive = naive.ci.Bgy[2],
+                                  Bgy.ci.lower.naive = naive.ci.Bgy[1]))
+
+
+    ## multiple imputation when k is observed
+    ## amelia does great at this one.
+    ## binary variables must be declared nominal; drop NAs before counting
+    ## levels, since unique() would otherwise count NA as a third level
+    noms <- c()
+    if(length(unique(na.omit(df$x.obs))) <= 2){
+        noms <- c(noms, 'x.obs')
+    }
+
+    if(length(unique(na.omit(df$g))) <= 2){
+        noms <- c(noms, 'g')
+    }
+
+    amelia.out.k <- amelia(df, m=200, p2s=0, idvars=c('x','w_pred'), noms=noms)
+    mod.amelia.k <- zelig(y~x.obs+g, model='ls', data=amelia.out.k$imputations, cite=FALSE)
+    (coefse <- combine_coef_se(mod.amelia.k, messages=FALSE))
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.full = est.x.mi,
+                          Bxy.ci.upper.amelia.full = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.full = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.full = est.g.mi,
+                          Bgy.ci.upper.amelia.full = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.full = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## What if we can't observe k? This is the most realistic scenario: we can't include all the ML features in a model.
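+    ## Listing k among Amelia's idvars drops it from the imputation model, so
+    ## x.obs is imputed from y, g, and the classifier score w alone -- the
+    ## information set a practitioner would actually have.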
+    amelia.out.nok <- amelia(df, m=200, p2s=0, idvars=c("x","k","w_pred"), noms=noms)
+    mod.amelia.nok <- zelig(y~x.obs+g, model='ls', data=amelia.out.nok$imputations, cite=FALSE)
+    (coefse <- combine_coef_se(mod.amelia.nok, messages=FALSE))
+
+    est.x.mi <- coefse['x.obs','Estimate']
+    est.x.se <- coefse['x.obs','Std.Error']
+    result <- append(result,
+                     list(Bxy.est.amelia.nok = est.x.mi,
+                          Bxy.ci.upper.amelia.nok = est.x.mi + 1.96 * est.x.se,
+                          Bxy.ci.lower.amelia.nok = est.x.mi - 1.96 * est.x.se
+                          ))
+
+    est.g.mi <- coefse['g','Estimate']
+    est.g.se <- coefse['g','Std.Error']
+
+    result <- append(result,
+                     list(Bgy.est.amelia.nok = est.g.mi,
+                          Bgy.ci.upper.amelia.nok = est.g.mi + 1.96 * est.g.se,
+                          Bgy.ci.lower.amelia.nok = est.g.mi - 1.96 * est.g.se
+                          ))
+
+    ## indicator vectors for predicted_covariates: sort so the m rows with
+    ## observed x.obs come first, then flag them with v; p flags the rows
+    ## where only the prediction w is available
+    N <- nrow(df)
+    m <- nrow(df[!is.na(x.obs)])
+    p <- v <- train <- rep(0,N)
+    M <- m
+    p[(M+1):(N)] <- 1
+    v[1:(M)] <- 1
+    df <- df[order(x.obs)]
+    y <- df[,y]
+    x <- df[,x.obs]
+    g <- df[,g]
+    w <- df[,w]
+    ## gmm gets pretty close
+    (gmm.res <- predicted_covariates(y, x, g, w, v, train, p, max_iter=100, verbose=TRUE))
+
+    result <- append(result,
+                     list(Bxy.est.gmm = gmm.res$beta[1,1],
+                          Bxy.ci.upper.gmm = gmm.res$confint[1,2],
+                          Bxy.ci.lower.gmm = gmm.res$confint[1,1],
+                          gmm.ER_pval = gmm.res$ER_pval
+                          ))
+
+    result <- append(result,
+                     list(Bgy.est.gmm = gmm.res$beta[2,1],
+                          Bgy.ci.upper.gmm = gmm.res$confint[2,2],
+                          Bgy.ci.lower.gmm = gmm.res$confint[2,1]))
+
+    ## measurement-error correction via mecor, with the gold-standard x.obs as
+    ## the reference measurement for w
+    mod.calibrated.mle <- mecor(y ~ MeasError(w, reference = x.obs) + g, df, B=400, method='efficient')
+    (mod.calibrated.mle)
+    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['x.obs',])
+    result <- append(result, list(
+                                 Bxy.est.mecor = mecor.ci['Estimate'],
+                                 Bxy.upper.mecor = mecor.ci['UCI'],
+                                 Bxy.lower.mecor = mecor.ci['LCI'])
+                     )
+
+    (mecor.ci <- summary(mod.calibrated.mle)$c$ci['g',])
+
+    result <- append(result, list(
+                                 Bgy.est.mecor = mecor.ci['Estimate'],
+                                 Bgy.upper.mecor = mecor.ci['UCI'],
+                                 Bgy.lower.mecor = mecor.ci['LCI'])
+                     )
+
+    ## clean up memory
+    rm(list=c("df","y","x","g","w","v","train","p","amelia.out.k","amelia.out.nok",
+              "mod.calibrated.mle","gmm.res","mod.amelia.k","mod.amelia.nok",
+              "model.true","model.naive","model.feasible"))
+
+    gc()
+    return(result)
+}
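+
+## Minimal usage sketch (hypothetical values; the example_*.R scripts are the
+## real entry points). run_simulation() expects a data.table with columns
+## y, x, g, k, w, w_pred, and x.obs (NA where no gold-standard label exists),
+## plus a list of simulation parameters to carry through into the output row:
+## res <- run_simulation(df, list(N=nrow(df), m=df[,sum(!is.na(x.obs))], seed=1))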